In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# Step-1: Data gathering

In [None]:
# Load the dataset, using its 'Id' column as the row index.
# A forward slash is used as the separator: it is accepted on Windows, Linux
# and macOS, whereas the previous backslash form only resolved on Windows.
data = pd.read_csv('data/Iris.csv', index_col='Id')
# Replace the five remaining column names with shorter ones.
data.columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']

In [None]:
# Preview the first three rows to confirm the file was parsed and renamed as expected.
data.head(3)

In [None]:
# (rows, columns) of the loaded frame.
data.shape

# Step-2:  Data Preprocessing

In [None]:
# Remove the leading 'Iris-' from every species label.
# NOTE(review): assumes labels look like 'Iris-<name>' (the original code
# dropped the first five characters) -- confirm against the CSV.
# The pattern is anchored to the start of the string, so the cell is
# idempotent: re-running it leaves already-cleaned labels untouched, whereas
# a fixed slice (s[5:]) would chop five more characters off on every run.
data['Species'] = data['Species'].str.replace(r'^Iris-', '', regex=True)
data

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values in each column.
data.isna().sum()

# Step-3: EDA

In [None]:
# Summary statistics, transposed so that each column of the data becomes one row.
data.describe().transpose()

In [None]:
# Mean of every other column, computed separately for each species.
data.groupby('Species').mean()

In [None]:
# Pairwise scatter plots of every column pair, coloured by species.
# pairplot builds its own figure, so no plt.figure() call is needed here.
sns.set(style='dark')
pair_grid = sns.pairplot(data, hue='Species')
# plt.grid() only acts on the single "current" axes, so it cannot switch the
# grid on for the whole matrix of subplots; enable it on every axes explicitly.
for ax in pair_grid.axes.flat:
    ax.grid(True)

In [None]:
# Sepal length vs. sepal width, one colour per species.
fig = plt.figure(figsize=(10, 8))
sns.set(style='dark')
ax = fig.add_subplot(111)
sns.scatterplot(data=data, x='SepalLength', y='SepalWidth', hue='Species', ax=ax)
ax.grid()

In [None]:
# Petal length vs. petal width, one colour per species.
fig = plt.figure(figsize=(10, 8))
sns.set(style='dark')
ax = fig.add_subplot(111)
sns.scatterplot(data=data, x='PetalLength', y='PetalWidth', hue='Species', ax=ax)
ax.grid()

In [None]:
# Distribution of petal length across all samples.
sns.boxplot(x='PetalLength', data=data)

In [None]:
# Distribution of petal width across all samples.
sns.boxplot(x='PetalWidth', data=data)

# Step-4: Feature Engineering and selection

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder used below to turn the species labels into integer class codes.
encoder = LabelEncoder()

In [None]:
# Silence deprecation-style noise only.
# A blanket filterwarnings('ignore') also hides warnings that point at real
# problems in this notebook (e.g. scikit-learn's DataConversionWarning for a
# column-vector y, or ConvergenceWarning from an unconverged solver), so the
# filter is restricted to the two deprecation categories.
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [None]:
# Overwrite the species labels with the integer codes learned by the encoder,
# then display the updated frame.
data['Species'] = encoder.fit_transform(data['Species'].to_numpy().ravel())
data

In [None]:
# Number of dimensions of the encoded label column (a single column is 1-D).
data['Species'].ndim

In [None]:
# Another process
# target.iloc[:,0]=encoder.fit_transform(target.values.ravel()) 
# target

In [None]:
# Pairwise Pearson correlation between all columns (Pearson is the pandas default).
correlation = data.corr(method='pearson')
correlation

In [None]:
# Heatmap of the correlation matrix with the coefficient printed in each cell.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='Greens', ax=ax)

In [None]:
# How many samples fall into each species class.
data['Species'].value_counts()

In [None]:
# Feature matrix: every column except the label.
Features = data.drop('Species', axis=1)
Features

In [None]:
# Label vector as a 1-D Series.
# scikit-learn estimators expect y with shape (n_samples,). The previous
# double-bracket selection data[['Species']] produced a (n_samples, 1)
# DataFrame, which makes fit() emit a DataConversionWarning.
target = data['Species']
target

    Train test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
(xtrain, xtest,
 ytrain, ytest) = train_test_split(Features, target, random_state=42, test_size=0.3)

In [None]:
# (rows, columns) of the training features.
xtrain.shape

In [None]:
# (rows, columns) of the held-out test features.
xtest.shape

    Feature selection

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each training feature and the label.
# One score per column, in column order; a higher score means the feature
# carries more information about the class.
selector = mutual_info_classif(X=xtrain, y=ytrain, random_state=42)
selector

In [None]:
# Wrap the scores in a single-column DataFrame named 'Score'.
selector = pd.DataFrame(selector, columns=['Score'])
selector

In [None]:
# Label each score row with the name of the feature it belongs to.
selector = selector.set_axis(Features.columns, axis=0)
selector

In [None]:
# Scores ranked from most to least informative (display only; nothing is reassigned).
selector.sort_values(by='Score', ascending=False)

In [None]:
# Vertical bar chart of the scores (kind='barh' would draw it horizontally instead).
selector.plot(kind='bar')

In [None]:
# Extra: two sorting examples, shown for exploration only (nothing is reassigned).
# A notebook cell renders only its last bare expression, so the first result
# used to be silently discarded; display() makes both visible.
display(selector.sort_index(ascending=False))                   # rows ordered by index label, descending
display(Features.sort_values('PetalLength', ascending=False))   # samples ordered by PetalLength, largest first

    Best Feature Selection

In [None]:
from functools import partial
from sklearn.feature_selection import SelectKBest

# Keep the two features with the highest mutual-information score.
# mutual_info_classif is randomised, so passing it bare makes the scores (and
# potentially the selected columns) vary between runs. Binding random_state=42
# makes the selection reproducible and consistent with the scores computed
# earlier with the same seed.
extractor = SelectKBest(partial(mutual_info_classif, random_state=42), k=2)
extractor.fit(xtrain, ytrain)


In [None]:
best_feature=extractor.get_feature_names_out() # names of the columns kept by the fitted selector (k=2)
best_feature

# Step-5: Modeling

| SVC |

    || C : float, default=1.0

(C limits how much misclassification error is accepted; it is what makes the margin "soft". A smaller C tolerates more margin violations, a larger C tolerates fewer.)

Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty.



    || kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable 
    default='rbf'

Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples).
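'rbf' is the radial basis function kernel.
'precomputed' means the data passed in is already a kernel matrix rather than raw features; to use a custom kernel function, pass a callable instead.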

    || degree : int, default=3

Degree of the polynomial kernel function ('poly'). Ignored by all other kernels.
It is used only by the polynomial kernel.

    || gamma : {'scale', 'auto'} or float, default='scale'
        Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.

if gamma='scale' (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma,
if 'auto', uses 1 / n_features.

    || max_iter : int, default=-1
    
Hard limit on iterations within solver, or -1 for no limit.

    || decision_function_shape : {'ovo', 'ovr'}, default='ovr'

Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one ('ovo') is always used as multi-class strategy. The parameter is ignored for binary classification.

    || random_state : int, RandomState instance or None, default=None
Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function calls. See Glossary <random_state>.

In [None]:
from sklearn import svm  # support vector machine estimators

# SVC = Support Vector Classifier, created here with its default hyper-parameters.
clf = svm.SVC()

# Train on the selected feature columns of the training split.
clf.fit(X=xtrain[best_feature], y=ytrain)

In [None]:
clf.predict(xtest[best_feature]) # predicted class code for every held-out sample

In [None]:
# Element-wise check: True wherever the prediction matches the true label.
np.equal(clf.predict(xtest[best_feature]), ytest.values.ravel())

# Step-6: Evaluation

In [95]:
# sklearn.metrics holds the evaluation helpers.
# plot_confusion_matrix is no longer imported: it was removed in
# scikit-learn 1.2 (ConfusionMatrixDisplay replaces it), so importing it makes
# this cell fail on current versions, and it was not used here anyway.
from sklearn.metrics import accuracy_score, confusion_matrix

# Fraction of held-out samples classified correctly. Arguments follow the
# documented (y_true, y_pred) order. A score of 1.0 means no test sample was
# misclassified.
accuracy_score(ytest, clf.predict(xtest[best_feature]))

1.0

# Step-7: Cross validation

In [96]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the SVC on the selected features over the whole
# dataset; cv=5 is the number of folds, and one score is returned per fold.
cv = cross_val_score(estimator=clf, X=Features[best_feature], y=target, cv=5)
cv

array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ])

In [97]:
# Average of the per-fold scores: the cross-validated accuracy estimate.
np.mean(cv)

0.96

# Step-8: Model tuning

In [None]:
# LogisticRegression was used without ever being imported, which raises a
# NameError on a fresh kernel; import it where it is first needed.
from sklearn.linear_model import LogisticRegression

# Manually chosen hyper-parameters, scored with 5-fold cross-validation.
clf2 = LogisticRegression(C=5, penalty='l2', solver='saga')
cv = cross_val_score(clf2, Features[best_feature], target, cv=5)
cv

In [None]:
# Average of the per-fold scores for the manually tuned model.
np.mean(cv)

In [None]:
LogisticRegression().get_params() # default hyper-parameters as a {name: value} dict -- the candidates available for tuning

** Another way of model tuning  **

Another approach to model tuning, which works automatically.
With it we can easily find the best model by applying cross-validation.
It tries every possible combination of the given parameter values and reports the combination that gives the best accuracy.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over every (C, solver) combination, each scored by
# 5-fold cross-validated accuracy.
Grid = GridSearchCV(
    estimator=clf2,
    param_grid=dict(
        C=[2, 3, 4, 5, 6],
        solver=['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs'],
    ),
    scoring='accuracy',
    cv=5,
)
# Run the search on the selected features.
Grid.fit(Features[best_feature], target)

In [None]:
Grid.best_params_ # parameter combination that achieved the best cross-validated score

# Step-9: Model Visualization

In [None]:
# Build the final model from whatever the grid search actually selected.
# Hard-coding the values (previously C=3, solver='saga') silently goes stale
# whenever the search, the data or the library version changes.
clf3 = LogisticRegression(**Grid.best_params_)
clf3.fit(Features[best_feature], target)

In [None]:
# The PetalLength column as a plain NumPy array.
Features['PetalLength'].to_numpy()

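`np.linspace` generates evenly spaced values along a single axis.
`np.meshgrid` combines two 1-D coordinate arrays into 2-D coordinate grids that cover both axes.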

In [None]:
# Build a regular grid covering the petal-length / petal-width plane, padded
# a little beyond the observed range so the outermost points are not on the edge.
mesh_pad = 0.2
# Grid resolution. The previous 0.001 step produces thousands of ticks per
# axis for a span of just a few units, i.e. a mesh of millions of points to
# allocate, predict and draw. 0.01 still gives visually smooth decision
# regions at roughly 1/100 of the cost.
mesh_step = 0.01

xmin, xmax = Features.PetalLength.values.min(), Features.PetalLength.values.max()  # x-axis: PetalLength
ymin, ymax = Features.PetalWidth.values.min(), Features.PetalWidth.values.max()    # y-axis: PetalWidth
xx, yy = np.meshgrid(
    np.arange(xmin - mesh_pad, xmax + mesh_pad, mesh_step),
    np.arange(ymin - mesh_pad, ymax + mesh_pad, mesh_step),
)


In [None]:
# Flatten both coordinate grids and pair them up column-wise, giving one
# (PetalLength, PetalWidth) row per mesh point.
grid = np.column_stack((xx.ravel(), yy.ravel()))
grid

In [None]:
# Predicted class for every point of the mesh.
ypred_2 = clf3.predict(X=grid)
ypred_2

In [None]:
# Fold the flat predictions back into the 2-D shape of the mesh so they can be drawn.
ypred_2 = np.reshape(ypred_2, yy.shape)
ypred_2

In [None]:
# Decision regions of clf3 with the samples drawn on top.
# The colormap is passed to each artist explicitly; plt.set_cmap() would also
# change the global default colormap for every later plot in the session.
region_cmap = plt.cm.Accent_r

fig, ax = plt.subplots(num=1, figsize=(10, 8))
# Background: predicted class at every mesh point.
ax.pcolormesh(xx, yy, ypred_2, cmap=region_cmap)
# Foreground: each sample coloured by the class the model predicts for it.
predictions = clf3.predict(Features[best_feature])
ax.scatter(Features.PetalLength, Features.PetalWidth,
           c=predictions, cmap=region_cmap, edgecolors='red')
ax.set_xlabel('PetalLength')
ax.set_ylabel('PetalWidth')
ax.set_title('Decision regions (PetalLength vs. PetalWidth)')
plt.show()

In [None]:
# Shape of the x-coordinate mesh.
xx.shape

In [None]:
# Shape of the y-coordinate mesh (same shape as xx, since both come from one meshgrid call).
yy.shape

In [None]:
# Shape of the held-out labels.
ytest.shape

In [None]:
# Final look at the frame; 'Species' now holds the integer codes written by the encoder.
data.head()