Task No : 2

Problem Statement : Use the Iris dataset to develop a model that can classify iris flowers into different species based on their sepal and petal
measurements.

In [92]:
# Silence every Python warning for the rest of the session so that library
# notices do not clutter the cell outputs.
# NOTE(review): a blanket 'ignore' also hides actionable warnings; consider
# narrowing it to specific categories.
from warnings import filterwarnings

filterwarnings(action='ignore')

In [93]:
# Step 1: Load the Iris measurements from the CSV file (path is relative to
# the kernel's working directory) and preview the first five rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='IRIS.csv')
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [94]:
# Step 2: Basic data-quality check.
# Print column names, non-null counts and dtypes to spot missing values or
# columns that were parsed with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [95]:
# Count the missing values in each column.
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [96]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): the recorded output reports 3 duplicates and no visible cell
# removes them — confirm whether they should be dropped before modelling.
df.duplicated().sum()

3

In [97]:
# Step 3: Separate categorical and continuous features.
# A column counts as categorical when its dtype is `object`; every other
# column is treated as continuous.
is_object_col = df.dtypes == object
cat = df.columns[is_object_col]
con = df.columns[~is_object_col]

In [98]:
# Show the column names selected as categorical (object dtype).
cat

Index(['species'], dtype='object')

In [99]:
# Show the column names selected as continuous (non-object dtype).
con

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')

In [100]:
# Step 4: Separate the features (x) from the target (y).
# x keeps every column except 'species'; y is kept as a one-column DataFrame
# (note the list selector) rather than a Series.
x = df.drop(columns='species')
y = df.loc[:, ['species']]

In [101]:
# Preview the first rows of the feature frame.
x.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [102]:
# Preview the first rows of the target frame.
y.head()

Unnamed: 0,species
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa


In [103]:
# Count the rows per species to check how balanced the classes are.
y.value_counts()

species        
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [104]:
#Step 4 (continued): Apply preprocessing on X
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [105]:
# Numeric preprocessing: fill missing values with the column mean, then
# standardise each feature.
num_pipe = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)
# Ask the pipeline to return DataFrames (column names preserved) from transform.
num_pipe = num_pipe.set_output(transform='pandas')

In [106]:
# Fit the preprocessing pipeline on the features and transform them in one step,
# then preview the scaled result.
# NOTE(review): the pipeline is fitted on all rows of x before the train/test
# split, so the imputation/scaling statistics include the test rows — consider
# fitting on the training split only.
X_pre = num_pipe.fit_transform(x)
X_pre.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.26346,-1.341272,-1.312977


In [107]:
# Step 5: Split the preprocessed features and the target into train and test sets.
from sklearn.model_selection import train_test_split

# Use the lowercase `y` defined in Step 4. No visible cell defines an uppercase
# `Y`, so referencing it fails with NameError on Restart & Run All.
# 33% of the rows are held out for testing; random_state fixes the shuffle so
# the split is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_pre, y, test_size=0.33, random_state=21
)

In [108]:
# Preview the training features.
xtrain.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
52,1.28034,0.106445,0.649027,0.396172
49,-1.021849,0.569251,-1.341272,-1.312977
65,1.038005,0.106445,0.364699,0.264699
118,2.249683,-1.050569,1.786341,1.447956
55,-0.173674,-0.587764,0.421564,0.133226


In [109]:
# Preview the training labels.
ytrain.head()

Unnamed: 0,species
52,Iris-versicolor
49,Iris-setosa
65,Iris-versicolor
118,Iris-virginica
55,Iris-versicolor


In [110]:
# Preview the test features.
xtest.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
92,-0.052506,-1.050569,0.137236,0.001753
44,-0.900681,1.726266,-1.056944,-1.050031
7,-1.021849,0.800654,-1.284407,-1.312977
21,-0.900681,1.494863,-1.284407,-1.050031
95,-0.173674,-0.124958,0.250967,0.001753


In [111]:
# Preview the test labels.
ytest.head()

Unnamed: 0,species
92,Iris-versicolor
44,Iris-setosa
7,Iris-setosa
21,Iris-setosa
95,Iris-versicolor


In [112]:
# Step 6: Model building — fit a logistic regression classifier on the training split.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
# ytrain is a one-column DataFrame (it descends from df[['species']]), while
# scikit-learn expects a 1-D target. Flatten it explicitly instead of relying
# on the implicit conversion, whose warning is hidden by the global
# filterwarnings('ignore') at the top of the notebook.
model.fit(xtrain, ytrain.to_numpy().ravel())


In [113]:
# Class labels known to the fitted model, in the order the model uses them.
model.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [114]:
# Fitted intercept terms (the recorded output shows one value per class).
model.intercept_

array([-0.17009434,  1.82600148, -1.65590715])

In [115]:
# Fitted coefficients (the recorded output shows one row per class and one
# column per feature).
model.coef_

array([[-0.93555991,  1.24735493, -1.63468997, -1.61414063],
       [ 0.38437553, -0.37161075, -0.41059697, -0.59865882],
       [ 0.55118438, -0.87574418,  2.04528694,  2.21279945]])

In [116]:
# Step 7: Evaluate the model.
# Mean accuracy on the training split.
model.score(xtrain, ytrain)

0.98

In [117]:
# Mean accuracy on the held-out test split; compare with the training accuracy
# to judge over- or under-fitting.
model.score(xtest, ytest)

0.94

In [118]:
# Predicted class labels for the training and the test features.
ypred_train = model.predict(xtrain)
ypred_test = model.predict(xtest)

In [119]:
# First five predicted labels for the training split.
ypred_train[0:5]

array(['Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor'], dtype=object)

In [120]:
# Actual training labels for the first rows, to compare with the predictions.
ytrain.head()

Unnamed: 0,species
52,Iris-versicolor
49,Iris-setosa
65,Iris-versicolor
118,Iris-virginica
55,Iris-versicolor


In [121]:
# First five predicted labels for the test split.
ypred_test[0:5]

array(['Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor'], dtype=object)

In [122]:
# Actual test labels for the first rows, to compare with the predictions.
ytest.head()

Unnamed: 0,species
92,Iris-versicolor
44,Iris-setosa
7,Iris-setosa
21,Iris-setosa
95,Iris-versicolor


In [123]:
# Step 8: Plot the confusion matrix.
# Show the class labels first; they are used as the axis labels of the plot.
model.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [126]:
# Build and draw the confusion matrix for the test split.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Pin the row/column order to model.classes_ so the matrix always lines up
# with display_labels, even if a class is absent from ytest and ypred_test.
cf = confusion_matrix(ytest, ypred_test, labels=model.classes_)
cfd = ConfusionMatrixDisplay(confusion_matrix=cf, display_labels=model.classes_)
cfd.plot()
# Render the figure explicitly; this also keeps the object repr out of the output.
plt.show()
# NOTE(review): the recorded run of this cell failed with
# "module 'matplotlib' has no attribute 'get_data_path'", which looks like a
# broken or mismatched matplotlib installation rather than a fault in this
# code — confirm by reinstalling matplotlib in the kernel's environment.

AttributeError: module 'matplotlib' has no attribute 'get_data_path'

In [None]:
# Step 9: Print the classification report (precision, recall, F1 and support
# per class) for the test split. The report is a plain string, so it is
# printed rather than displayed.
from sklearn.metrics import classification_report

print(classification_report(y_true=ytest, y_pred=ypred_test))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        18
Iris-versicolor       1.00      0.82      0.90        17
 Iris-virginica       0.83      1.00      0.91        15

       accuracy                           0.94        50
      macro avg       0.94      0.94      0.94        50
   weighted avg       0.95      0.94      0.94        50



In [None]:
# Count the rows per species in the test split.
ytest.value_counts()

species        
Iris-setosa        18
Iris-versicolor    17
Iris-virginica     15
Name: count, dtype: int64

The macro-averaged F1 score on the test data is 0.94, which is above the 80% threshold, so this is considered a good model.

In [None]:
# Step 10: Get the predicted class probabilities for the training and test
# features (one column per class).
yprob_train = model.predict_proba(xtrain)
yprob_test = model.predict_proba(xtest)

In [None]:
# Class labels, shown to identify which probability column belongs to which species.
model.classes_


array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [None]:
# Class probabilities for the first five training rows.
yprob_train[0:5]

array([[5.80640398e-03, 6.45140235e-01, 3.49053361e-01],
       [9.62651740e-01, 3.73475870e-02, 6.72531913e-07],
       [1.67343173e-02, 8.34261846e-01, 1.49003837e-01],
       [9.14862561e-08, 2.78449735e-03, 9.97215411e-01],
       [2.86909795e-02, 8.34082024e-01, 1.37226997e-01]])

In [None]:
# Class probabilities for the first five test rows.
yprob_test[0:5]

array([[2.04775870e-02, 9.12992505e-01, 6.65299084e-02],
       [9.87226823e-01, 1.27726258e-02, 5.51447658e-07],
       [9.72197405e-01, 2.78020825e-02, 5.12289442e-07],
       [9.85955356e-01, 1.40442540e-02, 3.89744617e-07],
       [8.82112835e-02, 8.60816887e-01, 5.09718292e-02]])

In [None]:
# Step 11: Predict out-of-sample data.
# Load the unseen samples from 'iris_sample.csv' (path is relative to the
# kernel's working directory) and display them.
# NOTE(review): the recorded run failed with FileNotFoundError, so the file was
# not present in the working directory — confirm its location.
# NOTE(review): presumably these rows must go through num_pipe.transform before
# model.predict, since the model was fitted on scaled features — verify.
xnew = pd.read_csv('iris_sample.csv')
xnew

FileNotFoundError: [Errno 2] No such file or directory: 'iris_sample.csv'