#### Import the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
pip install xlrd

Note: you may need to restart the kernel to use updated packages.


#### Import the dataset

In [3]:
data = pd.read_excel(r"C:\Users\ADMIN\Downloads\iris (1).xls")
data.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Shape of data
data.shape

(150, 5)

In [5]:
# Basic information about data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              143 non-null    float64
 1   SW              144 non-null    float64
 2   PL              144 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [6]:
#Statistical Analysis
data.describe()

Unnamed: 0,SL,SW,PL,PW
count,143.0,144.0,144.0,150.0
mean,5.855944,3.049306,3.75625,1.198667
std,0.828168,0.430644,1.761306,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
#Check for null values
data.isna().sum()

SL                7
SW                6
PL                6
PW                0
Classification    0
dtype: int64

#### Data Preprocessing

In [8]:
# Handling Missing values

In [9]:
na_columns = data.iloc[:,[0,1,2]]

In [10]:
for col in na_columns:
    data[col].fillna(data[col].mean(),inplace=True)

In [11]:
data.isna().sum()

SL                0
SW                0
PL                0
PW                0
Classification    0
dtype: int64

In [12]:
# Creating the feature and target variable
X = data.drop(['Classification'],axis=1)
y = data['Classification']

In [13]:
# scaling the feature variable
scaler = StandardScaler()

X = scaler.fit_transform(X)

#### Creating the Training and Test data

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42,shuffle=True)

print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(120, 4) (30, 4) (120,) (30,)


#### Fitting the models

In [15]:
# 1. K-Nearest Neighbors

from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model = knn_model.fit(X_train,y_train)
knn_pred = knn_model.predict(X_test)

In [16]:
# 2.Support Vector Machine

from sklearn.svm import SVC

sv_classifier = SVC()
sv_classifier = sv_classifier.fit(X_train,y_train)
svc_pred = sv_classifier.predict(X_test)

In [17]:
# 3.Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier = dt_classifier.fit(X_train,y_train)
dt_pred = dt_classifier.predict(X_test)

In [18]:
# 4.Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_train,y_train)
rf_pred = rf_classifier.predict(X_test)

#### Perfomance Evaluation

In [19]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

knn_accuracy = accuracy_score(y_test,knn_pred)
svc_accuracy = accuracy_score(y_test,svc_pred)
dt_accuracy = accuracy_score(y_test,dt_pred)
rf_accuracy = accuracy_score(y_test,rf_pred)

performance = pd.DataFrame({'Model':['KNN','SVM','Decision Tree','Random Forest'],'Accuracy':[knn_accuracy,svc_accuracy,dt_accuracy,rf_accuracy]})
print(performance)

           Model  Accuracy
0            KNN  0.966667
1            SVM  0.966667
2  Decision Tree  1.000000
3  Random Forest  1.000000


In [20]:
# Confusion matrix and Clasification report of KNN, SVM, Decision Tree, Random Forest

for i in [knn_pred,svc_pred,dt_pred,rf_pred]:
    conf_matrix = confusion_matrix(y_test,i)
    cl_report = classification_report(y_test,i)
    print(conf_matrix)
    print(cl_report)

[[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      0.89      0.94         9
 Iris-virginica       0.92      1.00      0.96        11

       accuracy                           0.97        30
      macro avg       0.97      0.96      0.97        30
   weighted avg       0.97      0.97      0.97        30

[[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      0.89      0.94         9
 Iris-virginica       0.92      1.00      0.96        11

       accuracy                           0.97        30
      macro avg       0.97      0.96      0.97        30
   weighted avg       0.97      0.97      0.97        30

[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00 

#### Cross Validation

In [34]:
from sklearn.model_selection import KFold,cross_val_score

kf = KFold(n_splits=5)

knn_score = cross_val_score(knn_model,X,y,cv=kf)
svc_score = cross_val_score(sv_classifier,X,y,cv=kf)
dt_score = cross_val_score(dt_classifier,X,y,cv=kf)
rf_score = cross_val_score(rf_classifier,X,y,cv=kf)
mean_knn_scr = knn_score.mean()
mean_svc_scr = svc_score.mean()
mean_dt_scr = dt_score.mean()
mean_rf_scr = rf_score.mean()

print('KNN Cross Validation Score :',mean_knn_scr)
print('SVM Cross Validation Score :',mean_svc_scr)
print('Decision Tree Cross Validation Score :',mean_dt_scr)
print('Random Forest Cross Validation Score :',mean_rf_scr)

KNN Cross Validation Score : 0.9133333333333333
SVM Cross Validation Score : 0.9
Decision Tree Cross Validation Score : 0.9199999999999999
Random Forest Cross Validation Score : 0.9199999999999999


Inference :
   From the accuracy scores of different classification models,its clear that both Decision Tree and Random Forest have better performance when compared to K-Nearest Neighbors and Support Vector Machine.