In [1]:
# import necessary packages
import numpy as np
import pandas  as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [4]:
# Load the Iris dataset.
# The path is assembled with os.path.join so it resolves on every OS; a backslash
# inside a plain string literal is treated as an escape character and only works
# as a path separator on Windows.
df = pd.read_csv(os.path.join('data', 'IRIS.csv'))

In [5]:
# Peek at the first five rows to confirm the file was parsed as expected
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
# Column names, non-null counts, dtypes and memory footprint of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [8]:
# Class balance: number of rows per species label
df['species'].value_counts()

species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [9]:
# Number of missing values in each column
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [10]:


# Data preprocessing: split the frame into a feature matrix and encoded labels
from sklearn.preprocessing import LabelEncoder

# Every column except the last one is a feature
X = df.iloc[:, :-1].to_numpy()

# The last column holds the species name; encode it as integer class ids
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, -1].to_numpy())



In [36]:
# Split the data into training, validation and test sets
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all rows as the test set
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Step 2: take 25% of the remaining 80% (i.e. 20% of all rows) as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=0
)

In [37]:
# Sanity check: (rows, features) of the train, test and validation feature matrices
for split in (X_train, X_test, X_val):
    print(split.shape)

(90, 4)
(30, 4)
(30, 4)


In [38]:
# Feature scaling (standardization)
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only ...
sc = StandardScaler().fit(X_train)

# ... and apply that same transformation to all three splits
X_train, X_test, X_val = map(sc.transform, (X_train, X_test, X_val))



In [41]:
# Candidate classifiers, each wrapped in a single-step Pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# (display name, estimator) pairs; the display name is also used as the pipeline step name
_candidates = [
    ('K-NN', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
    ('Logistic Regression', LogisticRegression(random_state=0)),
    ('SVM', SVC(kernel='linear', random_state=0)),
    ('Kernel SVM', SVC(kernel='rbf', random_state=0)),
    ('Decision Tree', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ('Random Forest', RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    ('XGBoost', xgb.XGBClassifier()),
]

# Two parallel lists, in the same order: the pipelines and their display names
models = [Pipeline([step]) for step in _candidates]
models_names = [label for label, _estimator in _candidates]

In [42]:

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Fit every model on the training split and report its accuracy on the held-out
# test and validation splits. Each split is scored exactly once: a second pass
# over the validation set after cross-validation would print the same numbers,
# because cross_val_score fits clones and leaves the fitted models untouched.
for name, model in zip(models_names, models):
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy {}: {}'.format(name, accuracy))

    y_pred_val = model.predict(X_val)
    accuracy_val = accuracy_score(y_val, y_pred_val)
    print('Accuracy_val {}: {}'.format(name, accuracy_val))
    print('---------------------------------------------')

# 10-fold cross-validation on the training split; one array of fold scores per model.
# NOTE(review): X_train was standardized as a whole before this cell, so the scaler
# has already seen the rows each fold holds out. Putting the StandardScaler inside
# every Pipeline would remove that leakage.
accuracies = [
    cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
    for model in models
]

# Labelled "CV Accuracy" so these lines cannot be mistaken for the test-set
# accuracies printed above.
for name, fold_scores in zip(models_names, accuracies):
    print('CV Accuracy {}: {}'.format(name, fold_scores.mean()))
    print('Standard Deviation: {}'.format(fold_scores.std()))
    print('---------------------------------------------')

Accuracy K-NN: 1.0
Accuracy_val K-NN: 0.8666666666666667
---------------------------------------------
Accuracy Logistic Regression: 0.9666666666666667
Accuracy_val Logistic Regression: 0.9333333333333333
---------------------------------------------
Accuracy SVM: 1.0
Accuracy_val SVM: 0.9333333333333333
---------------------------------------------
Accuracy Kernel SVM: 1.0
Accuracy_val Kernel SVM: 0.9
---------------------------------------------
Accuracy Decision Tree: 1.0
Accuracy_val Decision Tree: 0.8666666666666667
---------------------------------------------
Accuracy Random Forest: 1.0
Accuracy_val Random Forest: 0.9
---------------------------------------------
Accuracy XGBoost: 0.9666666666666667
Accuracy_val XGBoost: 0.9
---------------------------------------------
Accuracy K-NN: 0.9666666666666666
Standard Deviation: 0.05091750772173158
---------------------------------------------
Accuracy Logistic Regression: 0.9555555555555555
Standard Deviation: 0.0737027731190089
----

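The best model is the **SVM (Support Vector Machine) classifier with a linear kernel**. It achieved an accuracy score of `1.0` on the test set, indicating that it correctly classified all the samples in the test set; K-NN, Kernel SVM, Decision Tree and Random Forest reached the same test score. What sets the linear SVM apart is its accuracy score of `0.93` on the validation set, the highest of all the models (tied with Logistic Regression), which further demonstrates its strong performance. Note that the test and validation sets contain only 30 samples each, so these differences correspond to one or two samples.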