# Assignment on Classification

## Loading the dataset into the Python environment

In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Location of the Excel workbook. Set the IRIS_DATA_PATH environment variable to
# read the file from somewhere else; when it is unset, the original local path is used.
IRIS_DATA_PATH = os.environ.get(
    "IRIS_DATA_PATH",
    r'C:\Users\Dell\Desktop\Files\Data Science\Assignments\Assignment 5\iris.xls',
)
iris = pd.read_excel(IRIS_DATA_PATH)
# Preview the first rows to confirm the file loaded as expected.
iris.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Exploratory Data Analysis

#### Analysing data

In [3]:
# Dimensions of the loaded frame as (rows, columns).
iris.shape

(150, 5)

In [4]:
# Print each column's dtype and non-null count; a count below the row total means missing values.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              143 non-null    float64
 1   SW              144 non-null    float64
 2   PL              144 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
iris.describe()

Unnamed: 0,SL,SW,PL,PW
count,143.0,144.0,144.0,150.0
mean,5.855944,3.049306,3.75625,1.198667
std,0.828168,0.430644,1.761306,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


#### Missing values

In [6]:
# Number of missing entries in each column.
iris.isna().sum()

SL                7
SW                6
PL                6
PW                0
Classification    0
dtype: int64

In [7]:
# Column labels of the frame.
iris.columns

Index(['SL', 'SW', 'PL', 'PW', 'Classification'], dtype='object')

#### Correlations

In [8]:
# Pairwise correlation between the numeric columns; numeric_only=True leaves out
# the non-numeric Classification column.
iris.corr(numeric_only = True)

Unnamed: 0,SL,SW,PL,PW
SL,1.0,-0.102511,0.873444,0.821584
SW,-0.102511,1.0,-0.428465,-0.3469
PL,0.873444,-0.428465,1.0,0.961679
PW,0.821584,-0.3469,0.961679,1.0


## Pre-processing

#### Handling missing values

In [9]:
# Fill the gaps in each measurement column with that column's own mean.
# The result is assigned back to the frame rather than calling an inplace method
# on a column selection: that chained form triggers a pandas warning and does not
# update the frame when Copy-on-Write is enabled.
columns_with_gaps = ['SL', 'SW', 'PL']
iris[columns_with_gaps] = iris[columns_with_gaps].fillna(iris[columns_with_gaps].mean())

In [10]:
# Re-count missing entries per column after imputation.
iris.isna().sum()

SL                0
SW                0
PL                0
PW                0
Classification    0
dtype: int64

#### Splitting the dataset into train data and test data

In [11]:
# Separate the label column (y) from the remaining feature columns (x).
target_column = 'Classification'
y = iris[target_column]
x = iris.drop(columns=[target_column])

In [12]:
# Shapes of the feature matrix and the label vector.
(x.shape, y.shape)

((150, 4), (150,))

In [13]:
# Scaler used to standardise the feature columns (constructed with default settings).
scalar = StandardScaler()

In [14]:
# Fit the scaler on the features and replace x with the standardised values.
# NOTE(review): this runs before the train/test split below, so the scaling
# statistics are computed from every row, including the rows later held out for testing.
x = scalar.fit_transform(x)

In [15]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# x was standardised using every row before this split, so the held-out rows
# influenced the scaling. Standardisation is an affine map per column, so
# re-fitting a scaler on the training rows alone yields exactly the values that
# train-only scaling of the raw features would give, and the test rows are then
# transformed with training statistics only.
train_scaler = StandardScaler().fit(x_train)
x_train = train_scaler.transform(x_train)
x_test = train_scaler.transform(x_test)

## Model Evaluation

In [16]:
# Seed for the estimators that use randomness, so repeated runs give the same scores.
RANDOM_STATE = 42

# Classifiers to compare, keyed by the display name used when reporting accuracy.
# All use default hyper-parameters apart from the fixed seed on the tree-based models.
models = {
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE)
}

In [17]:
# Train each classifier on the training rows and report its accuracy on the held-out rows.
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained directly.
    y_pred = model.fit(x_train, y_train).predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

K-Nearest Neighbors Accuracy: 0.9667
Support Vector Machine Accuracy: 0.9667
Decision Tree Accuracy: 1.0000
Random Forest Accuracy: 1.0000


__Accuracies of Different Models__  
K-Nearest Neighbors Accuracy: 96.67 %  
Support Vector Machine Accuracy: 96.67 %  
Decision Tree Accuracy: 100 %  
Random Forest Accuracy: 100 %

__From the above results, the Decision Tree and Random Forest models achieve a perfect accuracy score on the test set. However, a perfect score on a test set of only 30 samples is unlikely to hold in general, so these two can be considered the best models here, with nearly perfect accuracy.__