# Case study on Supervised learning

In [1]:
#importing pandas,numply,matplotlib.pyplot,seaborn libaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# 1. Read the dataset to the python environment.

In [2]:
#Reading the dataset to the pandas environment
Data=pd.read_csv(r'iris .1.csv')
Data.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
#Check for the null values present in each column of  dataset using 'isnull().sum()' function
Data.isnull().sum()

SL                7
SW                6
PL                6
PW                0
Classification    0
dtype: int64

# Insight:
Here SL,SW and PL contains null values.

In [4]:
# To know the shape of Data
Data.shape

(150, 5)

In [5]:
#To know data types of each columns and checking for null values in the columns
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              143 non-null    float64
 1   SW              144 non-null    float64
 2   PL              144 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


# Insight:

**About our Dataset:**
<br>
**1.This dataset contains details of 150 entries.**
<br>
**2.There are 5 columns/features.**
<br>
**3.1 of the features are numerical and object data types**
</p>

# Data.describe()

In [7]:
Data.describe(include='object')

Unnamed: 0,Classification
count,150
unique,3
top,Iris-versicolor
freq,50


# 2. Do necessary pre-processing steps.


In [8]:
#preprocessing using Labelencoder
from sklearn.preprocessing import LabelEncoder
Ie=LabelEncoder()
Data['Classification']=Ie.fit_transform(Data['Classification'])
Data.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [9]:
# Droping classification in x axis
x=Data.drop(['Classification'],axis=1)
y=Data['Classification']
x.describe()

Unnamed: 0,SL,SW,PL,PW
count,143.0,144.0,144.0,150.0
mean,5.855944,3.049306,3.75625,1.198667
std,0.828168,0.430644,1.761306,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [10]:
#one hot encoding
Data=pd.get_dummies(Data)
Data.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [11]:
min_max=preprocessing.MinMaxScaler(feature_range=(0,1))
x=min_max.fit_transform(x)
x=pd.DataFrame(x)
x.describe()

Unnamed: 0,0,1,2,3
count,143.0,144.0,144.0,150.0
mean,0.432207,0.437211,0.467161,0.457778
std,0.230047,0.179435,0.298526,0.317984
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


In [13]:
x=np.nan_to_num(x)
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)
from sklearn import linear_model
lr=linear_model.LinearRegression()
model=lr.fit(x_train,y_train)
predictions=model.predict(x_test)
from sklearn.metrics import mean_squared_error
print('MSE is:',mean_squared_error(y_test,predictions))
from sklearn.metrics import r2_score
print('R Squarred Value is:',r2_score(y_test,predictions))

MSE is: 0.03036409477977395
R Squarred Value is: 0.9565537594566033


# Insight:
**Mean square error is 0.031 and R Squarred Value is: 0.956.**

# 3. Find out which classification model gives the best result to predict iris species.(also do random forest algorithm)


# a)Classification models

# 
**1.Logistic Regression**

In [70]:
model=LogisticRegression()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

In [71]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
print("confusion matrix:\n",confusion_matrix(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred,y_test)*100)
print("Precision:",precision_score(y_pred,y_test,average="macro")*100)
print("Recall:",recall_score(y_pred,y_test,average="macro")*100)
print("F1 Score:",f1_score(y_pred,y_test,average="macro")*100)

confusion matrix:
 [[10  1  0]
 [ 0  8  0]
 [ 0  0 11]]
Accuracy: 96.66666666666667
Precision: 96.29629629629629
Recall: 96.96969696969697
F1 Score: 96.45191409897292


# Insight:
Here we get Accuracy=96.667,Precision: 96.29 and F1 Score: 96.45.
    

# 
**2.KNN (k-nearest neighbor)**

In [38]:
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

In [66]:
print("confusion matrix:\n",confusion_matrix(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred,y_test))
print("Precision:",precision_score(y_pred,y_test,average="micro"))
print("Recall:",recall_score(y_pred,y_test,average="micro"))
print("F1 Score:",f1_score(y_pred,y_test,average="micro"))

confusion matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


# Insight:
**Accuracy,Precision,F1 Score of K-NN classifier is 1.Thus,k-nearest neighbor is a good model of Classifiction.**


# 
**3.Decision Tree**

In [40]:
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

In [67]:
print("confusion matrix:\n",confusion_matrix(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred,y_test))
print("Precision:",precision_score(y_pred,y_test,average="macro"))
print("Recall:",recall_score(y_pred,y_test,average="macro"))
print("F1 Score:",f1_score(y_pred,y_test,average="macro"))

confusion matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


# Insight:
**Accuracy,Precision,F1 Score of Decision Tree is 1.Thus,Decision Tree is a good model of Classifiction.**

# 
**4.Support Vector Machine**

In [42]:
from sklearn.svm import SVC
model=SVC()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

In [68]:
print("confusion matrix:\n",confusion_matrix(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred,y_test))
print("Precision:",precision_score(y_pred,y_test,average="macro"))
print("Recall:",recall_score(y_pred,y_test,average="macro"))
print("F1 Score:",f1_score(y_pred,y_test,average="macro"))

confusion matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


# Insight:
**Accuracy,Precision,F1 Score of Support Vector Machine is 1.Thus,Support Vector Machine is a good model of Classifiction.**

# 
**5.Linear SVM**

In [62]:
from sklearn.svm import SVC
svm_linear=SVC(kernel="linear")
svm_linear.fit(x_train,y_train)
y_pred=svm_linear.predict(x_test)

In [63]:
print("confusion matrix:\n",confusion_matrix(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred,y_test))
print("Precision:",precision_score(y_pred,y_test,average="macro"))
print("Recall:",recall_score(y_pred,y_test,average="macro"))
print("F1 Score:",f1_score(y_pred,y_test,average="macro"))

confusion matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


# Insight:
**Accuracy,Precision,F1 Score of Linear SVM is 1.Thus,Linear SVM is a good model of Classifiction.**

# 
**6.Polynominal SVM**

In [64]:
from sklearn.svm import SVC
svm_poly=SVC(kernel="poly",degree=1)
svm_poly.fit(x_train,y_train)
y_pred=svm_poly.predict(x_test)

In [65]:
print("confusion matrix:\n",confusion_matrix(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred,y_test))
print("Precision:",precision_score(y_pred,y_test,average="macro"))
print("Recall:",recall_score(y_pred,y_test,average="macro"))
print("F1 Score:",f1_score(y_pred,y_test,average="macro"))

confusion matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


# Insight:
**Accuracy,Precision,F1 Score of Polynominal SVM is 1.Thus,Polynominal SVM is a good model of Classifiction.**

# b)random forest algorithm

In [53]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)

In [60]:
print("confusion matrix:\n",confusion_matrix(y_pred,y_test))
print("Accuracy:",accuracy_score(y_pred,y_test))
print("Precision:",precision_score(y_pred,y_test,average="micro"))
print("Recall:",recall_score(y_pred,y_test,average="micro"))
print("F1 Score:",f1_score(y_pred,y_test,average="micro"))

confusion matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


# Insight:
**Accuracy,Precision,F1 Score of random forest algorithm is 1.Thus,random forest algorithm is also  a good model of Classifiction.**    