# 6. Classification

# Decision tree, KNN, SVM, Logistic regression and Naive bayes

## 1) Import the necessary packages.

In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn import metrics

## 2) Import the dataset

In [2]:
data=pd.read_csv("students_placement_data.csv")
data.head()

Unnamed: 0,Roll No,Gender,Section,SSC Percentage,inter_Diploma_percentage,B.Tech_percentage,Backlogs,registered_for_ Placement_Training,placement status
0,1,M,A,87.3,65.3,40.0,18,NO,Not placed
1,2,F,A,89.0,92.4,71.45,0,yes,Placed
2,3,F,A,67.0,68.0,45.26,13,yes,Not placed
3,4,M,A,71.0,70.4,36.47,17,yes,Not placed
4,5,M,A,,65.5,42.52,17,yes,Not placed


## 3) Data preprocessing 

### 3.1) Check for missing values and fill it

In [3]:
data.isna().any()

Roll No                               False
Gender                                False
Section                               False
SSC Percentage                         True
inter_Diploma_percentage              False
B.Tech_percentage                     False
Backlogs                              False
registered_for_ Placement_Training     True
placement status                      False
dtype: bool

We see that "SSC Percentage" and "registered_for_ Placement_Training"   are having missing values

### 3.1.1) We replace the missing values in "SSC Percentage" with mean SSC percentage of all students.

In [4]:
data["SSC Percentage"]=data["SSC Percentage"].replace(to_replace=np.nan,value=data["SSC Percentage"].mean())  
data.head()

Unnamed: 0,Roll No,Gender,Section,SSC Percentage,inter_Diploma_percentage,B.Tech_percentage,Backlogs,registered_for_ Placement_Training,placement status
0,1,M,A,87.3,65.3,40.0,18,NO,Not placed
1,2,F,A,89.0,92.4,71.45,0,yes,Placed
2,3,F,A,67.0,68.0,45.26,13,yes,Not placed
3,4,M,A,71.0,70.4,36.47,17,yes,Not placed
4,5,M,A,80.474569,65.5,42.52,17,yes,Not placed


### 3.1.2) We replace missing values in "registered_for_ Placement_Training" with the most frequent value in "registered_for_ Placement_Training" column. 

In [5]:
data["registered_for_ Placement_Training"]=data["registered_for_ Placement_Training"].replace(to_replace=np.nan,value=data["registered_for_ Placement_Training"].mode()[0])  
data.head()

# Note: We have used "mode()[0]" . because, mode() returns a series object. But we need only string. 
#So, mode()[0] is used to pic the string from the series. 
# If you execute 

#data["registered_for_ Placement_Training"].mode() 

# then it returns a series object

# 0    yes
# dtype: object

Unnamed: 0,Roll No,Gender,Section,SSC Percentage,inter_Diploma_percentage,B.Tech_percentage,Backlogs,registered_for_ Placement_Training,placement status
0,1,M,A,87.3,65.3,40.0,18,NO,Not placed
1,2,F,A,89.0,92.4,71.45,0,yes,Placed
2,3,F,A,67.0,68.0,45.26,13,yes,Not placed
3,4,M,A,71.0,70.4,36.47,17,yes,Not placed
4,5,M,A,80.474569,65.5,42.52,17,yes,Not placed


###  3.1.3) We check for missing values again and make sure that there are no missing values.

In [6]:
data.isna().any()

Roll No                               False
Gender                                False
Section                               False
SSC Percentage                        False
inter_Diploma_percentage              False
B.Tech_percentage                     False
Backlogs                              False
registered_for_ Placement_Training    False
placement status                      False
dtype: bool

### 3.2) Convert the strings into numbers

###  _Most of the classification algorithms cannot handle string information. So, Its better to change the string into numbers_

In [7]:
data['Gender'] = data['Gender'].map({'M': 0, 'F': 1})
data['Section']=data['Section'].map({'A':0,'B':1})
data['registered_for_ Placement_Training']=data['registered_for_ Placement_Training'].map({'NO':0,'yes':1})

### Now have a look at our dataset again.

In [8]:
data.head()

Unnamed: 0,Roll No,Gender,Section,SSC Percentage,inter_Diploma_percentage,B.Tech_percentage,Backlogs,registered_for_ Placement_Training,placement status
0,1,0,0,87.3,65.3,40.0,18,0,Not placed
1,2,1,0,89.0,92.4,71.45,0,1,Placed
2,3,1,0,67.0,68.0,45.26,13,1,Not placed
3,4,0,0,71.0,70.4,36.47,17,1,Not placed
4,5,0,0,80.474569,65.5,42.52,17,1,Not placed


## 4) Divide the data into features and labels.
We are considering __"Gender","Section","SSC  Percentage","inter_Diploma_percentage","B.Tech_percentage","Backlogs", "registered_for_ Placement_Training"__ as our features and __"placement status"__ as our label.

In [9]:
# features
X=data[["Gender","Section","SSC Percentage","inter_Diploma_percentage","B.Tech_percentage","Backlogs","registered_for_ Placement_Training"]].values

In [10]:
# Labels
y=data["placement status"]

## 5) Now split the data in training data and test data.
Note:training data is used to built the model and test data is used to evaluate the model.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

### 5.1) Its always good to check the shapes of X, y ,X_train, X_test, y_train, y_test.
Note: 80% is used for training and 20% is used for testing

In [12]:
print (" The shape of original feature set is : ",X.shape)
print("The shape of original class labels is", y.shape)
print(" The shape of Features in training data",X_train.shape)
print("The shape of class labels in training data",y_train.shape)
print(" The shape of Features in test data",X_test.shape)
print("The shape of class labels in test data",y_test.shape)

 The shape of original feature set is :  (117, 7)
The shape of original class labels is (117,)
 The shape of Features in training data (93, 7)
The shape of class labels in training data (93,)
 The shape of Features in test data (24, 7)
The shape of class labels in test data (24,)


# Now, Let's apply different models

#  <span style="color:red"> <center> 1) Decision tree </centre> </span>

In [13]:
model_DT = DecisionTreeClassifier() # Create Decision Tree classifer object
model_DT = model_DT.fit(X_train,y_train) # Build the model using training data
y_predict_DT = model_DT.predict(X_test) # Apply the model on test data.

In [14]:
Actual_predicted_DT=pd.DataFrame({'predicted':y_predict_DT, 'Actual':y_test})
Actual_predicted_DT.head()

Unnamed: 0,predicted,Actual
45,Not placed,Placed
95,Not placed,Not placed
56,Not placed,Not placed
84,Not placed,Placed
98,Placed,Placed


## Check the performance of Decision tree

### Draw the confusion matrix of decision tree

In [15]:
from sklearn.metrics import confusion_matrix

pd.DataFrame(
    confusion_matrix(y_test, y_predict_DT),
    columns=['Not placed Predicted', 'placed predicted'],
    index=['Not placed', 'placed']
)

Unnamed: 0,Not placed Predicted,placed predicted
Not placed,10,4
placed,4,6


### Calculate Accuracy, recall, precision, f1 Score, of decision tree.

In [16]:
print("Accuracy of decison tree is " ,metrics.accuracy_score(y_test, y_predict_DT)*100)
print("Recall of decision tree is",metrics.recall_score(y_test, y_predict_DT, pos_label="Placed"))
print("Precision of decision tree is",metrics.precision_score(y_test, y_predict_DT, pos_label="Placed"))
print("f1_score of decision tree is",metrics.f1_score(y_test, y_predict_DT, pos_label="Placed"))

Accuracy of decison tree is  66.66666666666666
Recall of decision tree is 0.6
Precision of decision tree is 0.6
f1_score of decision tree is 0.6


#  <span style="color:red"> <center> 2) K nearest Neighbour </centre> </span>

In [17]:
model = KNeighborsClassifier(n_neighbors=3)
model = model.fit(X_train,y_train)
y_predict_knn = model.predict(X_test)

In [18]:
Actual_predicted_knn=pd.DataFrame({'predicted':y_predict_knn, 'Actual':y_test})
Actual_predicted_knn.head()

Unnamed: 0,predicted,Actual
45,Placed,Placed
95,Not placed,Not placed
56,Not placed,Not placed
84,Not placed,Placed
98,Placed,Placed


### Draw the confusion matrix of K nearest Neighbour

In [19]:
from sklearn.metrics import confusion_matrix

pd.DataFrame(
    confusion_matrix(y_test, y_predict_knn),
    columns=['Not placed Predicted', 'placed predicted'],
    index=['Not placed', 'placed']
)

Unnamed: 0,Not placed Predicted,placed predicted
Not placed,12,2
placed,3,7


### Calculate Accuracy, recall, precision, f1 Score, of K nearest Neighbour.

In [20]:
print("Accuracy of K nearest Neighbour is" ,metrics.accuracy_score(y_test, y_predict_knn)*100)
print("Recall of K nearest Neighbour is",metrics.recall_score(y_test, y_predict_knn, pos_label="Placed"))
print("Precision of K nearest Neighbour is",metrics.precision_score(y_test, y_predict_knn, pos_label="Placed"))
print("f1_score of K nearest Neighbour is",metrics.f1_score(y_test, y_predict_knn, pos_label="Placed"))

Accuracy of K nearest Neighbour is 79.16666666666666
Recall of K nearest Neighbour is 0.7
Precision of K nearest Neighbour is 0.7777777777777778
f1_score of K nearest Neighbour is 0.7368421052631577


#  <span style="color:red"> <center> 3) Logistic Regresssion </centre> </span>

In [37]:
model = LogisticRegression(max_iter=2000) # The model needed 2000 iterations to converge. (To get the optimal parameters)
model = model.fit(X_train,y_train)
y_predict_lr = model.predict(X_test)

In [34]:
Actual_predicted=pd.DataFrame({'predicted':y_predict_lr, 'Actual':y_test})
Actual_predicted.head()

Unnamed: 0,predicted,Actual
45,Placed,Placed
95,Not placed,Not placed
56,Not placed,Not placed
84,Not placed,Placed
98,Placed,Placed


### Draw the confusion matrix of Logistic Regresssion

In [23]:
from sklearn.metrics import confusion_matrix

pd.DataFrame(
    confusion_matrix(y_test, y_predict_lr),
    columns=['Not placed Predicted', 'placed predicted'],
    index=['Not placed', 'placed']
)

Unnamed: 0,Not placed Predicted,placed predicted
Not placed,12,2
placed,0,10


### Calculate Accuracy, recall, precision, f1 Score, of Logistic Regresssion.

In [24]:
print("Accuracy of Logistic Regresssion is " ,metrics.accuracy_score(y_test, y_predict_lr)*100)
print("recall of Logistic Regresssion is",metrics.recall_score(y_test, y_predict_lr, pos_label="Placed"))
print("Precision of Logistic Regresssion is",metrics.precision_score(y_test, y_predict_lr, pos_label="Placed"))
print("f1_score of Logistic Regresssion is",metrics.f1_score(y_test, y_predict_lr, pos_label="Placed"))

Accuracy of Logistic Regresssion is  91.66666666666666
recall of Logistic Regresssion is 1.0
Precision of Logistic Regresssion is 0.8333333333333334
f1_score of Logistic Regresssion is 0.9090909090909091


#  <span style="color:red"> <center> 4) Support Vector Machine </centre> </span>

In [25]:
model = SVC()
model = model.fit(X_train,y_train)
y_predict_svc = model.predict(X_test)

In [26]:
Actual_predicted=pd.DataFrame({'predicted':y_predict_svc, 'Actual':y_test})
Actual_predicted.head()

Unnamed: 0,predicted,Actual
45,Not placed,Placed
95,Not placed,Not placed
56,Not placed,Not placed
84,Not placed,Placed
98,Placed,Placed


### Draw the confusion matrix of Support Vector Machine

In [27]:
from sklearn.metrics import confusion_matrix

pd.DataFrame(
    confusion_matrix(y_test, y_predict_svc),
    columns=['Not placed Predicted', 'placed predicted'],
    index=['Not placed', 'placed']
)

Unnamed: 0,Not placed Predicted,placed predicted
Not placed,14,0
placed,6,4


### Calculate Accuracy, recall, precision, f1 Score, of Support Vector Machine.

In [28]:
print("Accuracy of Support Vector Machine is " ,metrics.accuracy_score(y_test, y_predict_svc)*100)
print("recall of Support Vector Machine is",metrics.recall_score(y_test, y_predict_svc, pos_label="Placed"))
print("Precision of Support Vector Machine is",metrics.precision_score(y_test, y_predict_svc, pos_label="Placed"))
print("f1_score of Support Vector Machine is",metrics.f1_score(y_test, y_predict_svc, pos_label="Placed"))

Accuracy of Support Vector Machine is  75.0
recall of Support Vector Machine is 0.4
Precision of Support Vector Machine is 1.0
f1_score of Support Vector Machine is 0.5714285714285715


#  <span style="color:red"> <center> 4) Naive bayes </centre> </span>

In [29]:
model = GaussianNB()
model = model.fit(X_train,y_train)
y_predict_nb = model.predict(X_test)

In [30]:
Actual_predicted=pd.DataFrame({'predicted':y_predict_svc, 'Actual':y_test})
Actual_predicted.head()

Unnamed: 0,predicted,Actual
45,Not placed,Placed
95,Not placed,Not placed
56,Not placed,Not placed
84,Not placed,Placed
98,Placed,Placed


In [31]:
from sklearn.metrics import confusion_matrix

pd.DataFrame(
    confusion_matrix(y_test, y_predict_svc),
    columns=['Not placed Predicted', 'placed predicted'],
    index=['Not placed', 'placed']
)

Unnamed: 0,Not placed Predicted,placed predicted
Not placed,14,0
placed,6,4


In [32]:
print("Accuracy of Naive bayes is " ,metrics.accuracy_score(y_test, y_predict_svc)*100)
print("recall of Naive bayes is",metrics.recall_score(y_test, y_predict_svc, pos_label="Placed"))
print("Precision of Naive bayes is",metrics.precision_score(y_test, y_predict_svc, pos_label="Placed"))
print("f1_score of Naive is",metrics.f1_score(y_test, y_predict_svc, pos_label="Placed"))

Accuracy of Naive bayes is  75.0
recall of Naive bayes is 0.4
Precision of Naive bayes is 1.0
f1_score of Naive is 0.5714285714285715
