### Importing the Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix ,accuracy_score,recall_score,precision_score

### Importing the Dataset

In [2]:
df = pd.read_csv('../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')

In [3]:
df.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


## Get A better Intuition about the data

Now that we have our data,
- We will try and look for null values in our data.
- We will use the describe() method to find the mean of different columns and some other data that might help us in making our model.
- Finding Unique the categories


### Checking for null values

In [4]:
print(df.isnull().sum())

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64


> ### So salary column contains null values

In [5]:
df.describe()

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
count,215.0,215.0,215.0,215.0,215.0,215.0,148.0
mean,108.0,67.303395,66.333163,66.370186,72.100558,62.278186,288655.405405
std,62.209324,10.827205,10.897509,7.358743,13.275956,5.833385,93457.45242
min,1.0,40.89,37.0,50.0,50.0,51.21,200000.0
25%,54.5,60.6,60.9,61.0,60.0,57.945,240000.0
50%,108.0,67.0,65.0,66.0,71.0,62.0,265000.0
75%,161.5,75.7,73.0,72.0,83.5,66.255,300000.0
max,215.0,89.4,97.7,91.0,98.0,77.89,940000.0


In [6]:
 df.shape

(215, 15)

#### Finding the categories

In [7]:
print(df['workex'].unique())
print(df['specialisation'].unique())
print(df['status'].unique())

['No' 'Yes']
['Mkt&HR' 'Mkt&Fin']
['Placed' 'Not Placed']


## Dropping Unnecessary Colums
We will drop the sl_no and salary column as they wont help us in any way. The salary also has null values so dropping it means we dint have to impute these values and it won't help us in predicting if a person gets placed or not.

In [8]:
df1 = df.copy()
df1.drop(['sl_no','salary'],axis = 1,inplace = True)
df1.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed


In [9]:
 df1.shape

(215, 13)

### Encoding Categorical Values

We have values in the columns that are strings. In order to train our model, we need to convert these strings to numbers in order to feed it to the model.



In [10]:
df1['status']= df1['status'].map({'Placed':1,'Not Placed':0})
df1['workex']= df1['workex'].map({'Yes':1,'No':0})
df1['gender']= df1['gender'].map({'M':1,'F':0})
df1['hsc_b']= df1['hsc_b'].map({'Central':1,'Others':0})
df1['ssc_b']= df1['ssc_b'].map({'Central':1,'Others':0})
df1['specialisation'] = df1['specialisation'].map({'Mkt&HR': 1 , 'Mkt&Fin' : 0})
df1['degree_t']= df1['degree_t'].map({'Sci&Tech':0,'Comm&Mgmt':1,'Others':2})
df1['hsc_s']= df1['hsc_s'].map({'Commerce':0,'Science':1,'Arts':2})

In [11]:
df1.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,1,67.0,0,91.0,0,0,58.0,0,0,55.0,1,58.8,1
1,1,79.33,1,78.33,0,1,77.48,0,1,86.5,0,66.28,1
2,1,65.0,1,68.0,1,2,64.0,1,0,75.0,0,57.8,1
3,1,56.0,1,52.0,1,1,52.0,0,0,66.0,1,59.43,0
4,1,85.8,1,73.6,1,0,73.3,1,0,96.8,0,55.5,1


## Splitting data into features and target

We won't use all of these values. We will use the `ssc_p`,`hsc_p`,`degree_p`,`workex`,`mba_p`,`etest_p`,`gender`,`degree_t` and `specialisation` column as our features to predict whether a candidate will be __placed or not__.



In [12]:
X =df1[['ssc_p','hsc_p','degree_p','workex','mba_p','etest_p','gender','degree_t','specialisation']]
Y = df1.iloc[:,-1]

In [13]:
X.head()

Unnamed: 0,ssc_p,hsc_p,degree_p,workex,mba_p,etest_p,gender,degree_t,specialisation
0,67.0,91.0,58.0,0,58.8,55.0,1,0,1
1,79.33,78.33,77.48,1,66.28,86.5,1,0,0
2,65.0,68.0,64.0,0,57.8,75.0,1,1,0
3,56.0,52.0,52.0,0,59.43,66.0,1,0,1
4,85.8,73.6,73.3,0,55.5,96.8,1,1,0


In [14]:
print(Y)

0      1
1      1
2      1
3      0
4      1
      ..
210    1
211    1
212    1
213    1
214    0
Name: status, Length: 215, dtype: int64


### Splitting the dataset into training set and test set

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

### Feature Scaling

In [16]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

### Applying classification models on the Training set

_Our goal is to predict whether a candidate will be placed or not. This is a classification problem so we will use  __KNN__ and __SVM__ and compare how these perform individually._



### K Nearest Neighbor

In [17]:
from sklearn.neighbors import KNeighborsClassifier


In [18]:
classifier = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
classifier.fit(x_train,y_train)

KNeighborsClassifier()

In [19]:
prediction = classifier.predict(x_test)
print("Accuracy:",accuracy_score(y_test, prediction)*100)
print("Precision:",precision_score(y_test, prediction)*100)
print("Recall:",recall_score(y_test, prediction)*100)

Accuracy: 86.04651162790698
Precision: 87.87878787878788
Recall: 93.54838709677419


### Support Vector Machine

In [20]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

In [21]:
classifier = SVC()
classifier.fit(x_train, y_train)

SVC()

In [22]:
svc_pred = classifier.predict(x_test)
print("Accuracy:",accuracy_score(y_test, svc_pred)*100)
print("Precision:",precision_score(y_test, svc_pred)*100)
print("Recall:",recall_score(y_test, svc_pred)*100)

Accuracy: 83.72093023255815
Precision: 87.5
Recall: 90.32258064516128
