In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Data Importing

In [7]:
data = pd.read_csv("Placement_Data_Full_Class.csv")

In [8]:
data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [9]:
data.drop(['sl_no','ssc_b','hsc_b','hsc_s','degree_t','salary'],axis=1,inplace=True)

In [10]:
data.head()

Unnamed: 0,gender,ssc_p,hsc_p,degree_p,workex,etest_p,specialisation,mba_p,status
0,M,67.0,91.0,58.0,No,55.0,Mkt&HR,58.8,Placed
1,M,79.33,78.33,77.48,Yes,86.5,Mkt&Fin,66.28,Placed
2,M,65.0,68.0,64.0,No,75.0,Mkt&Fin,57.8,Placed
3,M,56.0,52.0,52.0,No,66.0,Mkt&HR,59.43,Not Placed
4,M,85.8,73.6,73.3,No,96.8,Mkt&Fin,55.5,Placed


In [11]:
data.shape

(215, 9)

In [12]:
data.isnull().sum()

gender            0
ssc_p             0
hsc_p             0
degree_p          0
workex            0
etest_p           0
specialisation    0
mba_p             0
status            0
dtype: int64

# Preprocessing

## Encoding

Converting categorical columns into binary values

In [13]:
data['gender'] = data.gender.map({'M':0,'F':1})
data["workex"] = data.workex.map({"No":0, "Yes":1})
data["status"] = data.status.map({"Not Placed":0, "Placed":1})
data["specialisation"] = data.specialisation.map({"Mkt&HR":0, "Mkt&Fin":1})

## Balincing Dataset

In [14]:
data['status'].value_counts()

status
1    148
0     67
Name: count, dtype: int64

In [15]:
from sklearn.utils import resample

data_majority = data[data['status']==1]
data_minority = data[data['status']==0]

data_minority_upsampled = resample(data_minority, 
                                   replace=True,
                                   n_samples=len(data_majority),
                                   random_state=42
                                  )

balanced_data = pd.concat([data_majority, data_minority_upsampled])

In [16]:
balanced_data['status'].value_counts()

status
1    148
0    148
Name: count, dtype: int64

## Train Test Split

In [17]:
X = balanced_data.copy().drop('status',axis=1)
y = balanced_data['status']

In [18]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3)

## Feature Scaling

In [19]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Model Training

## Algorithm 1: K-Nearest Neighbor

In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X_train,y_train)

In [21]:
knn_y_pred = knn.predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix

knn_accuracy = accuracy_score(knn_y_pred,y_test)
print('KNN Accuracy: \n', knn_accuracy,'\n')

knn_cf = confusion_matrix(y_test,knn_y_pred)
print('KNN Confusion Matrix: \n',knn_cf)

KNN Accuracy: 
 0.7752808988764045 

KNN Confusion Matrix: 
 [[36 12]
 [ 8 33]]


## Algorithm 2: Decision Trees

In [23]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()

decision_tree.fit(X_train,y_train)

In [24]:
decision_tree_y_pred = decision_tree.predict(X_test)

In [25]:
decision_tree_accuracy = accuracy_score(decision_tree_y_pred,y_test)
print('Decision Trees Accuracy: \n', decision_tree_accuracy,'\n')

decision_tree_cf = confusion_matrix(y_test,decision_tree_y_pred)
print('Decision Trees Matrix: \n',decision_tree_cf)

Decision Trees Accuracy: 
 0.797752808988764 

Decision Trees Matrix: 
 [[31 17]
 [ 1 40]]


## Algorithm 3: SVM

In [26]:
from sklearn.svm import SVC

svm = SVC()

svm.fit(X_train, y_train)

In [27]:
svm_y_pred = svm.predict(X_test)

In [28]:
svc_accuracy = accuracy_score(svm_y_pred,y_test)
print('SVM Accuracy: \n', svc_accuracy,'\n')

svc_accuracy_cf = confusion_matrix(y_test,svm_y_pred)
print('SVM Confusion Matrix: \n',svc_accuracy_cf)

SVM Accuracy: 
 0.797752808988764 

SVM Confusion Matrix: 
 [[35 13]
 [ 5 36]]


## Algorithm 4: Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=1000)

random_forest.fit(X_train, y_train)

In [30]:
random_forest_y_pred = random_forest.predict(X_test)

In [31]:
randoem_forest_accuracy = accuracy_score(random_forest_y_pred,y_test)
print('Random Forest Accuracy: \n', randoem_forest_accuracy,'\n')

randoem_forest_cf = confusion_matrix(y_test,random_forest_y_pred)
print('Random Forest Matrix: \n',randoem_forest_cf)

Random Forest Accuracy: 
 0.8764044943820225 

Random Forest Matrix: 
 [[38 10]
 [ 1 40]]


## Algorithm 5: Gaussian Naive Bayes

In [32]:
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB()

gaussian.fit(X_train, y_train)

In [33]:
gaussian_y_pred = gaussian.predict(X_test)

In [34]:
gaussian_accuracy = accuracy_score(gaussian_y_pred,y_test)
print('Gaussian Naive Bayes Accuracy: \n', gaussian_accuracy,'\n')

gaussian_cf = confusion_matrix(y_test,gaussian_y_pred)
print('Gaussian Naive Bayes Matrix: \n',gaussian_cf)

Gaussian Naive Bayes Accuracy: 
 0.8314606741573034 

Gaussian Naive Bayes Matrix: 
 [[36 12]
 [ 3 38]]


## Algorithm 6: Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression()

lg.fit(X_train, y_train)

In [36]:
lg_y_pred = lg.predict(X_test)

In [37]:
logistic_reg_accuracy = accuracy_score(lg_y_pred,y_test)
print('Logistic Regression Accuracy: \n', logistic_reg_accuracy,'\n')

logistic_reg_cf = confusion_matrix(y_test,lg_y_pred)
print('Logistic Regression Matrix: \n',logistic_reg_cf)

Logistic Regression Accuracy: 
 0.8089887640449438 

Logistic Regression Matrix: 
 [[33 15]
 [ 2 39]]


# Predictive system

In [40]:
def prediction( gender, ssc_p, hsc_p, degree_p, workex, etest_p, specialisation, mba_p):
    data = {
    'gender': [gender],
    'ssc_p': [ssc_p],
    'hsc_p': [hsc_p],
    'degree_p': [degree_p],
    'workex': [workex],
    'etest_p': [etest_p],
    'specialisation': [specialisation],
    'mba_p': [mba_p]
    }
    data = pd.DataFrame(data)
    data['gender'] = data['gender'].map({'M':1,"F":0})
    data['workex'] = data['workex'].map({"Yes":1,"No":0})
    data['specialisation'] = data['specialisation'].map({"Mkt&HR":1,"Mkt&Fin":0})
    scaled_df = scaler.fit_transform(data)
    result = random_forest.predict(scaled_df).reshape(1, -1)
    if result == 1:
        print('Place')
    else:
        print('Not Place')

In [41]:
gender = "F"
ssc_p =58.
hsc_p = 61.
degree_p = 60.
workex = "Yes"
etest_p = 62.
specialisation = "Mkt&Fin"
mba_p = 60.85

result = prediction(gender, ssc_p, hsc_p, degree_p, workex, etest_p, specialisation, mba_p)

Not Place


# Save File

In [42]:
import pickle

pickle.dump(random_forest,open('model.pkl','wb'))
pickle.dump(scaler,open('scaler.pkl','wb'))