In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

In [None]:
# Read the placement CSV (expected in the working directory) into the raw frame.
data = pd.read_csv(filepath_or_buffer="Placement_Data_Full_Class.csv")

In [None]:
# Preview the first 10 rows of the raw frame.
data.head(10)

In [None]:
# Work on an independent (deep) copy so the raw `data` frame stays untouched.
placement = data.copy(deep=True)

In [None]:
# (number of rows, number of columns) of the working frame.
placement.shape

In [None]:
# Print per-column dtype and non-null count for the working frame.
placement.info()

In [None]:
# dtype of every column in the working frame.
placement.dtypes

In [None]:
# List the distinct labels present in the target column `status`
# (the per-label counts are shown by value_counts() in the next cell).
placement.status.unique()

In [None]:
# Number of rows per `status` label.
placement.status.value_counts()

# NaN Handling

In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
placement.isna().sum()

In [None]:
# Replace missing salaries with 0 (presumably the students who were not placed;
# a later cell treats salary == 0 as "not placed").
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that form mutates an intermediate object, is deprecated,
# and silently leaves `placement` unchanged under pandas Copy-on-Write.
placement['salary'] = placement['salary'].fillna(0)

In [None]:
# Re-check missing values per column after filling `salary`.
placement.isna().sum()

In [None]:
# Drop sl_no, ssc_b and hsc_b: per the original author they carry redundant
# values that do not contribute to the model.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run (an in-place drop raises KeyError the second time
# because the columns are already gone).
placement = placement.drop(columns=['sl_no', 'ssc_b', 'hsc_b'], errors='ignore')

In [None]:
# Preview the working frame after the column drop.
placement.head()

# Outlier Handling

In [None]:
# 2x2 grid of box plots, one per percentage column, to eyeball outliers.
plt.figure(figsize=(15, 10))

# (subplot position, column, panel title)
boxplot_panels = [
    (221, 'ssc_p', 'Secondary School Percentage'),
    (222, 'hsc_p', 'Higher Secondary Percentage'),
    (223, 'degree_p', 'UG Percentage'),
    (224, 'etest_p', 'Employability Percentage'),
]

for position, column, heading in boxplot_panels:
    ax = plt.subplot(position)
    plt.boxplot(placement[column])
    ax.set_title(heading)

In [None]:
# Quartiles and inter-quartile range of the higher-secondary percentage.
Q1 = placement['hsc_p'].quantile(0.25)
Q3 = placement['hsc_p'].quantile(0.75)

IQR = Q3 - Q1

# Keep only rows whose hsc_p lies inside the fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The mask is deliberately not called `filter`, which would shadow the Python
# builtin for the rest of the kernel session.
hsc_lower_fence = Q1 - 1.5 * IQR
hsc_upper_fence = Q3 + 1.5 * IQR
hsc_in_range = (placement['hsc_p'] >= hsc_lower_fence) & (placement['hsc_p'] <= hsc_upper_fence)
placement_filtered = placement.loc[hsc_in_range]

In [None]:
# Box plot of hsc_p after the IQR filter, to confirm the outliers are gone.
plt.boxplot(placement_filtered['hsc_p'])

In [None]:
# Quartiles and inter-quartile range of the degree (UG) percentage.
Q1 = placement['degree_p'].quantile(0.25)
Q3 = placement['degree_p'].quantile(0.75)
IQR = Q3 - Q1

# Rows whose degree_p lies inside the fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Not named `filter`, so the Python builtin is not shadowed.
degree_in_range = (
    (placement['degree_p'] >= Q1 - 1.5 * IQR)
    & (placement['degree_p'] <= Q3 + 1.5 * IQR)
)

# Rows that survived the hsc_p filter in the previous cell. Previously this
# cell re-selected from the full frame using only the degree_p mask, which
# silently discarded the hsc_p filter and let those outliers back in.
passed_hsc_filter = placement.index.isin(placement_filtered.index)

# NOTE(review): rows are still taken from the raw `data` frame (same index as
# `placement`), because the one-hot encoding cell later drops sl_no / ssc_b /
# hsc_b and therefore expects those columns to be present. As a consequence
# `salary` in `placement_filtered` keeps its NaNs rather than the 0-filled
# values of `placement`.
placement_filtered = data.loc[degree_in_range & passed_hsc_filter]

In [None]:
# Box plot of degree_p after the IQR filter.
plt.boxplot(placement_filtered['degree_p'])

#  Data Visualization

In [None]:
# 2x3 grid of count plots, one per categorical column of the filtered frame.
plt.figure(figsize=(15, 7))

categorical_columns = ['gender', 'hsc_s', 'degree_t', 'specialisation', 'workex', 'status']

# Subplot positions run 231, 232, ..., 236 in the same order as the columns.
for position, column in enumerate(categorical_columns, start=231):
    plt.subplot(position)
    ax = sns.countplot(x=column, data=placement_filtered, palette="flare")

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# Salary distribution of placed students only.
# `salary > 0` excludes both 0 (the NaN fill value used on `placement`) and NaN
# itself; `!= 0` kept NaN rows because NaN != 0 evaluates to True.
placement_placed = placement_filtered[placement_filtered['salary'] > 0]

# histplot replaces the deprecated distplot; kde=True, stat='density' and
# kde_kws=dict(cut=3) are the settings seaborn documents for approximating
# distplot's output.
sns.histplot(placement_placed['salary'], kde=True, stat='density', kde_kws=dict(cut=3))

# Visualizing which stream placed the highest number of students

In [None]:
import plotly.express as px

# Histogram of degree streams from the raw frame, bars split by placement status.
stream_hist_options = dict(
    x="degree_t",
    color="status",
    title="<b>Counts of Stream</b>",
    pattern_shape_sequence=['x'],
    template='plotly_dark',
)
fig = px.histogram(data, **stream_hist_options)

# Centre the title and enlarge its font.
fig.update_layout(
    title_x=0.5,
    title_font={'size': 20},
    uniformtext_minsize=15,
)

fig.show()

# Encoding the Categorical Columns

In [None]:
#Label Encoding

from sklearn.preprocessing import LabelEncoder

# Categorical columns that are replaced by integer codes.
object_cols = ['gender', 'workex', 'specialisation', 'status']

# `placement_filtered` was produced by a .loc selection on another frame;
# take an explicit copy so the column assignments below write to an
# independent frame instead of triggering SettingWithCopyWarning.
placement_filtered = placement_filtered.copy()

label_encoder = LabelEncoder()

# column -> list of original labels; a label's list position is its integer
# code. The single encoder is refitted per column, so without this record the
# mappings of all but the last column would be lost.
# NOTE: re-running this cell encodes the already-encoded integers again and
# overwrites these lists with the integer codes.
label_classes = {}

for col in object_cols:
    placement_filtered[col] = label_encoder.fit_transform(placement_filtered[col])
    label_classes[col] = list(label_encoder.classes_)

placement_filtered.head(10)

In [None]:
# One-hot encode the higher-secondary stream (hsc_s) and degree type (degree_t).
dummy_hsc_s = pd.get_dummies(placement_filtered.hsc_s, prefix='dummy')
dummy_degree_t = pd.get_dummies(placement_filtered.degree_t, prefix='dummy')

# Columns removed from the modelling frame once the dummies are attached.
columns_to_remove = ['sl_no', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'salary']

placement_coded = (
    pd.concat([placement_filtered, dummy_hsc_s, dummy_degree_t], axis='columns')
    .drop(columns=columns_to_remove)
)
placement_coded.head()

#dropping the hsc_s (stream) and degree_t columns because they are now represented by the dummy columns, and the salary column because it is only meaningful for placed students and would give away the target.

# Stream-wise Analysis

In [None]:
# Per-degree-stream summary: mean degree and MBA percentages, plus the sums of
# the label-encoded `workex` and `status` columns (presumably counts of students
# with work experience / placed, given the 0/1 codes — confirm the encoding).
stream_aggregations = {
    'degree_p': 'mean',
    'workex': 'sum',
    'mba_p': 'mean',
    'status': 'sum',
}
stream_wise = placement_filtered.groupby('degree_t').agg(stream_aggregations)

# Highlight the column-wise maximum in the rendered table.
stream_wise.style.highlight_max()

In [None]:
# Grouped bar chart of the stream-wise summary table.
px.bar(
    stream_wise,
    barmode='group',
    title="<b>Stream wise Analyzing</b>",
    template="plotly_dark",
)

# Visualizing how many students with above-average and below-average degree percentages got placed

In [None]:
# Students whose degree percentage is strictly above the cohort mean.
mean_degree_p = placement['degree_p'].mean()
degree_p_above_avg = placement.loc[placement['degree_p'] > mean_degree_p]

degree_p_above_avg

In [None]:
# Degree-percentage histogram for the above-average group, split by status.
above_avg_options = dict(
    x='degree_p',
    color='status',
    title="<b>Above Average Percentage Vs Placement</b>",
    template='plotly',
)
fig = px.histogram(degree_p_above_avg, **above_avg_options)

# Leave a gap between neighbouring bars.
fig.update_layout(bargap=0.2)

fig.show()

In [None]:
# Students whose degree percentage is strictly below the cohort mean.
mean_degree_p = placement['degree_p'].mean()
degree_p_below_avg = placement.loc[placement['degree_p'] < mean_degree_p]

degree_p_below_avg

In [None]:
# Degree-percentage histogram for the below-average group, split by status.
below_avg_options = dict(
    x='degree_p',
    color='status',
    title="<b>Below Average Percentage Vs Placement</b>",
    template='plotly',
)
fig = px.histogram(degree_p_below_avg, **below_avg_options)

# Leave a gap between neighbouring bars.
fig.update_layout(bargap=0.2)

fig.show()

# Store Feature Matrix in X and Response(Target) in y

In [None]:
# Column labels of the encoded modelling frame (features plus `status`).
placement_coded.columns

In [None]:
# Target vector: the encoded placement status.
y = placement_coded['status']
# Feature matrix: every remaining column.
X = placement_coded.drop(columns=['status'])

# Splitting the Dataset into training and test set

In [None]:
from sklearn.model_selection import train_test_split
# 80 % of the rows go to training, the rest to testing; seed fixed for a repeatable split.
split_parts = train_test_split(X, y, train_size=0.8, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Classification Models

In [None]:
#Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Shallow Gini decision tree.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split even with the default "best" splitter; without a seed, ties
# between equally good splits can resolve differently from run to run and the
# reported accuracy is not reproducible. The seed matches the train/test split.
dt = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)

dt = dt.fit(X_train, y_train)
y_pred1 = dt.predict(X_test)

# Accuracy on the held-out test set.
score1 = metrics.accuracy_score(y_test, y_pred1)
print(score1)

In [None]:
# K Nearest Neighbor

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with scikit-learn's default settings;
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier().fit(X_train, y_train)

y_pred2 = knn.predict(X_test)

# Accuracy on the held-out test set.
score2 = metrics.accuracy_score(y_true=y_test, y_pred=y_pred2)
print(score2)

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

# Logistic regression.
# max_iter is raised from the default of 100: on unscaled features the solver
# may stop before converging, and the resulting ConvergenceWarning would be
# hidden by the global warnings filter set earlier in this notebook.
logreg = LogisticRegression(max_iter=1000)

logreg.fit(X_train, y_train)

y_pred3 = logreg.predict(X_test)

# Accuracy on the held-out test set, computed from the predictions in the same
# way as for the other two models (equivalent to logreg.score(X_test, y_test)).
score3 = metrics.accuracy_score(y_test, y_pred3)
print(score3)

In [None]:
# Summary table: one row per model with its test accuracy in percent.
final_data = pd.DataFrame({
    'Models': ['DT', 'KNN', 'LR'],
    'Accuracy': [score * 100 for score in (score1, score2, score3)],
})

In [None]:
# Display the model-accuracy summary table.
final_data

In [None]:
# Bar chart comparing the test accuracy of the three models.
# x and y are passed by keyword with data=...: seaborn >= 0.12 no longer
# accepts them positionally and raises TypeError for the two-positional form.
sns.barplot(x='Models', y='Accuracy', data=final_data, palette="flare")