# Introduction and Imports

In [None]:
! pip install -q dabl

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

import dabl

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import roc_auc_score

# Global matplotlib look for every static figure in this notebook.
plt.style.use("fivethirtyeight")

# Warm yellow -> near-black palette reused by the plotly charts below.
orange_black = [
    '#fdc029',
    '#df861d',
    '#FF6347',
    '#aa3d01',
    '#a30e15',
    '#800000',
    '#171820',
]

# EDA and Data Preprocessing

Let's see what each column means:
- sl_no: Serial Number
- gender: Gender of the candidate
- ssc_p: Secondary Education percentage - 10th Grade
- ssc_b: Board of Education - Central/ Others
- hsc_p: Higher Secondary Education percentage- 12th Grade
- hsc_b: Board of Education- Central/ Others
- hsc_s: Specialization in Higher Secondary Education
- degree_p: Degree Percentage
- degree_t: Under Graduation(Degree type)- Field of degree education
- workex: Work Experience 
- etest_p: Employability test percentage (Conducted by college)
- specialisation: Post Graduation(MBA)- Specialization
- mba_p: MBA percentage
- status: Status of placement- Placed/Not placed
- salary: Salary offered by corporate to candidates

In [None]:
# Load the campus-placement CSV and preview its first rows.
DATA_PATH = "../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv"
data = pd.read_csv(DATA_PATH)
data.head()

Let's do some basic descriptive statistics on the data.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Let's now first check for the NULL values and do something about them.
As we can see, the `salary` column is the only one having null values (67).

In [None]:
# Number of missing values in each column.
data.isna().sum()

The null values occur only for candidates who haven't been placed, which is expected: they have no salary offer. I will impute these with 0 purely for the sake of visualization.

However, we aren't going to use the `salary` column since **it is not a cause of the `status` column, but rather an effect of `status` column**.

In [None]:
# How many candidates fall into each placement status.
data['status'].value_counts()

In [None]:
# Replace missing salaries with 0; every other column is left as it is.
data = data.fillna({'salary': 0})
data.head()

In [None]:
# The serial number is just a row identifier, so remove it before plotting.
data = data.drop(columns=['sl_no'])

### Let's Start the Visualization now!

## Candidate Gender Chart

In [None]:
# Count the candidates per gender once and feed counts/labels to the pie chart.
gender_counts = data['gender'].value_counts()

fig = px.pie(
    values=gender_counts.tolist(),
    names=list(gender_counts.index),
    title='Gender Value Pie-chart',
)
fig.show()

## Candidate Status (Target Variable) Chart

In [None]:
# Count the candidates per placement status (the target variable).
status_counts = data['status'].value_counts()

fig = px.pie(
    values=status_counts.tolist(),
    names=list(status_counts.index),
    title='Status Value Distribution',
    color_discrete_sequence=["cyan", "blue"],
)
fig.show()

There is a solid Data Imbalance in the dataset. We will have to deal with this later.

## Specialization Distribution
Let's take a look at the `specialisation` column.

In [None]:
# Count the candidates per MBA specialisation.
specialisation_counts = data['specialisation'].value_counts()

fig = px.pie(
    values=specialisation_counts.tolist(),
    names=list(specialisation_counts.index),
    title='Spec. Value Distribution',
    color_discrete_sequence=orange_black,
)
fig.show()

So roughly the same number of candidates have the Marketing-Finance and the Marketing-Human Resources specializations.

## MBA Percent Count Plot
Let's draw up a Count Plot to see the trends of MBA Percent Distributions.

In [None]:
# Mean and standard deviation of the MBA percentage, shown in the chart title.
mba_mean = data['mba_p'].mean()
mba_std = data['mba_p'].std()

fig = px.histogram(
    data,
    x="mba_p",
    marginal="violin",
    hover_data=data.columns,
    color_discrete_sequence=["maroon"],
    title=f"MBA Percent Distribution [\u03BC : ~{mba_mean:.2f}% | \u03C3 : ~{mba_std:.2f} %]",
)

fig.show()

Mean Candidate Percentage in MBA lies around: `62%` with a standard deviation of `6%`

## Salary Distribution
This one will be only for students that actually got placed, since the ones not getting placed will have a salary of `0` which would affect our plot.

In [None]:
# Only placed candidates have a real salary; the 0 values are the imputed
# placeholders for unplaced candidates and would distort the distribution.
placed = data[data['salary'] != 0]

# The title previously carried the MBA-percentage label and statistics
# (copy-paste from the chart above); it now describes the plotted salaries.
fig = px.histogram(
    placed, x="salary",
    marginal="violin",
    hover_data=data.columns,
    color_discrete_sequence=["magenta"],
    title=f"Salary Distribution [\u03BC : ~{placed['salary'].mean():.2f} | \u03C3 : ~{placed['salary'].std():.2f}]",
)

fig.show()

## Work Experience Distribution
Now let's analyze how work experience varies across the dataset.

In [None]:
# Count the candidates with and without prior work experience.
workex_counts = data['workex'].value_counts()

fig = px.pie(
    values=workex_counts.tolist(),
    names=list(workex_counts.index),
    title='Work Exp. Distribution',
    color_discrete_sequence=["gray", "black"],
)
fig.show()

As you can see, more than `65%` of the Candidates have no work experience at all.

## Performance in Employability Test
Now we look at `etest_p` which is **Employability Test Percentage** scored by candidates.

In [None]:
# Mean and standard deviation of the employability-test percentage for the title.
etest_mean = data['etest_p'].mean()
etest_std = data['etest_p'].std()

fig = px.histogram(
    data,
    x="etest_p",
    marginal="box",
    hover_data=data.columns,
    color_discrete_sequence=["red"],
    title=f"Performance in Employability Test [\u03BC : ~{etest_mean:.2f}% | \u03C3 : ~{etest_std:.2f} %]",
)

fig.show()

An average student has scored *72%* in Employability test, where the maximum scored was *98%* and the minimum scored was *50%*

## Candidate Degree Pie Chart
Now we look at what proportion of candidates hold each degree.

In [None]:
# Count the candidates per undergraduate degree type.
degree_type_counts = data['degree_t'].value_counts()

fig = px.pie(
    values=degree_type_counts.tolist(),
    names=list(degree_type_counts.index),
    title="Candidate's Degree Type Chart",
)
fig.show()

Here we can see that about *67%* of candidates have a Degree in **Commerce and Management**, while *27%* of Candidates hold a degree in **Science and Technology**. Remaining *5%* hold a degree in other disciplines. 

## Degree Percentage Distribution
Following the degree type chart, let's look at the percentage achieved by students for their respective degrees.

In [None]:
# Mean and standard deviation of the degree percentage for the title.
degree_mean = data['degree_p'].mean()
degree_std = data['degree_p'].std()

fig = px.histogram(
    data,
    x="degree_p",
    marginal="box",
    hover_data=data.columns,
    color_discrete_sequence=["green"],
    title=f"Attained Degree Percentage [\u03BC : ~{degree_mean:.2f}% | \u03C3 : ~{degree_std:.2f} %]",
)

fig.show()

From the above Distribution, we can note that only 1 student has more than *90%* in their degree. The average percentage score lies at *~67%* while the minimum score lies at *~50%*.

Let's also compare which degree type has more average percentage.

In [None]:
# Average degree percentage for the two largest degree types.
is_sci_tech = data['degree_t'] == 'Sci&Tech'
is_comm_mgmt = data['degree_t'] == 'Comm&Mgmt'

sci_avg_pcent = data.loc[is_sci_tech, 'degree_p'].mean()
com_avg_pcent = data.loc[is_comm_mgmt, 'degree_p'].mean()
print(f"Average Percentage for Science & Technology Students is: {sci_avg_pcent:.2f}% while the average percentage of Commerce & Management Students is: {com_avg_pcent:.2f}%")

## Higher Secondary Specialization Pie-chart
Now we move on to a pie chart of the Higher Secondary specialization type.

In [None]:
# Count the candidates per higher-secondary specialisation.
hsc_stream_counts = data['hsc_s'].value_counts()

fig = px.pie(
    values=hsc_stream_counts.tolist(),
    names=list(hsc_stream_counts.index),
    title="Higher Secondry Spec. Type Chart",
    color_discrete_sequence=["red", "blue", "green"],
)
fig.show()

## Higher Secondary Board Type

In [None]:
# Count the candidates per higher-secondary board of education.
hsc_board_counts = data['hsc_b'].value_counts()

fig = px.pie(
    values=hsc_board_counts.tolist(),
    names=list(hsc_board_counts.index),
    title="Higher Secondry Board Type Chart",
    color_discrete_sequence=["orange", "gold"],
)
fig.show()

## Higher Secondary Percentage
Finally, let's look at the students' percentages in the Higher Secondary exams.

In [None]:
# Higher-secondary (12th grade) marks are stored in `hsc_p`; `ssc_p` holds the
# secondary (10th grade) marks, so plotting it under this heading was wrong.
fig = px.histogram(
    data, x="hsc_p",
    marginal="box",
    hover_data=data.columns,
    color_discrete_sequence=["blue"],
    title=f"Higher Secondry Percentage [\u03BC : ~{data['hsc_p'].mean():.2f}% | \u03C3 : ~{data['hsc_p'].std():.2f} %]",
)

fig.show()

## Encode Categorical Data
Before I forget, let's encode categorical data so that the upcoming charts can be more understandable.

In [None]:
# Helpers used to turn categorical columns into integer codes

def get_category_names(df, column_name):
    """
    List the distinct values of a categorical column.

    Parameters
    ----------
    df : DataFrame holding the column.
    column_name : name of the (categorical) column to inspect.

    Returns
    -------
    tuple
        ``(n_categories, category_names)`` where ``category_names`` is a list
        in ``value_counts()`` order.
    """
    category_counts = df[column_name].value_counts()
    unique_names = list(category_counts.index)
    return (len(unique_names), unique_names)

def replace_small_categorical_data(df, column_name, categorical_names):
    """
    Integer-encode one categorical column and return the result as a new frame.

    Each name in ``categorical_names`` is replaced by its position in that
    list (first name -> 0, second -> 1, ...). Values that are not listed are
    left unchanged. The input frame is never modified.

    Parameters
    ----------
    df : DataFrame containing ``column_name``.
    column_name : column to encode.
    categorical_names : sequence of category values, in the order that
        defines their integer codes.

    Returns
    -------
    DataFrame
        A deep copy of ``df`` with ``column_name`` encoded.
    """
    code_by_name = {name: code for code, name in enumerate(categorical_names)}

    copy_frame = df.copy(deep=True)

    # Assign the encoded column back explicitly. The previous chained
    # `copy_frame[col].replace(..., inplace=True)` operates on a temporary
    # Series under pandas Copy-on-Write and leaves the frame un-encoded.
    copy_frame[column_name] = copy_frame[column_name].map(
        lambda value: code_by_name.get(value, value)
    )

    return copy_frame

In [None]:
# Categorical columns to convert into integer codes.
to_encode = ["gender", "ssc_b", "hsc_b", "hsc_s", "degree_t", "workex", "specialisation", "status"]

# Work on a copy so the original `data` keeps its readable labels.
encoded_data = data.copy(deep=True)

for col in to_encode:
    # Codes follow the value_counts() order returned by get_category_names.
    category_names = get_category_names(encoded_data, col)[1]
    encoded_data = replace_small_categorical_data(encoded_data, col, category_names)

In [None]:
# Preview the frame after the categorical columns were integer-encoded.
encoded_data.head()

In [None]:
# The original frame for comparison; encoding was done on a copy.
data.head()

## Pairplot
Let's draw a pair plot to see how the different features relate to each other.

In [None]:
# Grid of pairwise plots built from the un-encoded `data` frame.
sns.pairplot(data)

## Correlation Heatmap
Let's now see how all the features are correlated to each other.

In [None]:
# Pairwise correlations of the encoded columns, annotated with their values.
correlations = encoded_data.corr()

plt.figure(figsize=(15, 12))
sns.heatmap(correlations, annot=True)

## DABL Plot
Since we have visualized most of the columns, let's end this EDA session with a DABL plot.

First by setting `status` as target column

In [None]:
# Widen the default figure size; note this rcParams change is global and
# also affects any matplotlib figure created afterwards.
plt.rcParams['figure.figsize'] = (18, 6)
# Automatic dabl overview of the encoded features against the `status` target.
dabl.plot(encoded_data, target_col = 'status')

And then by setting `salary` as target column

In [None]:
# Same global figure size as for the previous dabl plot.
plt.rcParams['figure.figsize'] = (18, 6)
# Automatic dabl overview against `salary`. Unplaced candidates carry the
# imputed salary of 0 here, because the NaNs were filled before encoding.
dabl.plot(encoded_data, target_col = 'salary')

# Modelling

Before we start modelling, we have to normalise and split our encoded dataset.

In [None]:
# First, let's split the data: 10% of the rows are held out as the test set.
SEED = 42          # fixed seed so Restart & Run All reproduces the same split
split_pcent = 0.10

# Shuffle into a NEW frame instead of overwriting `encoded_data`: the cell can
# then be re-run safely (previously a second run failed because `salary` had
# already been dropped) and the EDA cells above keep working.
# `salary` is removed because it is a consequence of `status`, not a predictor.
model_data = (
    encoded_data
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
    .drop(columns=['salary'])
)
split = int(split_pcent * len(model_data))

test = model_data[:split]
train = model_data[split:]

# Separate the target (`status`) from the features for both splits.
trainY = train['status'].values
trainX = train.drop(['status'], axis=1)

testY = test['status'].values
testX = test.drop(['status'], axis=1)

In [None]:
# Standardise the features (subtract the mean, divide by the standard deviation).
# The statistics are taken from the training split only and reused for the
# test split: scaling the test set with its own mean/std would put the two
# splits on different scales and use test-set information in preprocessing.
feature_mean = trainX.mean()
feature_std = trainX.std()

trainX = (trainX - feature_mean) / feature_std
testX = (testX - feature_mean) / feature_std

Let's start the real Classification.
I am going to compare all the different classification techniques and then plot their corresponding testing accuracies.

In [None]:
# We are only using 11 Classifiers, you can use more if you wish.
# `names[i]` is the display label of `classifiers[i]`; keep both lists in the same order.
names = ["Logistic Regression", "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

# Estimators with internal randomness get a fixed `random_state` so that the
# comparison is reproducible from run to run.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

Due to the presence of data imbalance, I am using roc-auc score instead of normal accuracy.

In [None]:
# Fit every classifier and store its test ROC-AUC in a dictionary keyed by its name.

clf_results = {}

# `total` lets tqdm render a real progress bar (zip has no length).
for name, clf in tqdm(zip(names, classifiers), total=len(names)):
    # Fit on the training data
    clf.fit(trainX, trainY)

    # ROC-AUC measures how well a continuous score ranks the two classes.
    # Passing hard 0/1 predictions evaluates a single threshold only, so use
    # the probability of class 1 where available and fall back to the
    # decision function otherwise (e.g. SVC without probability=True).
    if hasattr(clf, "predict_proba"):
        scores = clf.predict_proba(testX)[:, 1]
    else:
        scores = clf.decision_function(testX)

    # Store the test ROC-AUC for this classifier
    clf_results[name] = roc_auc_score(testY, scores)

In [None]:
# Rank the models by test score, best first.
ranked = sorted(clf_results.items(), key=lambda item: item[1], reverse=True)
sort_clf = dict(ranked)

# Names and scores in the opposite order (worst first, best last), which is
# the order the bar chart consumes.
clf_names = [model_name for model_name, _ in reversed(ranked)]
clf_scores = [model_score for _, model_score in reversed(ranked)]

In [None]:
# Plot the per-model performance.
# `clf_names` / `clf_scores` are ordered worst -> best, so the last element is
# the best model. The score is rounded for the title; the raw float would
# otherwise be printed with all of its digits.
fig = px.bar(
    x=clf_scores,
    y=clf_names,
    color=clf_names,
    labels={'x':'Test ROC-AUC Score', 'y':'Models'},
    title=f"Model Performance [ Best Model: {clf_names[-1]} | Score: {clf_scores[-1]:.3f} ]"
)

fig.show()

In [None]:
# Model names ordered from lowest to highest test score.
clf_names

As we can see, the best models can cross the **0.90** ROC-AUC mark on the test set. Note that this is the ROC-AUC score used above, not plain accuracy.

## Thank you!

Please correct me if I’ve made any mistakes in EDA, modelling or maybe explaining some concept since I am a beginner and prone to making mistakes. 

Please consider giving this notebook an upvote as it helps me work harder and publish more quality notebooks.