In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# all the necessary libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.model_selection as model_selection
import sklearn.preprocessing as pre

from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification

#models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 1. Importing Data

In [3]:
main_df = pd.read_csv(r'/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv', index_col='sl_no')
print(f'Shape : {main_df.shape}')
main_df.head()

In [4]:
# to have a look at the general overview of the entire data
main_df.info()

In [5]:
# to get the exact number of missing values in the column.
main_df.isnull().sum()

In [6]:
# count how many students fall into each placement status
main_df['status'].value_counts()

In [7]:
# Do all the rows with missing values have a status of Not Placed?

status_df = main_df[main_df['status']=='Not Placed']
print('Shape of the dataframe:',status_df.shape, sep ='\n', end ='\n\n')
print('Number of rows with status as Not Placed:',status_df['status'].value_counts(), sep ='\n', end ='\n\n')
print('Number of rows with salary as zero:',status_df['salary'].isnull().sum(), sep ='\n', end ='\n\n')

### Insights:
- We see that all rows with status 'Not Placed' have a salary of NaN, which is logical.
- **Recommended change:**
    - Drop the salary column, because with it we could predict the placement status with a simple if statement.
> If the value is not NaN then the person is placed; otherwise they are not.
In real life we won't have such a column telling us these things.

In [8]:
# segregating all the numeric and categorical columns
num_cols = list(main_df.select_dtypes(exclude=['object']).columns)
cat_cols = list(main_df.select_dtypes(include=['object']).columns)
print('all the numeric cols: ',num_cols, end ='\n\n')
print('all the categorical cols:',cat_cols, end ='\n\n')

In [9]:
main_df[num_cols].describe()

In [10]:
main_df[cat_cols].describe()

# 2. Data Visualisation

In [11]:
sns.pairplot(main_df[num_cols])

#### Insights:
- ***hsc_p*** and ***ssc_p*** have a high correlation: ***0.51***
- ***degree_p*** and ***ssc_p*** have a high correlation: ***0.54***
- ***hsc_p*** and ***degree_p*** have a high correlation: ***0.43***
- ***mba_p*** and ***ssc_p*** have a high correlation: ***0.39***

> Removing ***hsc_p***, ***degree_p*** and ***mba_p*** and keeping ***ssc_p***, would be a good choice as ***ssc_p is correlated with all 3 of the other columns.***

In [12]:
#Change the figure size
plt.figure(figsize=[20, 20])

for i in range(len(cat_cols)):
    plt.subplot(4, 4, i+1)
    if(i<7):
        sns.countplot(x = main_df[cat_cols[i]], hue = main_df['status'])
    else:
        sns.countplot(x = main_df[cat_cols[i]])
plt.show()

In [13]:
#Change the figure size
plt.figure(figsize=[20, 20])

for i in range(len(num_cols)):
    plt.subplot(3, 3, i+1)
    sns.violinplot(x = main_df[num_cols[i]], hue = main_df['status'])
plt.show()

### Insights:
- All the columns seem to be normally distributed except the salary column.
- No outlier treatment is required, as we are going to drop the salary column anyway.

# 3. Data Transformation

In [14]:
transform_df_1 = main_df.drop(['salary'], axis=1)
transform_df_1.head()

## 3.1 Numeric to Categorical

In [15]:
num_cols.remove('salary')
num_cols

In [16]:
transform_df_1[num_cols].describe()

### Insights:
***Ranges***
- ssc_p <code>[40.89 to 89.40]</code>
- hsc_p <code>[37 to 97.7]</code>
- degree_p <code>[50 to 91]</code>
- etest_p <code>[50 to 98]</code>
- mba_p <code>[51.21 to 77.89]</code>

In [17]:
def get_bins(df, col_name, bin_num):
    """Return equal-width bin edges spanning the observed range of a column.

    Parameters
    ----------
    df : DataFrame holding the column.
    col_name : name of the column whose min and max bound the bins.
    bin_num : number of bins wanted.

    Returns
    -------
    numpy.ndarray of `bin_num + 1` evenly spaced edges, from the column
    minimum to the column maximum (both included).
    """
    column = df[col_name]
    return np.linspace(column.min(), column.max(), bin_num + 1)

# example :
print(get_bins(transform_df_1, 'ssc_p', 3))

In [18]:
for i in (num_cols):
    transform_df_1[i] = pd.cut(transform_df_1[i], bins=get_bins(transform_df_1, i, 2), labels=[0,1], include_lowest=True)
    
print(transform_df_1[num_cols].info())
transform_df_1[num_cols].head()

In [19]:
# converting the categorical columns to integer columns
for i in num_cols:
    transform_df_1[i] = transform_df_1[i].astype(int)
transform_df_1[num_cols].info()

In [20]:
# Count plot of every binned numeric column, split by placement status.
# The former `if(i<7)` guard was carried over from the categorical plots; with
# the five columns left in num_cols its else-branch could never run, so every
# column is simply plotted with the status hue.
plt.figure(figsize=[20, 20])

for position, col in enumerate(num_cols, start=1):
    plt.subplot(3, 3, position)
    sns.countplot(x = transform_df_1[col], hue = transform_df_1['status'])
plt.show()

### Homework:
- make insights for the above graph.

## 3.2 Label Encode the Categorical Columns

In [21]:
# converting the object type values into numeric
labelencoder = pre.LabelEncoder()
for i in cat_cols:
    transform_df_1[i] = labelencoder.fit_transform(transform_df_1[i])
transform_df_1[cat_cols].head()

## 3.3 Feature Selection
ref : 

In [22]:
plt.figure(figsize=[20, 5])
sns.heatmap(transform_df_1.corr(), annot=True)

In [23]:
# from sklearn.feature_selection import mutual_info_classif
# Estimated mutual information between each feature and `status`.
# The estimator is randomised, so a fixed seed keeps the scores (and the
# column selection derived from them) identical across Restart & Run All.
mutual_info = mutual_info_classif(transform_df_1.drop('status', axis =1),
                                  transform_df_1['status'],
                                  random_state=42)
mutual_info

In [24]:
mutual_info = pd.Series(mutual_info)
mutual_info.index = transform_df_1.drop('status', axis=1).columns
mutual_info.sort_values(ascending=False)

## Insights:
- In this case we can take the columns that provide some info and drop the ones that don't
- Generally we take the top n columns, where n is the number of columns that the user wants to take into consideration. <code>(Generally 5 or 10)</code>

In [25]:
# Select the columns that have non-zero mutual information with `status`.
# A boolean mask is used instead of `mutual_info[i]`: the Series is indexed by
# column name, so integer keys only work through pandas' deprecated positional
# fallback. The mask keeps the original column order.
sel_cols = mutual_info[mutual_info > 0].index.tolist()
# the target must travel with the selected features
sel_cols.append('status')
print(sel_cols)

In [26]:
transform_df_2 = transform_df_1[sel_cols]
transform_df_2.head()

In [27]:
# from sklearn.feature_selection import SelectKBest
transform_df_3 = transform_df_1
sel_top5 = SelectKBest(mutual_info_classif, k=5)
sel_top5.fit(transform_df_3.drop('status', axis = 1), transform_df_3['status'])

# to see which are the top5 columns:
list(transform_df_3.drop('status', axis=1).columns[sel_top5.get_support()])

In [28]:
# Build transform_df_3 from the five features kept by the fitted selector.
# The column names are derived from transform_df_1 and stored under a new name,
# so `sel_top5` stays the fitted SelectKBest object and this cell can be
# re-run without failing.
top5_cols = list(transform_df_1.drop('status', axis=1).columns[sel_top5.get_support()])
transform_df_3 = transform_df_1[top5_cols + ['status']]
transform_df_3.head()

**NOTE**
- transform_df_1 : all the columns
- transform_df_2 : the columns with some mutual information
- transform_df_3 : the top 5 columns with mutual information

# 4. Model Selection

## 4.1 Logistic Regression

#### transform_df_1

In [29]:
# from sklearn.linear_model import LogisticRegression
log_r = LogisticRegression()
X_train, X_test, y_train, y_test = model_selection.train_test_split(transform_df_1.drop('status', axis =1), 
                                                                    transform_df_1['status'],
                                                                    test_size=0.2, random_state=42)
log_r.fit(X_train, y_train)
y_pred = log_r.predict(X_test)

# from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test , y_pred))

# accuracy of the model
print('Score for transform_df_1: {:.4f}'.format(accuracy_score(y_test , y_pred)))
print('Score: {:.4f}'.format(log_r.score(X_test,y_test)))

In [30]:
sns.heatmap(confusion_matrix(y_test , y_pred), annot=True)

#### transform_df_2

In [31]:
# Logistic regression on transform_df_2 (features with non-zero mutual information).
log_r = LogisticRegression()
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    transform_df_2.drop('status', axis =1),
    transform_df_2['status'],
    test_size=0.25,
    random_state=42,
)
log_r.fit(X_train, y_train)
y_pred = log_r.predict(X_test)

# the label names the frame that was actually evaluated
print('Score for transform_df_2: {:.4f}'.format(accuracy_score(y_test , y_pred)))
sns.heatmap(confusion_matrix(y_test , y_pred), annot=True)

#### transform_df_3

In [32]:
# Logistic regression on transform_df_3 (the five features kept by SelectKBest).
# This cell previously split transform_df_2 again, so it just repeated the
# previous result instead of evaluating the top-5 feature set.
log_r = LogisticRegression()
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    transform_df_3.drop('status', axis =1),
    transform_df_3['status'],
    test_size=0.25,
    random_state=42,
)
log_r.fit(X_train, y_train)
y_pred = log_r.predict(X_test)

# the label names the frame that was actually evaluated
print('Score for transform_df_3: {:.4f}'.format(accuracy_score(y_test , y_pred)))
sns.heatmap(confusion_matrix(y_test , y_pred), annot=True)

In [33]:
# how to get the TP, FP, FN, TN values from the confusion matrix.
# scikit-learn lays the matrix out with rows = actual class and
# columns = predicted class, labels sorted ascending. With 0/1 labels and
# 1 as the positive class that gives:
#   [[TN, FP],
#    [FN, TP]]
a = confusion_matrix(y_test , y_pred)
print(a)
print("TP: ",a[1,1])
print("FN: ",a[1,0])
print("FP: ",a[0,1])
print("TN: ",a[0,0])

### Home Work:
- find the precision,recall, f1-score for the above Confusion Matrix. 
- find the average accuracy for log_reg using the <code>average_accuracy()</code> method.

In [34]:
# code goes here.


In [35]:
# Find the accuracy of the models using the average_accuracy method:


## 4.2 KNN

#### transform_df_1

In [36]:
# from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5,algorithm='brute')

In [37]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(transform_df_1.drop('status', axis =1), 
                                                                    transform_df_1['status'], test_size=0.25, 
                                                                    random_state=42)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [38]:
# from sklearn.metrics import classification_report
# from sklearn.metrics import accuracy_score
print("For classification report:")
print(classification_report(y_test , y_pred))
accuracy_score(y_test , y_pred)

#### transform_df_2

In [39]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(transform_df_2.drop('status', axis =1), 
                                                                    transform_df_2['status'], test_size=0.25, 
                                                                    random_state=42)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("For classification report:")
print(classification_report(y_test , y_pred))
accuracy_score(y_test , y_pred)

#### transform_df_3

In [40]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(transform_df_3.drop('status', axis =1), 
                                                                    transform_df_3['status'], test_size=0.25, 
                                                                    random_state=42)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("For classification report:")
print(classification_report(y_test , y_pred))
accuracy_score(y_test , y_pred)

### Insights: (Home work)
- Given the same random state for the split and the same model being fit:
    - transform_df_1 gives us as an accuracy of **81%** <code>f1-score: 0 -> 64 | 1 -> 88</code>
    - transform_df_2 gives us as an accuracy of **__%** <code>f1-score: 0 -> __ | 1 -> __</code>
    - transform_df_3 gives us as an accuracy of **__%** <code>f1-score: 0 -> __ | 1 -> __</code>
- Use Stratified K Fold to have a better understanding of the same.

### Model Parameter Tuning

In [41]:
# from sklearn.model_selection import GridSearchCV
param = dict(n_neighbors=list(range(5,57,2)), algorithm=list(['brute','auto']))
print(param)

In [42]:
grid = GridSearchCV(knn, param, cv=10, scoring='f1')
X = transform_df_1.drop('status',axis=1)
y = transform_df_1['status']
# fit the grid with data
grid.fit(X, y)

In [43]:
# examine the best model

# Single best score achieved across all params (k)
print(grid.best_score_)

# Dictionary containing the parameters (k) used to generate that score
print(grid.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid.best_estimator_)

In [44]:
# from sklearn.model_selection import StratifiedKFold
def average_accuracy(df, model):
    """Mean accuracy of `model` over a shuffled 10-fold stratified split of `df`.

    Parameters
    ----------
    df : DataFrame containing the feature columns plus a 'status' target column.
    model : estimator exposing `fit(X, y)` and `score(X, y)`; it is refit on
        every fold, so it is left fitted on the last training fold.

    Returns
    -------
    The arithmetic mean of the per-fold `model.score` values.
    """
    features = df.drop('status', axis = 1)
    target = df['status']
    # fixed seed so every model is evaluated on the same folds
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)

    fold_scores = []
    for train_idx, test_idx in splitter.split(features, target):
        model.fit(features.iloc[train_idx], target.iloc[train_idx])
        fold_scores.append(model.score(features.iloc[test_idx], target.iloc[test_idx]))

    return sum(fold_scores) / len(fold_scores)

knn = KNeighborsClassifier(n_neighbors=43)
print('transform_df_1: {:.4f}'.format(average_accuracy(transform_df_1,knn)))
print('transform_df_2: {:.4f}'.format(average_accuracy(transform_df_2,knn)))
print('transform_df_3: {:.4f}'.format(average_accuracy(transform_df_3,knn)))

## 4.3 Naive Bayes

In [45]:
# from sklearn.naive_bayes import GaussianNB
X = transform_df_1.drop('status', axis=1)
y = transform_df_1['status']
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.2,random_state=9) #Split the dataset

nv = GaussianNB() # create a classifier

nv.fit(X_train,y_train) # fitting the data

y_pred = nv.predict(X_test) # store the prediction data

print(classification_report(y_test , y_pred))

In [46]:
X = transform_df_2.drop('status', axis=1)
y = transform_df_2['status']
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.2,random_state=9) #Split the dataset

nv = GaussianNB() # create a classifier

nv.fit(X_train,y_train) # fitting the data

y_pred = nv.predict(X_test) # store the prediction data

print(classification_report(y_test , y_pred))

In [47]:
X = transform_df_3.drop('status', axis=1)
y = transform_df_3['status']
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.2,random_state=9) #Split the dataset

nv = GaussianNB() # create a classifier

nv.fit(X_train,y_train) # fitting the data

y_pred = nv.predict(X_test) # store the prediction data

print(classification_report(y_test , y_pred))

In [48]:
nv = GaussianNB()
print('transform_df_1: {:.4f}'.format(average_accuracy(transform_df_1,nv)))
print('transform_df_2: {:.4f}'.format(average_accuracy(transform_df_2,nv)))
print('transform_df_3: {:.4f}'.format(average_accuracy(transform_df_3,nv)))

## 4.4 SVM

In [63]:
# from sklearn.svm import SVC
svm = SVC(kernel='linear',class_weight={0:5, 1:5})

X = transform_df_1.drop('status', axis=1)
y = transform_df_1['status']
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.2,random_state=42) #Split the dataset

svm.fit(X_train,y_train)
y_pred = svm.predict(X_test) # store the prediction data

print(classification_report(y_test , y_pred))

In [64]:
param = {'kernel' : [ 'linear', 'poly', 'rbf', 'sigmoid'],
         'class_weight' : [{0:6, 1:1},{0:5, 1:1},{0:4, 1:1},{0:3, 1:1},{0:2, 1:1}]}

model = SVC()
search = GridSearchCV(model, param, cv=10, scoring='f1')

In [65]:
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [66]:
svm = SVC(class_weight = {0: 2, 1: 1}, kernel = 'linear')
print('transform_df_1: {:.4f}'.format(average_accuracy(transform_df_1,svm)))
print('transform_df_2: {:.4f}'.format(average_accuracy(transform_df_2,svm)))
print('transform_df_3: {:.4f}'.format(average_accuracy(transform_df_3,svm)))

## 4.5 Decision Trees

In [67]:
# Hold-out evaluation of a decision tree on transform_df_3.
X = transform_df_3.drop('status', axis=1)
y = transform_df_3['status']
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.2,random_state=42) #Split the dataset

# from sklearn.tree import DecisionTreeClassifier
# seeded so the report is reproducible across runs
dtree = DecisionTreeClassifier(random_state=42)
# Fit on the training split only: fitting on the full X, y lets the tree see
# the test rows and inflates every metric in the report below.
dtree = dtree.fit(X_train, y_train)

y_pred = dtree.predict(X_test) # store the prediction data

print(classification_report(y_test , y_pred))

In [68]:
%%time
param = {'criterion' : ["gini", "entropy"],
         'max_depth' : [5,10,15], 
         'class_weight' : [{0:6, 1:1},{0:5, 1:1},{0:4, 1:1},{0:3, 1:1},{0:2, 1:1}], 
         'min_samples_leaf' :[1,2,3]}

model = DecisionTreeClassifier(random_state=42)
search = GridSearchCV(model, param, cv=10, scoring='f1')

# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [69]:
dtree = DecisionTreeClassifier(class_weight = {0: 2, 1: 1}, criterion = 'gini',
                               max_depth = 5, min_samples_leaf = 1)
print('transform_df_1: {:.4f}'.format(average_accuracy(transform_df_1,dtree)))
print('transform_df_2: {:.4f}'.format(average_accuracy(transform_df_2,dtree)))
print('transform_df_3: {:.4f}'.format(average_accuracy(transform_df_3,dtree)))

In [91]:
# Testing for overfitting:
# compare accuracy on the rows the tree was trained on with accuracy on
# held-out rows; a large gap indicates overfitting.
X = transform_df_3.drop('status', axis=1)
y = transform_df_3['status']
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.2,random_state=42) #Split the dataset

# from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(class_weight = {0: 2, 1: 1}, criterion = 'gini',
                               max_depth = 3, min_samples_leaf = 1)
# Fit on the training split only. If the tree is fit on all of X, y the test
# rows are part of the training data and the comparison says nothing.
dtree = dtree.fit(X_train, y_train)

print('Training Accuraccy: {:.4f}'.format(dtree.score(X_train, y_train)))
print('Testing Accuraccy: {:.4f}'.format(dtree.score(X_test, y_test)))

# 5. Oversampling and Undersampling

## 5.1 Over Sampling

In [96]:
# from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import RandomUnderSampler
# from sklearn.datasets import make_classification

# Oversampling the minority class of transform_df_1.
# The data is converted to NumPy arrays because KFold_accuracy selects rows
# with X[idx] / y[idx]. The earlier make_classification call is removed: it
# replaced the placement data with a synthetic dataset, so the resampling and
# the scores that followed had nothing to do with transform_df_1.
X = transform_df_1.drop('status', axis =1).to_numpy()
y = transform_df_1['status'].to_numpy()

print('Before sampling:',pd.Series(y).value_counts(),end='\n\n',sep='\n')

# define oversampling strategy (seeded so the duplicated rows are reproducible)
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

# fit and apply the transform
# NOTE(review): resampling before cross-validation lets copies of the same row
# land in both the training and the test fold, so the scores are optimistic.
X_over, y_over = oversample.fit_resample(X, y)

print('After sampling:',pd.Series(y_over).value_counts(),sep='\n')

In [97]:
def _select_rows(data, positions):
    """Return the rows of `data` at integer `positions` (pandas or array-like)."""
    if hasattr(data, 'iloc'):
        # pandas objects: plain [] would look the integers up as labels/columns
        return data.iloc[positions]
    return np.asarray(data)[positions]


def KFold_accuracy(X,y,model):
    """Mean accuracy of `model` over a shuffled 10-fold stratified split.

    Parameters
    ----------
    X : feature matrix; a NumPy array (as before) or a pandas DataFrame.
    y : target vector; a NumPy array (as before) or a pandas Series.
    model : estimator exposing `fit(X, y)` and `score(X, y)`; it is refit on
        every fold, so it is left fitted on the last training fold.

    Returns
    -------
    The arithmetic mean of the per-fold `model.score` values.
    """
    # same splitter settings as average_accuracy so results are comparable
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)
    all_accuracy = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = _select_rows(X, train_index), _select_rows(X, test_index)
        y_train, y_test = _select_rows(y, train_index), _select_rows(y, test_index)

        model.fit(X_train, y_train)
        all_accuracy.append(model.score(X_test, y_test))
    return sum(all_accuracy)/len(all_accuracy)

In [98]:
# Logistic Regression:
log_r = LogisticRegression()
print('Logistic Regression accuracy:{:.4f}'.format(KFold_accuracy(X_over,y_over, log_r)))

# KNN:
knn = KNeighborsClassifier(n_neighbors=43)
print('KNN accuracy:{:.4f}'.format(KFold_accuracy(X_over,y_over, knn)))

# Naive Bayes:
nv = GaussianNB()
print('Naive Bayes accuracy:{:.4f}'.format(KFold_accuracy(X_over,y_over, nv)))

# SVM:
svm = SVC(class_weight = {0: 2, 1: 1}, kernel = 'rbf')
print('SVM accuracy:{:.4f}'.format(KFold_accuracy(X_over,y_over,svm)))

# Decision Tree:
dtree = DecisionTreeClassifier(class_weight = {0: 2, 1: 1}, criterion = 'gini',
                               max_depth = 5, min_samples_leaf = 1)

print('Decision Tree accuracy: {:.4f}'.format(KFold_accuracy(X_over,y_over, dtree)))

## 5.2 Under Sampling:

In [113]:
# Undersampling the majority class of transform_df_1.
# The data is converted to NumPy arrays because KFold_accuracy selects rows
# with X[idx] / y[idx]. The earlier make_classification call is removed: it
# replaced the placement data with a synthetic dataset, so the resampling and
# the scores that followed had nothing to do with transform_df_1.
X = transform_df_1.drop('status', axis =1).to_numpy()
y = transform_df_1['status'].to_numpy()

print('Before sampling:',X.shape,end='\n\n',sep='\n')

# define undersample strategy (seeded so the dropped rows are reproducible)
undersampling = RandomUnderSampler(sampling_strategy='majority', random_state=42)

# fit and apply the transform
X_under, y_under = undersampling.fit_resample(X, y)

# report the undersampled shape (this used to print X_over from the previous section)
print('After sampling:',X_under.shape,sep='\n')

In [110]:
# Logistic Regression:
log_r = LogisticRegression()
print('Logistic Regression accuracy:{:.4f}'.format(KFold_accuracy(X_under,y_under, log_r)))

# KNN:
knn = KNeighborsClassifier(n_neighbors=43)
print('KNN accuracy:{:.4f}'.format(KFold_accuracy(X_under,y_under, knn)))

# Naive Bayes:
nv = GaussianNB()
print('Naive Bayes accuracy:{:.4f}'.format(KFold_accuracy(X_under,y_under, nv)))

# SVM:
svm = SVC(class_weight = {0: 2, 1: 1}, kernel = 'rbf')
print('SVM accuracy:{:.4f}'.format(KFold_accuracy(X_under,y_under,svm)))

# Decision Tree:
dtree = DecisionTreeClassifier(class_weight = {0: 2, 1: 1}, criterion = 'gini',
                               max_depth = 5, min_samples_leaf = 1)

print('Decision Tree accuracy: {:.4f}'.format(KFold_accuracy(X_under,y_under, dtree)))