**Importing libraries**

In [4]:
pip install imbalanced-learn

Collecting imbalanced-learn
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/a3/9e/fbe60a768502af54563dcb59ca7856f5a8833b3ad5ada658922e1ab09b7f/imbalanced_learn-0.11.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.11.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:


import pandas as pd
import numpy as np
import math

#Libraries for Modelling
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import tree
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from scipy.stats import randint
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier

from numpy import mean

#Libraries for Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

**Import CSV file in python and display data**

In [None]:
#mount google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
# Load the semicolon-delimited dataset from the local Data/ folder.
# (Colab alternative: 'drive/MyDrive/Colab Notebooks/data.csv' after mounting Drive.)
data = pd.read_csv(
    filepath_or_buffer='Data/data.csv',
    delimiter=';',
)
# Preview the first rows to confirm the file parsed into the expected columns.
data.head(n=5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


**Display target values and count**

In [3]:
# Number of rows per class in the label column.
data.loc[:, 'Target'].value_counts()

Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64

**Data processing**

In [4]:
# Tidy up awkward column names: fix the 'Nacionality' spelling, shorten the age
# column, and drop the stray trailing tab from the attendance column.
column_renames = {
    'Nacionality': 'Nationality',
    'Age at enrollment': 'Age',
    'Daytime/evening attendance\t': 'Daytime/evening attendance',
}
data.rename(columns=column_renames, inplace=True)
# Optional check: percentage of missing values per column.
#data.isnull().sum()/len(data)*100

**Dropping the columns which have very low correlation with the target (optional; currently disabled)**

In [None]:
#cor = data_cor.corr()['Target']

#drop columns whose absolute correlation with the target is below 0.3
#to_remove = np.array(cor[abs(cor) < 0.3].keys())
#to_remove = np.delete(to_remove, len(to_remove)-1)
#data = data.drop(columns=to_remove, axis=1)

#to_keep = cor.sort_values(ascending=False, key=abs).head(11).keys()
#data = data[np.intersect1d(data.columns, to_keep)]

**Divide the data into inputs (features) and output (target)**

In [5]:
# The label is kept as its own Series; the features are every other column.
target = data['Target']
inputs = data.drop(columns=['Target'])

**Split the data into training and testing data**

In [6]:
# Hold out 30% of the rows for testing. The split is stratified on the label and
# uses a fixed seed so it can be reproduced.
split_options = dict(test_size=0.3, random_state=365, stratify=target)
x_train, x_test, y_train, y_test = train_test_split(inputs, target, **split_options)

In [7]:
# The oversampling step used later relies on nearest neighbours, so the features
# are rescaled first. The scaler is fitted on the training split only and the
# same fitted scaler is then applied to the test split.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

**Encoding target labels**

In [8]:
# Map the string labels to integers. The encoder is fitted on the training labels
# and reused for the test labels so both splits share one mapping.
enc_t = LabelEncoder()
y_train_enc = enc_t.fit_transform(y_train)
y_test_enc = enc_t.transform(y_test)

# Report how many samples of each encoded class ended up in each split.
for split_label, encoded in (("Training data:", y_train_enc), ("Testing data:", y_test_enc)):
    unique, counts = np.unique(encoded, return_counts=True)
    print(split_label, dict(zip(unique, counts)))

# Show which integer each original class name was assigned.
print(dict(zip(enc_t.classes_, range(len(enc_t.classes_)))))

Training data: {0: 994, 1: 556, 2: 1546}
Testing data: {0: 427, 1: 238, 2: 663}
{'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}


**Define a helper that trains a Random Forest and reports its metrics**

In [18]:
def run_randomForests(x_train, x_test, y_train, y_test):
    """Fit a Random Forest on the training data and report test-set metrics.

    Prints the micro-averaged precision, recall and F1 score followed by the
    full per-class classification report.

    Parameters
    ----------
    x_train, y_train
        Features and labels the classifier is fitted on.
    x_test, y_test
        Features to predict on and the true labels to score against.

    Returns
    -------
    The predictions produced by the fitted classifier for ``x_test``.
    """
    # Earlier, heavier configuration kept for reference:
    #rf = RandomForestClassifier(n_estimators=290, random_state=365, max_depth=18, n_jobs=4, ccp_alpha=0.01)
    rf = RandomForestClassifier(random_state=365, ccp_alpha=0.001)
    rf.fit(x_train, y_train)

    y_pred = rf.predict(x_test)

    # With average="micro" each scorer already returns one aggregated number, so
    # no further averaging is needed (calling .mean() on it would also fail if a
    # plain Python float is returned). For single-label multiclass targets these
    # three micro-averaged values coincide with accuracy.
    micro_precision = precision_score(y_test, y_pred, average="micro")
    micro_recall = recall_score(y_test, y_pred, average="micro")
    micro_f1 = f1_score(y_test, y_pred, average="micro")

    print("Micro-average Precision: {:.2f}".format(micro_precision))
    print("Micro-average Recall: {:.2f}".format(micro_recall))
    print("Micro-average F1-Score: {:.2f}".format(micro_f1))
    print("======================================")
    print("=======CLASSIFICATION SUMMARY=========")
    print(classification_report(y_test, y_pred))
    return y_pred

**Apply Random Forest Classifier to the original data**

In [24]:
# Baseline: Random Forest on the original (imbalanced) training split.
run_randomForests(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train_enc,
    y_test=y_test_enc,
)

Micro-average Precision: 0.76
Micro-average Recall: 0.76
Micro-average F1-Score: 0.76
              precision    recall  f1-score   support

           0       0.78      0.77      0.78       427
           1       0.55      0.30      0.39       238
           2       0.78      0.91      0.84       663

    accuracy                           0.76      1328
   macro avg       0.70      0.66      0.67      1328
weighted avg       0.74      0.76      0.74      1328



array([2, 0, 0, ..., 0, 2, 0])

**Resample the training data with SMOTE and apply Random Forest to the resampled data**

In [25]:
# Oversample the training split only; the test split is left untouched so the
# evaluation still reflects the original class balance. The strategy can be
# changed as required ('auto' oversamples every class except the majority).
# `n_jobs` is deliberately not passed: imbalanced-learn deprecated it on SMOTE in
# 0.10 and it only affected parallelism, not the generated samples.
sm = SMOTE(sampling_strategy='auto', k_neighbors=20, random_state=365)

x_resampled, y_resampled_enc = sm.fit_resample(x_train, y_train_enc)

# Show the class counts after resampling so the effect of SMOTE is visible.
unique, counts = np.unique(y_resampled_enc, return_counts=True)
print("Resampled training data:", dict(zip(unique, counts)))

# run_randomForests returns the test-set predictions (not a report object).
y_pred_resampled = run_randomForests(x_resampled, x_test, y_resampled_enc, y_test_enc)
print(y_pred_resampled)

Micro-average Precision: 0.76
Micro-average Recall: 0.76
Micro-average F1-Score: 0.76
              precision    recall  f1-score   support

           0       0.81      0.76      0.79       427
           1       0.51      0.51      0.51       238
           2       0.82      0.86      0.84       663

    accuracy                           0.76      1328
   macro avg       0.71      0.71      0.71      1328
weighted avg       0.76      0.76      0.76      1328

[2 0 0 ... 0 2 0]


**Applying Bagging Classifier**

In [26]:
# Bagging ensemble with its default settings, trained on the SMOTE-resampled data.
# A fixed seed makes the bootstrap draws, and therefore the report, repeatable.
bagging_clf = BaggingClassifier(random_state=365)
bagging_clf.fit(x_resampled, y_resampled_enc)
y_pred = bagging_clf.predict(x_test)
print(classification_report(y_test_enc, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.76      0.76       427
           1       0.43      0.43      0.43       238
           2       0.82      0.83      0.82       663

    accuracy                           0.73      1328
   macro avg       0.67      0.67      0.67      1328
weighted avg       0.73      0.73      0.73      1328



In [27]:
# Balanced Random Forest trained on the SMOTE-resampled data, seeded so that the
# report is repeatable across runs.
# NOTE(review): this classifier does its own class balancing, while the data passed
# here has already been oversampled by SMOTE — consider fitting on
# (x_train, y_train_enc) instead to compare the two balancing approaches.
brfc = BalancedRandomForestClassifier(random_state=365)
brfc.fit(x_resampled, y_resampled_enc)
y_pred = brfc.predict(x_test)
print(classification_report(y_test_enc, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.76      0.79       427
           1       0.51      0.46      0.48       238
           2       0.81      0.88      0.84       663

    accuracy                           0.77      1328
   macro avg       0.71      0.70      0.71      1328
weighted avg       0.76      0.77      0.76      1328



**Hyperparameter Tuning for Random Forests**

In [22]:
def run_hyperparameters(rand_search, x_test, y_test):
  """Print a summary of a fitted hyperparameter search and its test-set report.

  Reads ``best_estimator_``, ``best_params_`` and ``cv_results_`` from
  ``rand_search`` and calls its ``predict`` method, so the object must already
  be fitted (presumably a RandomizedSearchCV / GridSearchCV instance — confirm
  against the caller).

  Parameters
  ----------
  rand_search
      Fitted search object to summarise.
  x_test
      Features passed to ``rand_search.predict``.
  y_test
      True labels compared against the predictions in the classification report.
  """
  # NOTE(review): best_rf is not used in the visible part of this function.
  best_rf = rand_search.best_estimator_
  print('Best parameters set found on development set: ')
  print(rand_search.best_params_)
  print()

  # One line per evaluated parameter set: mean CV score, +/- two standard deviations.
  means = rand_search.cv_results_['mean_test_score']
  stds = rand_search.cv_results_['std_test_score']
  # The loop variable `mean` shadows the `mean` imported from numpy, inside this function only.
  for mean, std, params in zip(means, stds, rand_search.cv_results_['params']):
    print('%0.3f (+/-%0.03f) for %r' % (mean, std * 2, params))

  print()
  print('Detailed classification report:')
  print()
  print('The model is trained on the full development set.')
  print('The scores are computed on the full evaluation set.')
  print()
  # Predict through the search object itself and score against the true labels.
  y_true, y_pred = y_test, rand_search.predict(x_test)
  print(classification_report(y_true, y_pred))
  print()