### 1. Data Overview

#### 1.1 Import the necessary libraries and read the dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [5,3]

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC

import optuna

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [2]:
df_train= pd.read_csv('train.csv', index_col = 'id')
df_test = pd.read_csv('test.csv')

In [3]:
df_train.head()

Unnamed: 0_level_0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [4]:
df_test.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,76518,1,1,1,9500,1,1,141.0,1,3,...,0,0,8,0,0,0.0,0,13.9,-0.3,0.79
1,76519,1,1,1,9238,1,1,128.0,1,1,...,0,0,6,6,6,13.5,0,11.1,0.6,2.02
2,76520,1,1,1,9238,1,1,118.0,1,1,...,0,0,6,11,5,11.0,0,15.5,2.8,-4.06
3,76521,1,44,1,9147,1,39,130.0,1,1,...,0,3,8,14,5,11.0,0,8.9,1.4,3.51
4,76522,1,39,1,9670,1,1,110.0,1,1,...,0,0,6,9,4,10.666667,2,7.6,2.6,0.32


In [5]:
df_train.shape

(76518, 37)

In [6]:
#show entire dataframe
pd.set_option('display.max_columns', None)

df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Marital status,76518.0,1.111934,0.441669,1.0,1.0,1.0,1.0,6.0
Application mode,76518.0,16.054419,16.682337,1.0,1.0,17.0,39.0,53.0
Application order,76518.0,1.64441,1.229645,0.0,1.0,1.0,2.0,9.0
Course,76518.0,9001.286377,1803.438531,33.0,9119.0,9254.0,9670.0,9991.0
Daytime/evening attendance,76518.0,0.915314,0.278416,0.0,1.0,1.0,1.0,1.0
Previous qualification,76518.0,3.65876,8.623774,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),76518.0,132.378766,10.995328,95.0,125.0,133.1,140.0,190.0
Nacionality,76518.0,1.2266,3.392183,1.0,1.0,1.0,1.0,109.0
Mother's qualification,76518.0,19.837633,15.399456,1.0,1.0,19.0,37.0,44.0
Father's qualification,76518.0,23.425076,14.921164,1.0,4.0,19.0,37.0,44.0


In [7]:
df_train['Target'].value_counts()
# The Dataset is imbalanced

Graduate    36282
Dropout     25296
Enrolled    14940
Name: Target, dtype: int64

#### 1.2 Check null and duplicate values

In [None]:
df_train.isna().sum()
# No null values present

In [None]:
df_train.duplicated().sum()
# No duplicate values presenta

### 2. EDA 

#### 2.1 Seperate categorical and continuous features and explore the feature w.r.t. labels

In [8]:
feature_list = [feature for feature in df_train.columns if not feature  == "Target"]
categorical_features = ['Scholarship holder','International','Gender','Tuition fees up to date','Daytime/evening attendance','Debtor','Educational special needs','Displaced']
target = "Target"
continuous_features = list(set(feature_list) - set(categorical_features))

In [None]:
num_plots = len(continuous_features)

num_cols = 2
num_rows = (num_plots + num_cols - 1) // num_cols  # Ceiling division to get the number of rows

# Create the subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
axes = axes.flatten()

# Plot each boxplot
for i, feature in enumerate(continuous_features):
    sns.boxplot(data=df_train, x='Target', y=feature, ax=axes[i])

plt.tight_layout()
plt.show()


#### 2.2 Doing the same for continuous variables

In [None]:
# Create the subplots
fig, axes = plt.subplots(4, 2, figsize=(15,20))
axes = axes.flatten()

# Plot each boxplot
for i, feature in enumerate(categorical_features):
    sns.barplot(data=df_train, x='Target', y=feature, ax=axes[i])
    plt.title(f'Barplot of {feature} vs Target')
plt.tight_layout()
plt.show()


#### 2.3 Distribution of the target variable

In [None]:
#pie chart of target
plt.figure(figsize=(7,5))
plt.title('Distribution of the target variable')
plt.pie(df_train.Target.value_counts(), labels = df_train.Target.value_counts().index, explode = [0.1, 0.1, 0.1], autopct='%1.1f%%', shadow = True, startangle=450)
plt.show()

# Concludes that the dataset is imbalanced and is biased towards the target variable 'Graduate'

#### 2.4 Observe the correlation

In [None]:
plt.figure(figsize=(30, 20))
sns.heatmap((df_train.drop(columns = ['Target']).corr()), annot=True, cmap = 'coolwarm')

### 3. Preprocessing

In [9]:
X = df_train.drop(columns = ['Target'])
y = df_train['Target']

le = LabelEncoder()
y = le.fit_transform(df_train['Target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
le = LabelEncoder()

# Encode the 'Target' column
df_train['Encoded_Target'] = le.fit_transform(df_train['Target'])

# Decode the encoded 'Target' column
decoded = le.inverse_transform(df_train['Encoded_Target'])
decoded

In [10]:
scaler = StandardScaler()
minmax = MinMaxScaler()

In [29]:
X_train_scaled_std = scaler.fit_transform(X_train)
X_test_scaled_std = scaler.fit_transform(X_test)


'''
do not touch df_test lol its basic
df_test_scaled_std = scaler.fit_transform(df_test.drop(columns=['id']))
df_test_scaled_minmax = minmax.fit_transform(df_test.drop(columns=['id']))

'''
X_train_scaled_minmax = minmax.fit_transform(X_train)
X_test_scaled_minmax = minmax.fit_transform(X_test)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

### 4. Modelling

#### 4.1 Boosting algorithms work best on this data

In [12]:
cat = CatBoostClassifier(verbose=0)
lgbm = LGBMClassifier(verbose=0)
xgb = XGBClassifier(verbose=0)
rfc = RandomForestClassifier()

In [None]:
models = [cat, lgbm, xgb, rfc]
for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print('*'*100)
    print(model.__class__.__name__)
    print('*'*100)
    print(classification_report(y_test, preds))
    print(accuracy_score(y_test, preds))
    print(confusion_matrix(y_test, preds),'\n\n\n')

#### 4.2 Ensemble LGBM, XGB

In [None]:
xgb.fit(X_train_scaled_std, y_train)
lgbm.fit(X_train_scaled_std, y_train)

xgb_pred = xgb.predict(X_test_scaled_std)
lgbm_pred = lgbm.predict(X_test_scaled_std)


print(f'accuracy of xgb : {accuracy_score(y_test, xgb_pred)}')
print(f'accuracy of lgbm : {accuracy_score(y_test, lgbm_pred)}')

print(f'confusion matrix of xgb : \n {confusion_matrix(y_test, xgb_pred)}')
print(f'confusion matrix of lgbm : \n {confusion_matrix(y_test, lgbm_pred)}')

print(classification_report(y_test, xgb_pred), '\n\n')
print(classification_report(y_test, lgbm_pred), '\n\n')

In [None]:
xgb.fit(X_train_scaled_minmax, y_train)
lgbm.fit(X_train_scaled_minmax, y_train)

xgb_pred = xgb.predict(X_test_scaled_minmax)
lgbm_pred = lgbm.predict(X_test_scaled_minmax)


print(f'accuracy of xgb : {accuracy_score(y_test, xgb_pred)}')
print(f'accuracy of lgbm : {accuracy_score(y_test, lgbm_pred)}')

print(f'confusion matrix of xgb : \n {confusion_matrix(y_test, xgb_pred)}')
print(f'confusion matrix of lgbm : \n {confusion_matrix(y_test, lgbm_pred)}')

print(classification_report(y_test, xgb_pred), '\n\n')
print(classification_report(y_test, lgbm_pred), '\n\n')

In [13]:
xgb_best_parameters= {'n_estimators': 278,
                      'max_depth': 3,
                      'learning_rate': 0.1730820977365898,
                      'feature_fraction': 0.8606026706098876,
                      'lambda_l1': 0.005758141578768008,
                      'lambda_l2': 0.020373882463946363,
                      'num_leaves': 102,
                      'min_child_samples': 46,
                      'max_bin': 127,
                      'top_rate': 0.3620721341171365
                      }

lgbm_best_parameters= {'n_estimators': 113,
                       'max_depth': 8,
                       'learning_rate': 0.21206573575973744,
                       'feature_fraction': 0.2404042525935703,
                       'lambda_l1': 8.158503349053043e-05,
                       'lambda_l2': 9.306156003557286,
                       'num_leaves': 13,
                       'min_child_samples': 68,
                       'max_bin': 154,
                       'top_rate': 0.7200788979249051
                       }

cat_best_parameters= {'n_estimators': 627,
                      'max_depth': 5,
                      'learning_rate': 0.1590386332299466,
                      'random_state' : 42}

cat = CatBoostClassifier(**cat_best_parameters)
xgb = XGBClassifier(**xgb_best_parameters)
lgbm = LGBMClassifier(**lgbm_best_parameters)

In [14]:
vc = VotingClassifier(estimators=[('xgb', xgb), ('lgbm', lgbm)], voting='hard')
vc.fit(X_train_scaled_std, y_train)
preds = vc.predict(X_test_scaled_std)
print(accuracy_score(y_test, preds))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003687 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1097
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 36
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287
0.8267773131207528


In [15]:
vc = VotingClassifier(estimators=[('xgb', xgb), ('lgbm', lgbm)], voting='hard')
vc.fit(X_train_scaled_minmax, y_train)
preds = vc.predict(X_test_scaled_minmax)
print(accuracy_score(y_test, preds))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 36
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287
0.8309592263460533


In [16]:
sc = StackingClassifier(estimators=[('xgb', xgb), ('lgbm', lgbm)], final_estimator=LogisticRegression())
sc.fit(X_train, y_train)

y_pred = sc.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1074
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 36
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1058
[LightGBM] [Info] Number of data points in the train set: 48971, number of used features: 36
[LightGBM] [Info] Start training from score -1.105353
[LightGBM] [Info] Start training from score -1.635840
[LightGBM] [Info] Start 

In [28]:
print(accuracy_score(y_test, y_pred))

0.8365133298484057


In [27]:
y_preds_minmax = sc.predict(df_test_scaled_minmax)
y_preds_scaled = sc.predict(df_test_scaled_std)

y_preds_scaled



array([0, 0, 0, ..., 0, 0, 0])

In [21]:
#inverse transform for submission

y_preds_minmax = le.inverse_transform(y_preds_minmax)
y_preds_scaled = le.inverse_transform(y_preds_scaled)

ValueError: y contains previously unseen labels: ['Dropout']

In [22]:
print(y_preds_minmax)
print(y_preds_scaled)


['Dropout' 'Dropout' 'Dropout' ... 'Dropout' 'Dropout' 'Dropout']
['Dropout' 'Dropout' 'Dropout' ... 'Dropout' 'Dropout' 'Dropout']


In [23]:
submission = pd.DataFrame({'id':df_test['id'], 'Target':y_preds_scaled})
submission2 = pd.DataFrame({'id':df_test['id'], 'Target':y_preds_minmax})


In [24]:
submission.to_csv('Stacking_scaler.csv', index = False)
submission.to_csv('Stacking_minmax.csv', index = False)