### 1. Data Overview

#### 1.1 Import the necessary libraries and read the dataset

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [5,3]

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, cross_val_score

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC

import optuna

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Load the two CSV files. The training frame uses the `id` column as its
# index; the test frame is read without index_col, so `id` stays a regular column.
df_train= pd.read_csv('train.csv', index_col = 'id')
df_test = pd.read_csv('test.csv')

In [4]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0_level_0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [5]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,76518,1,1,1,9500,1,1,141.0,1,3,...,0,0,8,0,0,0.0,0,13.9,-0.3,0.79
1,76519,1,1,1,9238,1,1,128.0,1,1,...,0,0,6,6,6,13.5,0,11.1,0.6,2.02
2,76520,1,1,1,9238,1,1,118.0,1,1,...,0,0,6,11,5,11.0,0,15.5,2.8,-4.06
3,76521,1,44,1,9147,1,39,130.0,1,1,...,0,3,8,14,5,11.0,0,8.9,1.4,3.51
4,76522,1,39,1,9670,1,1,110.0,1,1,...,0,0,6,9,4,10.666667,2,7.6,2.6,0.32


In [6]:
# (rows, columns) of the training frame.
df_train.shape

(76518, 37)

In [7]:
# Remove the cap on displayed columns so wide frames are rendered in full.
pd.options.display.max_columns = None

# Summary statistics of the numeric columns, one row per feature.
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Marital status,76518.0,1.111934,0.441669,1.0,1.0,1.0,1.0,6.0
Application mode,76518.0,16.054419,16.682337,1.0,1.0,17.0,39.0,53.0
Application order,76518.0,1.64441,1.229645,0.0,1.0,1.0,2.0,9.0
Course,76518.0,9001.286377,1803.438531,33.0,9119.0,9254.0,9670.0,9991.0
Daytime/evening attendance,76518.0,0.915314,0.278416,0.0,1.0,1.0,1.0,1.0
Previous qualification,76518.0,3.65876,8.623774,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),76518.0,132.378766,10.995328,95.0,125.0,133.1,140.0,190.0
Nacionality,76518.0,1.2266,3.392183,1.0,1.0,1.0,1.0,109.0
Mother's qualification,76518.0,19.837633,15.399456,1.0,1.0,19.0,37.0,44.0
Father's qualification,76518.0,23.425076,14.921164,1.0,4.0,19.0,37.0,44.0


In [8]:
# Number of rows per target class.
df_train['Target'].value_counts()
# The dataset is imbalanced: the class counts differ substantially.

Graduate    36282
Dropout     25296
Enrolled    14940
Name: Target, dtype: int64

#### 1.2 Check null and duplicate values

In [9]:
# Count of missing values in each column.
df_train.isna().sum()
# No null values present

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrol

In [10]:
# Number of fully duplicated rows.
df_train.duplicated().sum()
# No duplicate values present

0

### 2. EDA 

#### 2.1 Separate categorical and continuous features and explore the features w.r.t. labels

In [11]:
# Name of the label column.
target = "Target"

# Every column of the training frame except the label.
feature_list = [feature for feature in df_train.columns if feature != target]

# Columns handled as categorical in the EDA below.
categorical_features = ['Scholarship holder','International','Gender','Tuition fees up to date','Daytime/evening attendance','Debtor','Educational special needs','Displaced']

# Everything else is treated as continuous. A list filter is used instead of a
# set difference so the features keep their DataFrame column order; set
# iteration order over strings changes from one kernel run to the next, which
# would shuffle the plots on every restart.
continuous_features = [feature for feature in feature_list if feature not in categorical_features]

In [None]:
# Grid geometry: two panels per row and enough rows to hold every feature.
num_plots = len(continuous_features)
num_cols = 2
num_rows = -(-num_plots // num_cols)  # ceiling division

# Build the figure and view the 2-D axes grid as a flat sequence.
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 5 * num_rows))
axes = axes.flatten()

# One boxplot per continuous feature, grouped by target class.
for panel, feature in zip(axes, continuous_features):
    sns.boxplot(data=df_train, x='Target', y=feature, ax=panel)

plt.tight_layout()
plt.show()


#### 2.2 Doing the same for categorical variables

In [None]:
# Create the subplots
# Lay out one panel per categorical feature, two panels per row
# (8 features -> a 4x2 grid with figsize (15, 20)).
n_cols = 2
n_rows = -(-len(categorical_features) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.flatten()

# Plot each barplot on its own axes. The title is set through the Axes object:
# plt.title() acts on the *current* axes, which is not the panel passed to
# seaborn via `ax=`, so it would not label the panel being drawn.
for ax, feature in zip(axes, categorical_features):
    sns.barplot(data=df_train, x='Target', y=feature, ax=ax)
    ax.set_title(f'Barplot of {feature} vs Target')

# Hide any panels left over when the feature count does not fill the grid.
for ax in axes[len(categorical_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


#### 2.3 Distribution of the target variable

In [None]:
# Pie chart of the target classes; the counts are computed once and reused
# for both the wedge sizes and the wedge labels.
target_counts = df_train.Target.value_counts()

plt.figure(figsize=(7,5))
plt.title('Distribution of the target variable')
plt.pie(
    target_counts,
    labels=target_counts.index,
    explode=[0.1, 0.1, 0.1],  # one offset per class wedge
    autopct='%1.1f%%',
    shadow=True,
    startangle=450,
)
plt.show()

# Conclusion: the dataset is imbalanced, with 'Graduate' as the majority class

#### 2.4 Observe the correlation

In [None]:
# Pairwise correlations between the feature columns (label column excluded).
feature_corr = df_train.drop(columns=['Target']).corr()

# Large canvas so the per-cell annotations stay legible.
plt.figure(figsize=(30, 20))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm')

### 3. Preprocessing

In [16]:
# Feature matrix: every column except the label. `Encoded_Target` is excluded
# too whenever it exists, so a label-derived column can never end up in X,
# whatever order the notebook cells are executed in.
X = df_train.drop(columns=['Target', 'Encoded_Target'], errors='ignore')

# Integer-encode the string labels. `le` is kept because later cells use it to
# map predicted codes back to class names.
le = LabelEncoder()
y = le.fit_transform(df_train['Target'])

# Hold out 20% of the rows for evaluation; the seed fixes the split.
# NOTE(review): the split is not stratified although the classes are
# imbalanced - consider stratify=y (this would change the reported scores).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [None]:
# Round-trip demonstration of the label encoding. The encoded labels are kept
# in a standalone array instead of being written into df_train: adding a
# label-derived column to the training frame risks leaking the target into the
# feature matrix if the preprocessing cell is run again.
le = LabelEncoder()

# Encode the 'Target' column
encoded_target = le.fit_transform(df_train['Target'])

# Decode the encoded labels back to the original class names
decoded = le.inverse_transform(encoded_target)
decoded

### 4. Modelling

#### 4.1 Boosting algorithms work best on this data

In [12]:
# Baseline boosting models with library defaults and logging turned down.
cat = CatBoostClassifier(verbose=0)
lgbm = LGBMClassifier(verbose=0)
# XGBoost's logging option is `verbosity`; `verbose` is not a constructor
# parameter and would be forwarded to the booster as an unused setting.
xgb = XGBClassifier(verbosity=0)


In [None]:
# Fit every baseline on the training split and print its hold-out metrics.
models = [cat, lgbm, xgb]
banner = '*'*100

for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Header identifying the model.
    print(banner)
    print(type(model).__name__)
    print(banner)

    # Per-class report, overall accuracy, then the confusion matrix.
    print(classification_report(y_test, preds))
    print(accuracy_score(y_test, preds))
    print(confusion_matrix(y_test, preds),'\n\n\n')

#### 4.2 Optuna LGBM

In [None]:
# Disabled Optuna search for LightGBM: the whole cell is a string literal, so
# none of the code inside it is executed.
# NOTE(review): the objective inside the string scores each trial on
# X_test / y_test, the same hold-out split used for the accuracy printed later;
# if the parameters below came from this search, that accuracy is optimistic.
'''def objective(trial):
    lgb_params = {
    'objective': 'multiclass',
    'data_sample_strategy': 'goss',
    'tree_learner': 'feature',
    'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
    'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 0.9),
    'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
    'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
    'num_leaves': trial.suggest_int('num_leaves', 2, 2**11),
    'max_depth': trial.suggest_int('max_depth', 3, 8),
    'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
    'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    'min_sum_hessian_in_leaf': trial.suggest_loguniform('min_sum_hessian_in_leaf', 1e-5, 1e2),
    'min_gain_to_split': trial.suggest_loguniform('min_gain_to_split', 1e-8, 10.0),
    'max_bin': trial.suggest_int('max_bin', 100, 500),
    'top_rate': trial.suggest_uniform('top_rate', 0.1, 0.9),
    'verbose': -1,
    'random_state': 42      
}
    lgbm = LGBMClassifier(**lgb_params)

    lgbm.fit(X_train, y_train)
    preds = lgbm.predict(X_test)

    return accuracy_score(y_test, preds)


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)'''

In [23]:
# Hyper-parameters passed to LGBMClassifier in the next cell.
# NOTE(review): presumably the best trial of the disabled Optuna search; the
# fixed settings of that search (objective, data_sample_strategy, tree_learner,
# random_state) are not part of this dict - confirm that is intended.
lgb_best_params = dict(
    n_estimators=170,
    learning_rate=0.1315700338586378,
    feature_fraction=0.47422346847150354,
    lambda_l1=0.10649097661034565,
    lambda_l2=3.957038105600349e-07,
    num_leaves=468,
    max_depth=4,
    colsample_bytree=0.5520694480581351,
    min_child_samples=20,
    min_sum_hessian_in_leaf=4.987565756942516,
    min_gain_to_split=1.2363801068603209e-05,
    max_bin=483,
    top_rate=0.3425102021650939,
    verbose=-1,
)

In [25]:
# Refit LightGBM with the tuned parameters on the training split.
lgbm = LGBMClassifier(**lgb_best_params)
lgbm.fit(X_train, y_train)

# Score the refitted model on the hold-out split.
y_pred = lgbm.predict(X_test)
lgbm_holdout_accuracy = accuracy_score(y_test, y_pred)
print(lgbm_holdout_accuracy)

0.8361212754835338


In [33]:
# Predict encoded class labels for the test frame; the `id` column is dropped
# first because it is not one of the training features.
final_preds = lgbm.predict(df_test.drop(columns = ['id']))
print(final_preds)

[0 2 2 ... 0 0 0]


In [34]:
# Map the predicted integer codes back to the original class names.
submission = le.inverse_transform(final_preds)
submission

array(['Dropout', 'Graduate', 'Graduate', ..., 'Dropout', 'Dropout',
       'Dropout'], dtype=object)

In [35]:
# Pair each test id with its predicted class name and write the LightGBM
# predictions to submission5.csv (without the DataFrame index).
sub = pd.DataFrame({'id': df_test['id'], 'Target': submission})
sub.to_csv('submission5.csv', index=False)

#### 4.3 Optuna XGB

In [37]:
# Disabled Optuna search for XGBoost: the whole cell is a string literal, so
# none of the code inside it is executed.
# NOTE(review): the search space inside the string is the same as the one in
# the LightGBM search cell ('objective': 'multiclass', 'feature_fraction',
# 'lambda_l1', 'num_leaves', 'min_child_samples', 'top_rate', ...). These look
# like LightGBM option names - confirm which of them XGBClassifier honours.
# NOTE(review): trials are scored on X_test / y_test, the same hold-out split
# used for the accuracy reported afterwards.
'''
def objective(trial):
    xgb_params = {
    'objective': 'multiclass',
    'data_sample_strategy': 'goss',
    'tree_learner': 'feature',
    'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
    'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 0.9),
    'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
    'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
    'num_leaves': trial.suggest_int('num_leaves', 2, 2**11),
    'max_depth': trial.suggest_int('max_depth', 3, 8),
    'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
    'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    'min_sum_hessian_in_leaf': trial.suggest_loguniform('min_sum_hessian_in_leaf', 1e-5, 1e2),
    'min_gain_to_split': trial.suggest_loguniform('min_gain_to_split', 1e-8, 10.0),
    'max_bin': trial.suggest_int('max_bin', 100, 500),
    'top_rate': trial.suggest_uniform('top_rate', 0.1, 0.9),
    'verbose': -1,
    'random_state': 42      
}
    xgb = XGBClassifier(**xgb_params)

    xgb.fit(X_train, y_train)
    preds = xgb.predict(X_test)

    return accuracy_score(y_test, preds)


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=150)'''

[I 2024-06-30 02:48:32,998] A new study created in memory with name: no-name-5a36d109-0346-4936-af69-9928f9e1138c
[I 2024-06-30 02:48:39,344] Trial 0 finished with value: 0.8361212754835338 and parameters: {'n_estimators': 605, 'learning_rate': 0.17849576982772256, 'feature_fraction': 0.6140661544812789, 'lambda_l1': 6.903492617998526e-06, 'lambda_l2': 0.0004625510713178527, 'num_leaves': 439, 'max_depth': 3, 'colsample_bytree': 0.9884720549076185, 'min_child_samples': 50, 'min_sum_hessian_in_leaf': 34.62300661975427, 'min_gain_to_split': 6.962271133952577e-08, 'max_bin': 380, 'top_rate': 0.12876413968928402}. Best is trial 0 with value: 0.8361212754835338.
[I 2024-06-30 02:48:44,616] Trial 1 finished with value: 0.8357945635128071 and parameters: {'n_estimators': 417, 'learning_rate': 0.1428636047410064, 'feature_fraction': 0.443389538711646, 'lambda_l1': 2.0102083102315752e-07, 'lambda_l2': 1.523166117311554e-08, 'num_leaves': 994, 'max_depth': 5, 'colsample_bytree': 0.69147719157244

In [27]:
# Tuned settings for XGBClassifier, reduced to the keys XGBoost actually
# consumes. The previous dict also carried LightGBM option names
# (feature_fraction, lambda_l1, lambda_l2, num_leaves, min_child_samples,
# min_sum_hessian_in_leaf, min_gain_to_split, top_rate) plus `verbose`.
# XGBoost does not recognise those names and ignores them ("Parameters: {...}
# are not used"), so dropping them leaves the fitted model unchanged while
# making clear which settings are really in effect.
# To regularise the XGBoost model, the native names are reg_alpha, reg_lambda,
# max_leaves, min_child_weight and gamma - these would need their own tuning.
xgb_best_params = {'n_estimators': 495,
 'learning_rate': 0.10334960152492949,
 'max_depth': 4,
 'colsample_bytree': 0.5575967200471991,
 'max_bin': 245,
 'verbosity': 0  # XGBoost's logging option (replaces the unused `verbose`)
 }


In [28]:
# Fit XGBoost with the tuned parameters on the training split.
model4 = XGBClassifier(**xgb_best_params)
model4.fit(X_train, y_train)

# Score the fitted model on the hold-out split.
y_pred = model4.predict(X_test)
xgb_holdout_accuracy = accuracy_score(y_test, y_pred)
print(xgb_holdout_accuracy)

0.8368400418191323


In [41]:
# Predict encoded class labels for the test frame with the tuned XGBoost
# model; `id` is dropped because it is not one of the training features.
submission5 = model4.predict(df_test.drop(columns = ['id']))
submission5

array([0, 2, 2, ..., 0, 0, 0], dtype=int64)

In [42]:
# Map the predicted integer codes back to the original class names.
sub5 = le.inverse_transform(submission5)
sub5

array(['Dropout', 'Graduate', 'Graduate', ..., 'Dropout', 'Dropout',
       'Dropout'], dtype=object)

In [43]:
# Build the id/label table for the XGBoost predictions.
# NOTE(review): this rebinds `submission5`, which held the raw predicted codes
# until this cell runs.
submission5 = pd.DataFrame({'id' : df_test['id'], 'Target': sub5})
submission5

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout


In [44]:
# Write the XGBoost predictions to disk. Note that the variable is named
# `submission5` but the file is submission6.csv.
submission5.to_csv('submission6.csv', index = False)

#### 4.4 HistGradientBoosting (HGBC)

In [20]:
# Baseline HistGradientBoosting model with default settings.
hgb = HistGradientBoostingClassifier().fit(X_train, y_train)

# Hold-out accuracy of the baseline.
preds = hgb.predict(X_test)
acc = accuracy_score(y_test, preds)
print(acc)

0.8323314166231051


In [33]:
# NOTE(review): this cell contains only a placeholder string. The Optuna study
# code that produced the trial log saved in this cell's output is not present.
'''AssertionError'''

[I 2024-06-30 19:36:27,323] A new study created in memory with name: no-name-8c0cd7d0-0f69-4af7-b690-4058b462eb3a
[I 2024-06-30 19:36:30,238] Trial 0 finished with value: 0.8312859383167799 and parameters: {'learning_rate': 0.285821268768845, 'max_depth': 5, 'min_samples_leaf': 72, 'l2_regularization': 0.0011723928332750511, 'max_bins': 20}. Best is trial 0 with value: 0.8312859383167799.
[I 2024-06-30 19:36:33,628] Trial 1 finished with value: 0.8328541557762676 and parameters: {'learning_rate': 0.22057206123338194, 'max_depth': 3, 'min_samples_leaf': 67, 'l2_regularization': 1.56889454653191e-08, 'max_bins': 194}. Best is trial 1 with value: 0.8328541557762676.
[I 2024-06-30 19:36:41,880] Trial 2 finished with value: 0.8320700470465238 and parameters: {'learning_rate': 0.04735760991295793, 'max_depth': 6, 'min_samples_leaf': 68, 'l2_regularization': 1.5263597032926175, 'max_bins': 130}. Best is trial 1 with value: 0.8328541557762676.
[I 2024-06-30 19:36:51,202] Trial 3 finished with 

In [37]:
# Settings passed to HistGradientBoostingClassifier in the next cell.
hgb_params = dict(
    learning_rate=0.14593644845402992,
    max_depth=5,
    min_samples_leaf=33,
    l2_regularization=7.894163479859661e-07,
    max_bins=151,
)

In [41]:
# Refit HistGradientBoosting with the tuned settings on the training split.
hgb = HistGradientBoostingClassifier(**hgb_params).fit(X_train, y_train)

# Hold-out accuracy of the tuned model.
preds = hgb.predict(X_test)
acc = accuracy_score(y_test, preds)
print(acc)

0.8346184004181914


In [42]:
# Predict encoded class labels for the test frame with the tuned HGB model;
# `id` is dropped because it is not one of the training features.
subs = hgb.predict(df_test.drop(columns = ['id']))
print(subs)

[0 2 2 ... 0 0 0]


In [43]:
# Map the predicted integer codes back to class names.
# NOTE(review): `subs` is rebound from codes to names here, so this cell
# should not be executed twice in a row without re-running the previous one.
subs = le.inverse_transform(subs)
print(subs)

['Dropout' 'Graduate' 'Graduate' ... 'Dropout' 'Dropout' 'Dropout']


In [44]:
# Build the id/label table for the HistGradientBoosting predictions.
submission7 = pd.DataFrame({'id' : df_test['id'], 'Target': subs})
submission7

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout


In [47]:
# Write the HistGradientBoosting predictions to submission7.csv.
submission7.to_csv('submission7.csv', index = False)

In [92]:
# Load submissions 1, 2, 5, 6 and 7 for the majority-vote ensemble below.
# NOTE(review): submission1.csv and submission2.csv are not written by any
# cell visible in this notebook; they must already exist on disk.
(
    submission1,
    submission2,
    submission5,
    submission6,
    submission7,
) = map(
    pd.read_csv,
    (
        'submission1.csv',
        'submission2.csv',
        'submission5.csv',
        'submission6.csv',
        'submission7.csv',
    ),
)

In [93]:
# Put the test ids next to the Target column of each loaded submission,
# one vote column per submission.
vote_frames = [submission1, submission2, submission5, submission6, submission7]
vote_series = [frame.Target for frame in vote_frames]

final_df = pd.concat([df_test['id'], *vote_series], axis = 1)
final_df.columns = ['id', 'Target1', 'Target2', 'Target3', 'Target4', 'Target5']
final_df

Unnamed: 0,id,Target1,Target2,Target3,Target4,Target5
0,76518,Dropout,Dropout,Dropout,Dropout,Dropout
1,76519,Graduate,Graduate,Graduate,Graduate,Graduate
2,76520,Graduate,Graduate,Graduate,Graduate,Graduate
3,76521,Graduate,Enrolled,Graduate,Enrolled,Enrolled
4,76522,Enrolled,Enrolled,Enrolled,Enrolled,Enrolled
...,...,...,...,...,...,...
51007,127525,Dropout,Dropout,Dropout,Dropout,Dropout
51008,127526,Dropout,Dropout,Dropout,Dropout,Dropout
51009,127527,Dropout,Dropout,Dropout,Dropout,Dropout
51010,127528,Dropout,Dropout,Dropout,Dropout,Dropout


In [94]:
# Convert every vote column from class names to the encoder's integer codes.
for vote_col in ['Target1', 'Target2', 'Target3', 'Target4', 'Target5']:
    final_df[vote_col] = le.transform(final_df[vote_col])

final_df

Unnamed: 0,id,Target1,Target2,Target3,Target4,Target5
0,76518,0,0,0,0,0
1,76519,2,2,2,2,2
2,76520,2,2,2,2,2
3,76521,2,1,2,1,1
4,76522,1,1,1,1,1
...,...,...,...,...,...,...
51007,127525,0,0,0,0,0
51008,127526,0,0,0,0,0
51009,127527,0,0,0,0,0
51010,127528,0,0,0,0,0


In [95]:
# Majority vote: the row-wise mode of the five vote columns is the final code.
vote_columns = ['Target1', 'Target2', 'Target3', 'Target4', 'Target5']
row_modes = final_df[vote_columns].mode(axis=1)

# Column 0 of the mode table is used as the prediction.
# NOTE(review): when several codes tie for a row, the mode table has more than
# one column; confirm that taking column 0 is the intended tie-break.
final_df['Final'] = row_modes[0]
final_df

Unnamed: 0,id,Target1,Target2,Target3,Target4,Target5,Final
0,76518,0,0,0,0,0,0.0
1,76519,2,2,2,2,2,2.0
2,76520,2,2,2,2,2,2.0
3,76521,2,1,2,1,1,1.0
4,76522,1,1,1,1,1,1.0
...,...,...,...,...,...,...,...
51007,127525,0,0,0,0,0,0.0
51008,127526,0,0,0,0,0,0.0
51009,127527,0,0,0,0,0,0.0
51010,127528,0,0,0,0,0,0.0


In [98]:
# Cast the voted code to an integer dtype so it can be decoded by the label
# encoder (the saved output of the previous cell shows it as float).
final_df['Final'] = final_df['Final'].astype(int) 

final_df

Unnamed: 0,id,Target1,Target2,Target3,Target4,Target5,Final
0,76518,0,0,0,0,0,0
1,76519,2,2,2,2,2,2
2,76520,2,2,2,2,2,2
3,76521,2,1,2,1,1,1
4,76522,1,1,1,1,1,1
...,...,...,...,...,...,...,...
51007,127525,0,0,0,0,0,0
51008,127526,0,0,0,0,0,0
51009,127527,0,0,0,0,0,0
51010,127528,0,0,0,0,0,0


In [100]:
# Map the voted integer codes back to class names.
final_df['Target'] = le.inverse_transform(final_df['Final'])
final_df

Unnamed: 0,id,Target1,Target2,Target3,Target4,Target5,Final,Target
0,76518,0,0,0,0,0,0,Dropout
1,76519,2,2,2,2,2,2,Graduate
2,76520,2,2,2,2,2,2,Graduate
3,76521,2,1,2,1,1,1,Enrolled
4,76522,1,1,1,1,1,1,Enrolled
...,...,...,...,...,...,...,...,...
51007,127525,0,0,0,0,0,0,Dropout
51008,127526,0,0,0,0,0,0,Dropout
51009,127527,0,0,0,0,0,0,Dropout
51010,127528,0,0,0,0,0,0,Dropout


In [101]:
# Keep only the two columns written to the submission file.
sub = final_df[['id', 'Target']]
sub

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout


In [102]:
# Write the majority-vote predictions to disk (without the DataFrame index).
sub.to_csv('final_ensemble_submission.csv', index = False)