In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import DMatrix, cv
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score,classification_report
import pickle
import joblib
import warnings
warnings.filterwarnings('ignore')

In [53]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [54]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [55]:
train.columns

Index(['id', 'Marital status', 'Application mode', 'Application order',
       'Course', 'Daytime/evening attendance', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Admission grade',
       'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units

In [56]:
object_cols=[cols for cols in train.columns if (train[cols].dtypes in ['object','category']) or (train[cols].nunique()<25)]
object_cols

['Marital status',
 'Application mode',
 'Application order',
 'Course',
 'Daytime/evening attendance',
 'Previous qualification',
 'Nacionality',
 'Displaced',
 'Educational special needs',
 'Debtor',
 'Tuition fees up to date',
 'Gender',
 'Scholarship holder',
 'International',
 'Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (without evaluations)',
 'Unemployment rate',
 'Inflation rate',
 'GDP',
 'Target']

In [57]:
for obj_cols in object_cols:
    if train[obj_cols].nunique()<=10:
        print(obj_cols," - ",train[obj_cols].nunique())

Marital status  -  6
Application order  -  8
Daytime/evening attendance  -  2
Displaced  -  2
Educational special needs  -  2
Debtor  -  2
Tuition fees up to date  -  2
Gender  -  2
Scholarship holder  -  2
International  -  2
Target  -  3


#### Drop the unnecessary variables such as id, application mode, order and course id.

In [58]:
train['Target']=train['Target'].replace({'Graduate':0,'Enrolled':1,'Dropout':2})

In [59]:
x=train.drop(['id','Application mode','Application order','Course','Target'],axis='columns')
y=train.Target

In [60]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=100,stratify=y)

In [61]:
rf=RandomForestClassifier(class_weight='balanced',random_state=44)
rf.fit(x_train,y_train)

In [62]:
y_pred_rf=rf.predict(x_test)

In [63]:
print(accuracy_score(y_pred_rf,y_test))

0.819916361735494


#### Baseline model with no preprocessing gives 82% accuracy.

In [64]:
print(classification_report(y_pred_rf,y_test))

              precision    recall  f1-score   support

           0       0.93      0.83      0.88     10088
           1       0.56      0.64      0.60      3242
           2       0.82      0.90      0.86      5800

    accuracy                           0.82     19130
   macro avg       0.77      0.79      0.78     19130
weighted avg       0.83      0.82      0.82     19130



In [65]:
feature_importances = rf.feature_importances_
importance_df = pd.DataFrame({
    'Feature': x.columns,
    'Importance': feature_importances*100
}).sort_values(by='Importance', ascending=False)

In [66]:
importance_df

Unnamed: 0,Feature,Importance
27,Curricular units 2nd sem (approved),14.129327
28,Curricular units 2nd sem (grade),13.835457
21,Curricular units 1st sem (approved),10.665738
22,Curricular units 1st sem (grade),8.826856
26,Curricular units 2nd sem (evaluations),6.486902
9,Admission grade,4.944272
20,Curricular units 1st sem (evaluations),4.522029
3,Previous qualification (grade),4.113203
16,Age at enrollment,3.435806
13,Tuition fees up to date,2.9491


#### Selecting the variables whose importance scores are atleast above than or equal to 1%.

In [67]:
features=importance_df[importance_df.Importance>=1]

In [68]:
x_features=train[features.Feature]
y_features=train.Target

In [69]:
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_features, y_features)

In [70]:
categ=x_resampled['Scholarship holder']

In [71]:
stdscaler=StandardScaler()
x_scaled=stdscaler.fit_transform(x_resampled.drop('Scholarship holder',axis='columns'))

In [51]:
with open('scaler.joblib','wb') as scale:
    joblib.dump(stdscaler,scale)

In [72]:
scaled_df=pd.DataFrame(data=x_scaled,columns=['Curricular units 2nd sem (approved)','Curricular units 2nd sem (grade)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 2nd sem (evaluations)', 'Admission grade',
       'Curricular units 1st sem (evaluations)',
       'Previous qualification (grade)', 'Age at enrollment',
       'Tuition fees up to date', "Father's occupation", "Mother's occupation",
       'GDP', 'Unemployment rate', "Mother's qualification",
       "Father's qualification", 'Inflation rate',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 2nd sem (enrolled)'])

In [73]:
scaled_df=pd.concat([scaled_df,categ],axis='columns')

In [74]:
drop_cols=[]
for cols in train.columns:
    if cols not in scaled_df.columns:
        drop_cols.append(cols)

In [75]:
drop_cols

['id',
 'Marital status',
 'Application mode',
 'Application order',
 'Course',
 'Daytime/evening attendance',
 'Previous qualification',
 'Nacionality',
 'Displaced',
 'Educational special needs',
 'Debtor',
 'Gender',
 'International',
 'Curricular units 1st sem (credited)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (without evaluations)',
 'Target']

In [24]:
scaled_df.shape

(108846, 20)

In [25]:
scaled_df.head()

Unnamed: 0,Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (evaluations),Admission grade,Curricular units 1st sem (evaluations),Previous qualification (grade),Age at enrollment,Tuition fees up to date,Father's occupation,Mother's occupation,GDP,Unemployment rate,Mother's qualification,Father's qualification,Inflation rate,Curricular units 1st sem (enrolled),Curricular units 2nd sem (enrolled),Scholarship holder
0,0.899566,0.555283,0.833283,0.905475,-0.096268,-0.174022,-0.413352,-0.54107,-0.625226,0.392251,-0.224453,-0.20229,0.957357,-0.139027,-1.220204,-0.293505,-0.457739,0.16471,0.147912,1
1,-1.37429,-1.715768,0.057301,0.347844,0.474619,-0.399284,0.160884,-0.633823,-0.625226,0.392251,0.002185,0.013516,0.957357,-0.139027,-0.049926,-0.293505,-0.457739,0.16471,0.147912,0
2,-1.37429,-1.715768,-1.494664,-1.882679,-2.09437,1.603946,-2.13606,0.479204,-0.625226,0.392251,-0.337772,-0.364145,-0.375884,1.805649,-1.090173,-0.293505,-0.676473,0.16471,0.147912,0
3,1.278541,0.626808,1.221274,0.538448,1.045506,0.107557,0.448002,-0.077309,-0.625226,0.392251,-0.394431,-0.310193,0.957357,-0.139027,-0.049926,-1.366731,-0.457739,0.794693,1.440177,1
4,0.899566,0.647518,0.833283,0.604226,1.330949,-0.375149,1.309357,0.015443,-0.625226,0.392251,0.002185,-0.256242,0.186435,-1.47361,-0.049926,0.913874,1.000483,0.794693,0.794045,0


In [44]:
x_feature_train, x_feature_test, y_feature_train, y_feature_test = train_test_split(scaled_df, y_resampled,
                                                                                    test_size=0.25,
                                                                                     random_state=42)

In [45]:
rf_1=RandomForestClassifier(random_state=44)
rf_1.fit(x_feature_train,y_feature_train)

In [46]:
y_pred_rf_1=rf_1.predict(x_feature_test)

In [47]:
print(accuracy_score(y_pred_rf_1,y_feature_test))

0.8529325297662795


In [48]:
print(classification_report(y_pred_rf_1,y_feature_test))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      9611
           1       0.84      0.79      0.81      9469
           2       0.83      0.93      0.88      8132

    accuracy                           0.85     27212
   macro avg       0.85      0.86      0.85     27212
weighted avg       0.85      0.85      0.85     27212



In [50]:
with open('rf.pkl','wb') as rf:
    pickle.dump(rf_1,rf)

In [31]:
rf_cross=cross_val_score(rf_1,scaled_df,y_resampled,cv=10,scoring='accuracy',n_jobs=-1)

In [32]:
rf_cross.mean()

np.float64(0.8506434379535343)

In [33]:
data_dmatrix = DMatrix(data=scaled_df, label=y_resampled)
params = {
    'objective': 'multi:softmax',
    'num_class': len(set(y_resampled)),
    'eval_metric': 'mlogloss',
    'seed': 42
}
cv_results = cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=10,
    num_boost_round=100,
    early_stopping_rounds=10,
    metrics="mlogloss",
    as_pandas=True,
    seed=42
)
print(cv_results)
print(f"Mean Test Log Loss: {cv_results['test-mlogloss-mean'].min():.4f}")

    train-mlogloss-mean  train-mlogloss-std  test-mlogloss-mean  \
0              0.871118            0.000319            0.872085   
1              0.739413            0.000453            0.741376   
2              0.655395            0.000600            0.658277   
3              0.599438            0.000743            0.603222   
4              0.560513            0.000805            0.565223   
..                  ...                 ...                 ...   
95             0.325120            0.001490            0.400873   
96             0.324295            0.001493            0.400655   
97             0.323485            0.001546            0.400400   
98             0.322702            0.001587            0.400220   
99             0.321872            0.001557            0.400013   

    test-mlogloss-std  
0            0.001218  
1            0.002243  
2            0.003211  
3            0.003866  
4            0.004449  
..                ...  
95           0.006599  
96 

In [34]:
best_num_boost_round = cv_results['test-mlogloss-mean'].idxmin()
xgb_model = XGBClassifier(
    objective='multi:softmax', 
    num_class=len(set(y_resampled)),
    eval_metric='mlogloss',
    seed=42,
    n_estimators=best_num_boost_round
)
xgb_model.fit(x_feature_train, y_feature_train)
y_pred = xgb_model.predict(x_feature_test)
accuracy = accuracy_score(y_feature_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 84.99%


In [35]:
train_data = lgb.Dataset(x_feature_train, label=y_feature_train)
params = {
    'objective': 'multiclass',
    'num_class': len(set(y_resampled)),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'seed': 42
}
lgb_model = lgb.train(params, train_data, 100)
y_pred_lgb = lgb_model.predict(x_feature_test)
y_pred_lgb = [np.argmax(val) for val in y_pred_lgb]
accuracy_lgb = accuracy_score(y_feature_test, y_pred_lgb)
print(f"LightGBM Accuracy: {accuracy_lgb * 100:.2f}%")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007793 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2198
[LightGBM] [Info] Number of data points in the train set: 81634, number of used features: 20
[LightGBM] [Info] Start training from score -1.102601
[LightGBM] [Info] Start training from score -1.094993
[LightGBM] [Info] Start training from score -1.098257
LightGBM Accuracy: 83.94%


In [36]:
adaboost_model = AdaBoostClassifier(
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)
adaboost_model.fit(x_feature_train, y_feature_train)
y_pred_ada = adaboost_model.predict(x_feature_test)
accuracy_ada = accuracy_score(y_feature_test, y_pred_ada)
print(f"AdaBoost Accuracy: {accuracy_ada * 100:.2f}%")

AdaBoost Accuracy: 80.24%


In [37]:
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=x_feature_train.shape[1]))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=3, activation='softmax'))
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
y_feature_train_one_hot = to_categorical(y_feature_train, num_classes=3)
model.fit(x_feature_train, y_feature_train_one_hot, epochs=50, batch_size=32)

Epoch 1/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.7772 - loss: 0.5630
Epoch 2/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8085 - loss: 0.4943
Epoch 3/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8127 - loss: 0.4872
Epoch 4/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8137 - loss: 0.4787
Epoch 5/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8148 - loss: 0.4771
Epoch 6/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8178 - loss: 0.4727
Epoch 7/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8162 - loss: 0.4723
Epoch 8/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8178 - loss: 0.4683
Epoch 9/50
[1m2552/255

<keras.src.callbacks.history.History at 0x2e103c97a30>

In [38]:
y_feature_test_one_hot = to_categorical(y_feature_test, num_classes=3)
loss, accuracy = model.evaluate(x_feature_test, y_feature_test_one_hot)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8198 - loss: 0.4792
Test Loss: 0.4728
Test Accuracy: 82.15%


In [76]:
from model import predict

In [79]:
predict(test)