In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import DMatrix, cv
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score,classification_report
import pickle
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [14]:
train['Age at enrollment'].describe()

count    76518.000000
mean        22.278653
std          6.889241
min         17.000000
25%         18.000000
50%         19.000000
75%         23.000000
max         70.000000
Name: Age at enrollment, dtype: float64

In [3]:
train.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [5]:
object_cols=[cols for cols in train.columns if (train[cols].dtypes in ['object','category']) or (train[cols].nunique()<25)]
object_cols

['Marital status',
 'Application mode',
 'Application order',
 'Course',
 'Daytime/evening attendance',
 'Previous qualification',
 'Nacionality',
 'Displaced',
 'Educational special needs',
 'Debtor',
 'Tuition fees up to date',
 'Gender',
 'Scholarship holder',
 'International',
 'Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (without evaluations)',
 'Unemployment rate',
 'Inflation rate',
 'GDP',
 'Target']

In [6]:
for obj_cols in object_cols:
    if train[obj_cols].nunique()<=25:
        print(obj_cols," - ",train[obj_cols].nunique())

Marital status  -  6
Application mode  -  22
Application order  -  8
Course  -  19
Daytime/evening attendance  -  2
Previous qualification  -  21
Nacionality  -  18
Displaced  -  2
Educational special needs  -  2
Debtor  -  2
Tuition fees up to date  -  2
Gender  -  2
Scholarship holder  -  2
International  -  2
Curricular units 1st sem (credited)  -  21
Curricular units 1st sem (enrolled)  -  24
Curricular units 1st sem (approved)  -  23
Curricular units 1st sem (without evaluations)  -  12
Curricular units 2nd sem (credited)  -  20
Curricular units 2nd sem (enrolled)  -  22
Curricular units 2nd sem (approved)  -  21
Curricular units 2nd sem (without evaluations)  -  11
Unemployment rate  -  11
Inflation rate  -  13
GDP  -  11
Target  -  3


In [4]:
train['Target']=train['Target'].replace({'Graduate':0,'Enrolled':1,'Dropout':2})

In [5]:
x=train.drop(['id','Target'],axis='columns')
y=train.Target

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=100,stratify=y)

In [7]:
rf=RandomForestClassifier(class_weight='balanced',random_state=44)
rf.fit(x_train,y_train)

In [8]:
y_pred_rf=rf.predict(x_test)

In [9]:
print(accuracy_score(y_pred_rf,y_test))

0.8239414532148458


#### Baseline model with no preprocessing gives 82% accuracy.

In [10]:
print(classification_report(y_pred_rf,y_test))

              precision    recall  f1-score   support

           0       0.93      0.84      0.88     10097
           1       0.57      0.65      0.61      3260
           2       0.82      0.90      0.86      5773

    accuracy                           0.82     19130
   macro avg       0.77      0.80      0.78     19130
weighted avg       0.84      0.82      0.83     19130



In [11]:
feature_importances = rf.feature_importances_
importance_df = pd.DataFrame({
    'Feature': x.columns,
    'Importance': feature_importances*100
}).sort_values(by='Importance', ascending=False)

In [12]:
importance_df

Unnamed: 0,Feature,Importance
30,Curricular units 2nd sem (approved),16.108866
31,Curricular units 2nd sem (grade),12.2269
25,Curricular units 1st sem (grade),9.244277
24,Curricular units 1st sem (approved),9.166944
29,Curricular units 2nd sem (evaluations),5.407321
23,Curricular units 1st sem (evaluations),4.48165
12,Admission grade,4.481153
6,Previous qualification (grade),3.599087
19,Age at enrollment,3.396945
16,Tuition fees up to date,3.12943


In [13]:
features=importance_df[importance_df.Importance>=0]

In [14]:
x_features=train[features.Feature]
y_features=train.Target

In [15]:
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_features, y_features)

In [16]:
x_resampled

Unnamed: 0,Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 1st sem (grade),Curricular units 1st sem (approved),Curricular units 2nd sem (evaluations),Curricular units 1st sem (evaluations),Admission grade,Previous qualification (grade),Age at enrollment,Tuition fees up to date,...,Previous qualification,Curricular units 1st sem (credited),Marital status,Curricular units 2nd sem (without evaluations),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Daytime/evening attendance,Nacionality,Educational special needs,International
0,6,12.428571,14.500000,6,7,6,122.600000,126.000000,18,1,...,1,0,1,0,0,0,1,1,0,0
1,0,0.000000,11.600000,4,9,8,119.800000,125.000000,18,1,...,1,0,1,0,0,0,1,1,0,0
2,0,0.000000,0.000000,0,0,0,144.700000,137.000000,18,1,...,1,0,1,0,0,0,1,1,0,0
3,7,12.820000,12.591250,7,11,9,126.100000,131.000000,18,1,...,1,0,1,0,0,0,1,1,0,0
4,6,12.933333,12.933333,6,12,12,120.100000,132.000000,18,1,...,1,0,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108841,0,0.000000,0.000000,0,5,5,116.700000,123.128643,21,1,...,1,0,1,0,0,0,1,1,0,0
108842,4,12.162852,11.422388,4,8,8,129.961094,140.397014,19,0,...,1,0,1,0,0,0,1,1,0,0
108843,0,0.000000,0.000000,0,12,12,118.673971,131.347798,20,0,...,1,0,1,0,0,0,1,1,0,0
108844,0,0.066752,12.000000,4,7,11,115.033821,125.011867,19,0,...,1,0,1,0,0,0,1,1,0,0


In [17]:
stdscaler=StandardScaler()
x_scaled=stdscaler.fit_transform(x_resampled)

In [18]:
with open('scaler.joblib','wb') as scale:
    joblib.dump(stdscaler,scale)

In [19]:
scaled_df=pd.DataFrame(data=x_scaled,columns=x_resampled.columns)

In [20]:
x_feature_train, x_feature_test, y_feature_train, y_feature_test = train_test_split(scaled_df, y_resampled,
                                                                                    test_size=0.25,
                                                                                     random_state=42)

In [21]:
rf_1=RandomForestClassifier(random_state=44)
rf_1.fit(x_feature_train,y_feature_train)

In [22]:
y_pred_rf_1=rf_1.predict(x_feature_test)

In [23]:
print(accuracy_score(y_pred_rf_1,y_feature_test))

0.8574158459503161


In [24]:
print(classification_report(y_pred_rf_1,y_feature_test))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      9651
           1       0.84      0.80      0.82      9460
           2       0.83      0.93      0.88      8101

    accuracy                           0.86     27212
   macro avg       0.86      0.86      0.86     27212
weighted avg       0.86      0.86      0.86     27212



In [25]:
rf_1.n_features_in_

36

In [26]:
rf_1.feature_names_in_

array(['Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (approved)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 1st sem (evaluations)', 'Admission grade',
       'Previous qualification (grade)', 'Age at enrollment',
       'Tuition fees up to date', "Father's occupation", 'Course',
       "Mother's occupation", 'GDP', 'Unemployment rate',
       'Scholarship holder', "Mother's qualification",
       "Father's qualification", 'Inflation rate', 'Application mode',
       'Curricular units 2nd sem (enrolled)', 'Application order',
       'Curricular units 1st sem (enrolled)', 'Gender', 'Debtor',
       'Displaced', 'Previous qualification',
       'Curricular units 1st sem (credited)', 'Marital status',
       'Curricular units 2nd sem (without evaluations)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (c

In [28]:
with open('rf.pkl','wb') as rf:
    pickle.dump(rf_1,rf)

In [64]:
rf_cross=cross_val_score(rf_1,x_scaled,y_resampled,cv=10,scoring='accuracy',n_jobs=-1)

In [66]:
rf_cross

array([0.80523656, 0.80799265, 0.80973817, 0.80826826, 0.87138264,
       0.87799724, 0.87991547, 0.89700478, 0.89057332, 0.89948548])

In [65]:
rf_cross.mean()

np.float64(0.8547594577772332)

In [67]:
data_dmatrix = DMatrix(data=x_scaled, label=y_resampled)
params = {
    'objective': 'multi:softmax',
    'num_class': len(set(y_resampled)),
    'eval_metric': 'mlogloss',
    'seed': 42
}
cv_results = cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=10,
    num_boost_round=100,
    early_stopping_rounds=10,
    metrics="mlogloss",
    as_pandas=True,
    seed=42
)
print(cv_results)
print(f"Mean Test Log Loss: {cv_results['test-mlogloss-mean'].min():.4f}")

    train-mlogloss-mean  train-mlogloss-std  test-mlogloss-mean  \
0              0.868174            0.000303            0.869099   
1              0.734158            0.000473            0.735980   
2              0.648678            0.000652            0.651474   
3              0.591190            0.000851            0.594828   
4              0.551131            0.000836            0.555742   
..                  ...                 ...                 ...   
95             0.302820            0.001321            0.381832   
96             0.302187            0.001402            0.381724   
97             0.301422            0.001468            0.381533   
98             0.300524            0.001359            0.381337   
99             0.299620            0.001399            0.381254   

    test-mlogloss-std  
0            0.001412  
1            0.002391  
2            0.003176  
3            0.003996  
4            0.004552  
..                ...  
95           0.006832  
96 

In [68]:
best_num_boost_round = cv_results['test-mlogloss-mean'].idxmin()
xgb_model = XGBClassifier(
    objective='multi:softmax', 
    num_class=len(set(y_resampled)),
    eval_metric='mlogloss',
    seed=42,
    n_estimators=best_num_boost_round
)
xgb_model.fit(x_feature_train, y_feature_train)
y_pred = xgb_model.predict(x_feature_test)
accuracy = accuracy_score(y_feature_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 85.55%


In [69]:
train_data = lgb.Dataset(x_feature_train, label=y_feature_train)
params = {
    'objective': 'multiclass',
    'num_class': len(set(y_resampled)),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'seed': 42
}
lgb_model = lgb.train(params, train_data, 100)
y_pred_lgb = lgb_model.predict(x_feature_test)
y_pred_lgb = [np.argmax(val) for val in y_pred_lgb]
accuracy_lgb = accuracy_score(y_feature_test, y_pred_lgb)
print(f"LightGBM Accuracy: {accuracy_lgb * 100:.2f}%")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2526
[LightGBM] [Info] Number of data points in the train set: 81634, number of used features: 36
[LightGBM] [Info] Start training from score -1.102601
[LightGBM] [Info] Start training from score -1.094993
[LightGBM] [Info] Start training from score -1.098257
LightGBM Accuracy: 84.66%


In [70]:
adaboost_model = AdaBoostClassifier(
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)
adaboost_model.fit(x_feature_train, y_feature_train)
y_pred_ada = adaboost_model.predict(x_feature_test)
accuracy_ada = accuracy_score(y_feature_test, y_pred_ada)
print(f"AdaBoost Accuracy: {accuracy_ada * 100:.2f}%")

AdaBoost Accuracy: 80.77%


In [71]:
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=x_feature_train.shape[1]))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=3, activation='softmax'))
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
y_feature_train_one_hot = to_categorical(y_feature_train, num_classes=3)
model.fit(x_feature_train, y_feature_train_one_hot, epochs=50, batch_size=32)

Epoch 1/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.7725 - loss: 0.5696
Epoch 2/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.8162 - loss: 0.4708
Epoch 3/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.8203 - loss: 0.4601
Epoch 4/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.8237 - loss: 0.4549
Epoch 5/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.8267 - loss: 0.4487
Epoch 6/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.8267 - loss: 0.4463
Epoch 7/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.8297 - loss: 0.4413
Epoch 8/50
[1m2552/2552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.8293 - loss: 0.4422
Epoch 9/50
[1m2

<keras.src.callbacks.history.History at 0x20f00262c50>

In [72]:
y_feature_test_one_hot = to_categorical(y_feature_test, num_classes=3)
loss, accuracy = model.evaluate(x_feature_test, y_feature_test_one_hot)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8279 - loss: 0.4638
Test Loss: 0.4614
Test Accuracy: 82.78%


In [43]:
import utils
from utils import AcademicSuccess

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

def random_forest_tuning():
    rf = RandomForestClassifier(random_state=42)

    # Define the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(x_feature_train,y_feature_train)

    # Get the best hyperparameters
    best_params = grid_search.best_params_

    # Train the model with the best hyperparameters
    best_rf = grid_search.best_estimator_
    best_rf.fit(x_feature_train,y_feature_train)

    # Predict on the test set
    y_pred = best_rf.predict(x_feature_test)

    # Calculate accuracy
    best_accuracy = accuracy_score(y_feature_test, y_pred)

    return best_params, best_accuracy

# Example usage
if __name__ == "__main__":
    best_params, best_accuracy = random_forest_tuning()
    print("Best Hyperparameters:", best_params)
    print("Best Accuracy:", best_accuracy)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Accuracy: 0.858555049242981


In [74]:
rf_3=RandomForestClassifier(random_state=44,max_depth=None,min_samples_split=2,min_samples_leaf=1,n_estimators=200)
rf_3.fit(x_feature_train,y_feature_train)

In [75]:
y_pred_rf_3=rf_3.predict(x_feature_test)

In [77]:
print(accuracy_score(y_pred_rf_3,y_feature_test))
print(classification_report(y_pred_rf_3,y_feature_test))

0.8573423489636924
              precision    recall  f1-score   support

           0       0.90      0.86      0.88      9638
           1       0.84      0.80      0.82      9463
           2       0.83      0.93      0.88      8111

    accuracy                           0.86     27212
   macro avg       0.86      0.86      0.86     27212
weighted avg       0.86      0.86      0.86     27212

