### 1. Importing Libraries

In [71]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

### 2. Loading Dataset

In [72]:
academic_record = pd.read_csv("updated_academic_records1.csv")
parent_data = pd.read_csv("updated_parent_data.csv")
teacher = pd.read_csv("teachers.csv")
class_rooms = pd.read_csv("class_rooms.csv")
exam_result = pd.read_csv("exam_result2_updated_check.csv")
student_data = pd.read_csv("students_data1.csv")
subject_averages = pd.read_csv("subject_averages.csv")

### 3. Dropping Unnecessary Features for the Potential Ones

In [73]:
student_data.drop(columns = "name", inplace = True)

In [74]:
academic_record.drop(columns = ["record_id", "student_id","department","subject_scores"], inplace = True)

In [75]:
exam_result.drop(columns = ["result_id", "student_id","department","subject_scores"], inplace = True)

In [76]:
parent_data.drop(columns = ["involvement_id", "student_id"], inplace = True)

In [77]:
teacher.drop(columns = ["teacherid", "name"], inplace = True)

In [78]:
class_rooms.drop(columns = ["classroom_id", "teacher_id", "department"], inplace = True)

In [79]:
subject_averages.drop(columns = ["student_id"], inplace = True)

In [80]:
# Initialize an empty column for Parental_Engagement_Score
parent_data['Parental_Engagement_Score'] = ''

# Iterate over each row using a for loop to create a new Parental_Engagement_Score column
for index, row in parent_data.iterrows():
    if row['attendance'] == True and row['academic_performance'] == True:
        parent_data.at[index, 'Parental_Engagement_Score'] = 'High'
    elif row['attendance'] == True and row['academic_performance'] == False:
        parent_data.at[index, 'Parental_Engagement_Score'] = 'Medium'
    elif row['attendance'] == False and row['academic_performance'] == True:
        parent_data.at[index, 'Parental_Engagement_Score'] = 'Medium'
    elif row['attendance'] == False and row['academic_performance'] == False:
        parent_data.at[index, 'Parental_Engagement_Score'] = 'Low'
        
parent_data.drop(columns = ["attendance","academic_performance"], axis = 1, inplace = True)

In [81]:
# Dictionary to map categorical values to numeric values for parent attendace
Parental_Engagement_mapping = {
    'Low': 0,
    'Medium':1,
    'High': 2
}
for index, row in parent_data.iterrows():
    parent_data.at[index, "Parental_Engagement_Score"] = Parental_Engagement_mapping[row["Parental_Engagement_Score"]]

In [82]:
# Dictionary to map categorical values to numeric values for student attendance
student_Engagement_mapping = {
    'Low': 0,
    'Medium':1,
    'High': 2
}
for index, row in subject_averages.iterrows():
    subject_averages.at[index, "attendance_category"] = student_Engagement_mapping[row["attendance_category"]]

### 4. Joining the necessary dataset for the Modelling

In [83]:
data_africa = pd.concat([student_data,academic_record,parent_data,exam_result,subject_averages], axis = 1)
data_africa.head()

Unnamed: 0,student_id,age,gender,home_language,residential_area,household_income,department,attendance_rate,marital_status,educational_level,...,Chemistry_avg,Government_avg,Literature_in_English_avg,Physics_avg,Commerce_avg,Financial_Accounting_avg,Geography_avg,Fine_Arts_avg,Further_Mathematics_avg,Marketing_avg
0,1,18,Female,Igbo,Urban,Middle,Science,73.72,Married,Secondary,...,53.5,0.0,0.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,17,Female,Yoruba,Urban,Middle,Science,83.69,Married,Tertiary,...,62.0,0.0,0.0,48.75,0.0,0.0,0.0,0.0,0.0,0.0
2,3,16,Male,Yoruba,Rural,Middle,Science,93.88,Divorced,Primary,...,58.625,0.0,0.0,53.5,0.0,0.0,0.0,0.0,63.0,0.0
3,4,15,Male,Yoruba,Urban,Middle,Science,69.35,Married,Secondary,...,45.5,0.0,0.0,53.875,0.0,0.0,0.0,0.0,0.0,0.0
4,5,17,Male,Hausa,Rural,Middle,Science,81.67,Married,Tertiary,...,58.75,0.0,0.0,59.5,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
data_africa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   student_id                 500 non-null    int64  
 1   age                        500 non-null    int64  
 2   gender                     500 non-null    object 
 3   home_language              500 non-null    object 
 4   residential_area           500 non-null    object 
 5   household_income           500 non-null    object 
 6   department                 500 non-null    object 
 7   attendance_rate            500 non-null    float64
 8   marital_status             500 non-null    object 
 9   educational_level          439 non-null    object 
 10  Parental_Engagement_Score  500 non-null    object 
 11  exam_type                  500 non-null    object 
 12  pass_or_fail               500 non-null    object 
 13  attendance_category        500 non-null    object 

In [85]:
data_africa.describe()

Unnamed: 0,student_id,age,attendance_rate,Civic_Education_avg,Religion_Studies_avg,History_avg,Computer_Science_avg,Mathematics_avg,Economics_avg,Visual_Art_avg,...,Chemistry_avg,Government_avg,Literature_in_English_avg,Physics_avg,Commerce_avg,Financial_Accounting_avg,Geography_avg,Fine_Arts_avg,Further_Mathematics_avg,Marketing_avg
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,16.532,80.86586,24.026,51.057,14.00225,4.342,51.683,46.35775,4.152,...,9.176,32.92225,14.07625,9.24125,41.869,34.346,19.1115,14.101,23.28875,9.85425
std,144.481833,1.139989,11.177319,31.260207,24.50137,26.522273,15.406496,23.429143,27.452673,14.948086,...,21.073229,32.419507,26.789645,21.172708,29.846054,31.400446,29.758719,26.822582,30.979656,23.610175
min,1.0,15.0,60.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,125.75,16.0,71.87,0.0,50.0,0.0,0.0,42.125,39.75,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,250.5,17.0,80.975,0.0,58.625,0.0,0.0,49.875,48.375,0.0,...,0.0,41.6875,0.0,0.0,44.625,41.125,0.0,0.0,0.0,0.0
75%,375.25,18.0,90.6475,53.28125,61.53125,0.0,0.0,72.8125,71.375,0.0,...,0.0,58.75,0.0,0.0,69.21875,53.90625,49.65625,0.0,52.875,0.0
max,500.0,18.0,99.99,89.25,89.625,86.625,74.25,89.875,93.125,74.25,...,78.125,93.25,92.625,77.5,91.5,92.625,87.625,89.75,87.5,86.875


### 5. Preprocessing using OneHotEncoder

In [86]:
data_africa_encoded = pd.get_dummies(data_africa, columns=['gender',"household_income", 'home_language', 'residential_area', 'department', 'exam_type',"marital_status", "educational_level", "pass_or_fail"], drop_first=True)

In [87]:
data_africa_encoded.head()

Unnamed: 0,student_id,age,attendance_rate,Parental_Engagement_Score,attendance_category,Civic_Education_avg,Religion_Studies_avg,History_avg,Computer_Science_avg,Mathematics_avg,...,household_income_Middle,home_language_Igbo,home_language_Yoruba,residential_area_Urban,department_Commercial,department_Science,marital_status_Married,educational_level_Secondary,educational_level_Tertiary,pass_or_fail_Pass
0,1,18,73.72,1,1,51.875,0.0,0.0,68.0,51.625,...,True,True,False,True,False,True,True,True,False,False
1,2,17,83.69,2,2,0.0,0.0,0.0,0.0,55.5,...,True,False,True,True,False,True,True,False,True,True
2,3,16,93.88,2,2,63.0,0.0,0.0,52.625,57.75,...,True,False,True,False,False,True,False,False,False,True
3,4,15,69.35,2,1,0.0,0.0,0.0,51.25,48.625,...,True,False,True,True,False,True,True,True,False,False
4,5,17,81.67,1,2,0.0,0.0,0.0,66.375,57.125,...,True,False,False,False,False,True,True,False,True,True


In [88]:
data_africa_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   student_id                   500 non-null    int64  
 1   age                          500 non-null    int64  
 2   attendance_rate              500 non-null    float64
 3   Parental_Engagement_Score    500 non-null    object 
 4   attendance_category          500 non-null    object 
 5   Civic_Education_avg          500 non-null    float64
 6   Religion_Studies_avg         500 non-null    float64
 7   History_avg                  500 non-null    float64
 8   Computer_Science_avg         500 non-null    float64
 9   Mathematics_avg              500 non-null    float64
 10  Economics_avg                500 non-null    float64
 11  Visual_Art_avg               500 non-null    float64
 12  Data_Processing_avg          500 non-null    float64
 13  Fisheries_avg       

In [89]:
# converting objetc columns into int type
data_africa_encoded['Parental_Engagement_Score'] = data_africa_encoded['Parental_Engagement_Score'].astype(int)
data_africa_encoded['attendance_category'] = data_africa_encoded['attendance_category'].astype(int)

In [90]:
# checking the infotmation about the modify dataset
data_africa_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   student_id                   500 non-null    int64  
 1   age                          500 non-null    int64  
 2   attendance_rate              500 non-null    float64
 3   Parental_Engagement_Score    500 non-null    int32  
 4   attendance_category          500 non-null    int32  
 5   Civic_Education_avg          500 non-null    float64
 6   Religion_Studies_avg         500 non-null    float64
 7   History_avg                  500 non-null    float64
 8   Computer_Science_avg         500 non-null    float64
 9   Mathematics_avg              500 non-null    float64
 10  Economics_avg                500 non-null    float64
 11  Visual_Art_avg               500 non-null    float64
 12  Data_Processing_avg          500 non-null    float64
 13  Fisheries_avg       

### 6. Creating Science Student Dataset

In [91]:
science_dataset = data_africa_encoded[['age', 'attendance_rate', 'Parental_Engagement_Score','attendance_category',"Mathematics_avg", "English_avg", 
"Physics_avg", "Chemistry_avg",'gender_Male','household_income_Low', 'household_income_Middle', 'home_language_Igbo','home_language_Yoruba', 'residential_area_Urban', 'pass_or_fail_Pass']]

science_incompulsory_dataset = data_africa_encoded [['Civic_Education_avg','Computer_Science_avg','Economics_avg', 
'Data_Processing_avg','Fisheries_avg', 'Biology_avg','Agricultural_Science_avg','Geography_avg','Further_Mathematics_avg']].mean(axis = 1)

science_dataset["Other Science Courses"] = science_incompulsory_dataset
science_dataset.head()

Unnamed: 0,age,attendance_rate,Parental_Engagement_Score,attendance_category,Mathematics_avg,English_avg,Physics_avg,Chemistry_avg,gender_Male,household_income_Low,household_income_Middle,home_language_Igbo,home_language_Yoruba,residential_area_Urban,pass_or_fail_Pass,Other Science Courses
0,18,73.72,1,1,51.625,53.875,66.0,53.5,False,False,True,True,False,True,False,34.013889
1,17,83.69,2,2,55.5,55.25,48.75,62.0,False,False,True,False,True,True,True,25.152778
2,16,93.88,2,2,57.75,65.375,53.5,58.625,True,False,True,False,True,False,True,33.041667
3,15,69.35,2,1,48.625,54.0,53.875,45.5,True,False,True,False,True,True,False,18.152778
4,17,81.67,1,2,57.125,57.75,59.5,58.75,True,False,True,False,False,False,True,25.694444


### 7. Creating Commercial Student Dataset

In [92]:
commercial_dataset = data_africa_encoded[['age', 'attendance_rate', 'Parental_Engagement_Score','attendance_category',"Mathematics_avg", "English_avg",
"Financial_Accounting_avg", "Economics_avg","Commerce_avg",'gender_Male','household_income_Low', 'household_income_Middle', 'home_language_Igbo',
'home_language_Yoruba', 'residential_area_Urban', 'pass_or_fail_Pass']]

commercial_incompulsory_dataset = data_africa_encoded [['Civic_Education_avg','Computer_Science_avg','Data_Processing_avg',
                                     'Marketing_avg','Geography_avg','Biology_avg','Agricultural_Science_avg']].mean(axis = 1)

commercial_dataset["Other Commercial Courses"] = commercial_incompulsory_dataset
commercial_dataset.head()

Unnamed: 0,age,attendance_rate,Parental_Engagement_Score,attendance_category,Mathematics_avg,English_avg,Financial_Accounting_avg,Economics_avg,Commerce_avg,gender_Male,household_income_Low,household_income_Middle,home_language_Igbo,home_language_Yoruba,residential_area_Urban,pass_or_fail_Pass,Other Commercial Courses
0,18,73.72,1,1,51.625,53.875,0.0,0.0,0.0,False,False,True,True,False,True,False,43.732143
1,17,83.69,2,2,55.5,55.25,0.0,51.375,0.0,False,False,True,False,True,True,True,16.428571
2,16,93.88,2,2,57.75,65.375,0.0,0.0,0.0,True,False,True,False,True,False,True,33.482143
3,15,69.35,2,1,48.625,54.0,0.0,0.0,0.0,True,False,True,False,True,True,False,23.339286
4,17,81.67,1,2,57.125,57.75,0.0,0.0,0.0,True,False,True,False,False,False,True,25.410714


### 8. Creating Art Dataset

In [93]:
Art_dataset = data_africa_encoded[['age', 'attendance_rate', 'Parental_Engagement_Score','attendance_category','Literature_in_English_avg',
'English_avg','Religion_Studies_avg','History_avg','Government_avg','gender_Male','household_income_Low', 'household_income_Middle', 
'home_language_Igbo','home_language_Yoruba', 'residential_area_Urban', 'pass_or_fail_Pass']]

Art_incompulsory_dataset = data_africa_encoded [['Civic_Education_avg','Computer_Science_avg','Data_Processing_avg',
                                     'Mathematics_avg','Geography_avg','Visual_Art_avg','Fine_Arts_avg','Yoruba_avg',]].mean(axis = 1)

Art_dataset["Other Commercial Courses"] = Art_incompulsory_dataset
Art_dataset.head()

Unnamed: 0,age,attendance_rate,Parental_Engagement_Score,attendance_category,Literature_in_English_avg,English_avg,Religion_Studies_avg,History_avg,Government_avg,gender_Male,household_income_Low,household_income_Middle,home_language_Igbo,home_language_Yoruba,residential_area_Urban,pass_or_fail_Pass,Other Commercial Courses
0,18,73.72,1,1,0.0,53.875,0.0,0.0,0.0,False,False,True,True,False,True,False,37.9375
1,17,83.69,2,2,0.0,55.25,0.0,0.0,0.0,False,False,True,False,True,True,True,21.515625
2,16,93.88,2,2,0.0,65.375,0.0,0.0,0.0,True,False,True,False,True,False,True,36.1875
3,15,69.35,2,1,0.0,54.0,0.0,0.0,0.0,True,False,True,False,True,True,False,25.75
4,17,81.67,1,2,0.0,57.75,0.0,0.0,0.0,True,False,True,False,False,False,True,30.328125


### 9. Splitting the Dataset for each Department

In [94]:
# Feature columns and target column for modelling

# for science dataset
X_science = science_dataset.drop('pass_or_fail_Pass', axis=1)  # Features
y_science = science_dataset['pass_or_fail_Pass']               # Target (Pass/Fail)
# for commercial dataset
X_commercial = commercial_dataset.drop('pass_or_fail_Pass', axis=1)  # Features
y_commercial = commercial_dataset['pass_or_fail_Pass']               # Target (Pass/Fail)
# for Art dataset
X_Art = Art_dataset.drop('pass_or_fail_Pass', axis=1)  # Features
y_Art = Art_dataset['pass_or_fail_Pass']               # Target (Pass/Fail)

### 10. Centering And Scaling

In [95]:
# creating the centering and scaling function
def centre_scaler(data):
    for column in data.columns:
        mu = np.mean(data[column])
        sigma = np.std(data[column])
        data[column] = (data[column] - mu) / sigma   

# centering and scaling science student dataset
centre_scaler(X_science)
# centering and scaling commercial student dataset
centre_scaler(X_commercial)
# centering and scaling Art student dataset
centre_scaler(X_Art)

### 11. Winsorization

In [96]:
from scipy.stats import mstats
def winsorize(data):
    for column in data.columns:
        data[column] = mstats.winsorize(data[column], limits=[0.05, 0.05])
    
# winsorizing science student dataset
winsorize(X_science)
# winsorizing commercial student dataset
winsorize(X_commercial)
# winsorizing Art student dataset
winsorize(X_Art)

### 12. Modelling and Cross Validation

In [97]:
#xgboost
xgb_params = {'n_estimators': 150,
              'random_state':0,
                 'max_depth': 3,
                 'learning_rate': 0.1,
                 'min_child_weight': 4,
                 'subsample': 0.7,
                 'colsample_bytree': 0.3,
             'verbose':0}
  
xgb_model = xgb.XGBClassifier(**xgb_params)

best_fold_xgb = None
best_classification_report_xgb = None
best_mean_roc_auc_xgb = 0.0

In [100]:
skf = StratifiedKFold(n_splits = 60, shuffle = True, random_state = 1500)

Dataset_list_X = [X_science, X_commercial, X_Art]
Dataset_list_y = [y_science, y_commercial, y_Art]

for Data in range(3):

    for fold,  (train_index, test_index) in enumerate(skf.split(Dataset_list_X[Data],Dataset_list_y[Data])):
        x_train, x_test = X_science.iloc[train_index], X_science.iloc[test_index]
        y_train, y_test = y_science.iloc[train_index], y_science.iloc[test_index]

        #training the model
        xgb_model.fit(x_train, y_train)

        # Make predictions on the test set
        y_pred = xgb_model.predict(x_test)

        #metrics for model performance
        model_rep = classification_report(y_test, y_pred)
        roc_score = roc_auc_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)

        if roc_score > best_mean_roc_auc_xgb:
            best_mean_roc_auc_xgb = roc_score
            best_fold_xgb = fold + 1
            best_classification_report_xgb = model_rep

    print(f"Accuracy Score {accuracy:.4f}\n")
    print(f"Best Fold {best_fold_xgb}\n")
    print(f"{best_classification_report_xgb}")
    print(f"ROC AUC : {best_mean_roc_auc_xgb:.4f}")
    print("===================================================================")

Accuracy Score 1.0000

Best Fold 1

              precision    recall  f1-score   support

       False       1.00      1.00      1.00         6
        True       1.00      1.00      1.00         3

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9

ROC AUC : 1.0000
Accuracy Score 1.0000

Best Fold 1

              precision    recall  f1-score   support

       False       1.00      1.00      1.00         6
        True       1.00      1.00      1.00         3

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9

ROC AUC : 1.0000
Accuracy Score 1.0000

Best Fold 1

              precision    recall  f1-score   support

       False       1.00      1.00      1.00         6
        True       1.00      1.00      1.00         3

    accuracy                           1