In [9]:
# Feature Engineering and Data Preprocessing
# This script performs feature engineering and data preprocessing on a dataset of student performance.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv('../artifacts/Student_performance_data.csv')

df = pd.get_dummies(df, columns=['Ethnicity', 'ParentalEducation', 'ParentalSupport'])

scaler = StandardScaler()
num_features = ['Age', 'StudyTimeWeekly', 'Absences', 'GPA']
features = df.drop(columns=["StudentID", "GradeClass"], axis=1)

features[num_features] = scaler.fit_transform(features[num_features])
print("Total number of input features:", len(features))
print("The model input features:\n")

print(list(features))

import pickle

# Save the scaler
with open("../src/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Save the final list of input features
with open("../src/features.pkl", "wb") as f:
    pickle.dump(features.columns.tolist(), f)


Total number of input features: 2392
The model input features:

['Age', 'Gender', 'StudyTimeWeekly', 'Absences', 'Tutoring', 'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA', 'Ethnicity_0', 'Ethnicity_1', 'Ethnicity_2', 'Ethnicity_3', 'ParentalEducation_0', 'ParentalEducation_1', 'ParentalEducation_2', 'ParentalEducation_3', 'ParentalEducation_4', 'ParentalSupport_0', 'ParentalSupport_1', 'ParentalSupport_2', 'ParentalSupport_3', 'ParentalSupport_4']


In [10]:
print(df.columns)
print(df.shape)



Index(['StudentID', 'Age', 'Gender', 'StudyTimeWeekly', 'Absences', 'Tutoring',
       'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA',
       'GradeClass', 'Ethnicity_0', 'Ethnicity_1', 'Ethnicity_2',
       'Ethnicity_3', 'ParentalEducation_0', 'ParentalEducation_1',
       'ParentalEducation_2', 'ParentalEducation_3', 'ParentalEducation_4',
       'ParentalSupport_0', 'ParentalSupport_1', 'ParentalSupport_2',
       'ParentalSupport_3', 'ParentalSupport_4'],
      dtype='object')
(2392, 26)


In [11]:
print(df.shape)
df.info()

(2392, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   StudentID            2392 non-null   int64  
 1   Age                  2392 non-null   int64  
 2   Gender               2392 non-null   int64  
 3   StudyTimeWeekly      2392 non-null   float64
 4   Absences             2392 non-null   int64  
 5   Tutoring             2392 non-null   int64  
 6   Extracurricular      2392 non-null   int64  
 7   Sports               2392 non-null   int64  
 8   Music                2392 non-null   int64  
 9   Volunteering         2392 non-null   int64  
 10  GPA                  2392 non-null   float64
 11  GradeClass           2392 non-null   float64
 12  Ethnicity_0          2392 non-null   bool   
 13  Ethnicity_1          2392 non-null   bool   
 14  Ethnicity_2          2392 non-null   bool   
 15  Ethnicity_3          2392 n

In [12]:
from sklearn.model_selection import train_test_split
y = df['GradeClass']
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
#Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression(max_iter=20000, random_state=42)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("Logistic Regression:\n", classification_report(y_test, y_pred_lr))


Logistic Regression:
               precision    recall  f1-score   support

         0.0       0.33      0.24      0.28        21
         1.0       0.55      0.39      0.46        54
         2.0       0.54      0.71      0.61        78
         3.0       0.75      0.57      0.64        83
         4.0       0.90      0.98      0.94       243

    accuracy                           0.76       479
   macro avg       0.62      0.57      0.59       479
weighted avg       0.75      0.76      0.75       479



In [14]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest:\n", classification_report(y_test, y_pred_rf))


Random Forest:
               precision    recall  f1-score   support

         0.0       1.00      0.29      0.44        21
         1.0       0.82      0.85      0.84        54
         2.0       0.89      0.92      0.91        78
         3.0       0.87      0.89      0.88        83
         4.0       0.94      0.97      0.96       243

    accuracy                           0.91       479
   macro avg       0.90      0.78      0.80       479
weighted avg       0.91      0.91      0.90       479



In [15]:
#XGBoost
from xgboost import XGBClassifier

xgb = XGBClassifier(eval_metric='mlogloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost:\n", classification_report(y_test, y_pred_xgb))


XGBoost:
               precision    recall  f1-score   support

         0.0       1.00      0.43      0.60        21
         1.0       0.89      0.91      0.90        54
         2.0       0.93      0.95      0.94        78
         3.0       0.88      0.86      0.87        83
         4.0       0.93      0.97      0.95       243

    accuracy                           0.92       479
   macro avg       0.92      0.82      0.85       479
weighted avg       0.92      0.92      0.91       479



In [16]:
#Step 10: Deep Learning Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical


In [17]:
y_train_dl = to_categorical(y_train)
y_test_dl = to_categorical(y_test)
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train_dl, epochs=50, batch_size=32, validation_data=(X_test, y_test_dl))

loss, accuracy = model.evaluate(X_test, y_test_dl)
print("Test Accuracy:", accuracy)

y_pred_dl = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_dl, axis=1)


print(classification_report(y_test, y_pred_classes))
print(confusion_matrix(y_test, y_pred_classes))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.4667 - loss: 1.3633 - val_accuracy: 0.5971 - val_loss: 0.9938
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6285 - loss: 0.9256 - val_accuracy: 0.7411 - val_loss: 0.8365
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7104 - loss: 0.8313 - val_accuracy: 0.7641 - val_loss: 0.7710
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7695 - loss: 0.7884 - val_accuracy: 0.8184 - val_loss: 0.7297
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8092 - loss: 0.7066 - val_accuracy: 0.8205 - val_loss: 0.7098
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8133 - loss: 0.7047 - val_accuracy: 0.8267 - val_loss: 0.6903
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━

In [18]:
import pickle
import keras

# Logistic Regression
with open("../src/logistic_regression_model.pkl", "wb") as file:
    pickle.dump(lr, file)

# Random Forest
with open("../src/random_forest_model.pkl", "wb") as file:
    pickle.dump(rf, file)

# XGBoost
with open("../src/xgboost_model.pkl", "wb") as file:
    pickle.dump(xgb, file)

# Save the deep model to a file
keras.models.save_model(model, "../src/deep_learning_model.h5")


