In [36]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

df = pd.read_csv('../artifacts/engineered_data.csv')
y = df['GradeClass']
features = df.drop(columns=['GradeClass'])
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42, stratify=y)
print(y.value_counts())


GradeClass
4.0    1211
3.0     414
2.0     391
1.0     269
0.0     107
Name: count, dtype: int64


In [37]:
#Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression(max_iter=20000, random_state=42)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("Logistic Regression:\n", classification_report(y_test, y_pred_lr))


Logistic Regression:
               precision    recall  f1-score   support

         0.0       0.33      0.19      0.24        21
         1.0       0.51      0.37      0.43        54
         2.0       0.50      0.65      0.57        78
         3.0       0.55      0.45      0.49        83
         4.0       0.89      0.95      0.92       243

    accuracy                           0.71       479
   macro avg       0.56      0.52      0.53       479
weighted avg       0.70      0.71      0.70       479



In [38]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest:\n", classification_report(y_test, y_pred_rf))


Random Forest:
               precision    recall  f1-score   support

         0.0       0.40      0.19      0.26        21
         1.0       0.58      0.41      0.48        54
         2.0       0.47      0.56      0.51        78
         3.0       0.43      0.43      0.43        83
         4.0       0.88      0.93      0.90       243

    accuracy                           0.69       479
   macro avg       0.55      0.50      0.52       479
weighted avg       0.68      0.69      0.68       479



In [39]:
#XGBoost
from xgboost import XGBClassifier

xgb = XGBClassifier(eval_metric='mlogloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost:\n", classification_report(y_test, y_pred_xgb))


XGBoost:
               precision    recall  f1-score   support

         0.0       0.56      0.24      0.33        21
         1.0       0.54      0.56      0.55        54
         2.0       0.44      0.47      0.45        78
         3.0       0.40      0.43      0.42        83
         4.0       0.91      0.89      0.90       243

    accuracy                           0.68       479
   macro avg       0.57      0.52      0.53       479
weighted avg       0.69      0.68      0.68       479



In [None]:
#Step 10: Deep Learning Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
y_train_dl = to_categorical(y_train)
y_test_dl = to_categorical(y_test)
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train_dl, epochs=50, batch_size=32, validation_data=(X_test, y_test_dl))

loss, accuracy = model.evaluate(X_test, y_test_dl)
print("Test Accuracy:", accuracy)

y_pred_dl = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_dl, axis=1)

print(classification_report(y_test, y_pred_classes))
print(confusion_matrix(y_test, y_pred_classes))


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5134 - loss: 1.3092 - val_accuracy: 0.6033 - val_loss: 1.0282
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6211 - loss: 0.9844 - val_accuracy: 0.6472 - val_loss: 0.9087
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6739 - loss: 0.9141 - val_accuracy: 0.6743 - val_loss: 0.8553
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7101 - loss: 0.8177 - val_accuracy: 0.6994 - val_loss: 0.8394
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7110 - loss: 0.8328 - val_accuracy: 0.7161 - val_loss: 0.8099
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7302 - loss: 0.8225 - val_accuracy: 0.7161 - val_loss: 0.7979
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━

In [41]:
import pickle
import keras

# Logistic Regression
with open("../src/logistic_regression_model.pkl", "wb") as file:
    pickle.dump(lr, file)

# Random Forest
with open("../src/random_forest_model.pkl", "wb") as file:
    pickle.dump(rf, file)

# XGBoost
with open("../src/xgboost_model.pkl", "wb") as file:
    pickle.dump(xgb, file)

# Save the deep model to a file
keras.models.save_model(model, "../src/deep_learning_model.h5")


