In [47]:
import pandas as pd
import joblib
import mariadb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

Database connection

In [48]:
conn = mariadb.connect(
    user="admin",
    password="admin",
    host="localhost",
    port=3306,
    database="grade_prediction"
)

Load Data

In [49]:
query = "SELECT * FROM students_training;"
data = pd.read_sql(query, conn)
data = data.drop(columns=['id'])
print(data.head())

  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  internet romantic  famrel  freetime  goout Dalc Walc health absences  G3  
0       no       no       4         3      4    1    1      3        6   6  
1      yes       no       5         3      3    1    1      3        4   6  
2      yes       no       4         3      2    2    3      3       10  10  
3      yes      yes       3         2      2    1    1      5        2  15  
4       no       no       4         3      2    1    2      5        4  10  

[5 rows x 31 columns]


  data = pd.read_sql(query, conn)


Seperate data into x and y

In [50]:
data = pd.get_dummies(data, drop_first=True)

X = data.drop(columns=['G3'])
y = data['G3']

print(X.columns)

Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'school_MS',
       'sex_M', 'address_U', 'famsize_LE3', 'Pstatus_T', 'Mjob_health',
       'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_health',
       'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_home',
       'reason_other', 'reason_reputation', 'guardian_mother',
       'guardian_other', 'schoolsup_yes', 'famsup_yes', 'paid_yes',
       'activities_yes', 'nursery_yes', 'higher_yes', 'internet_yes',
       'romantic_yes'],
      dtype='object')


Train model

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

Test model

In [52]:
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")

print("\nSample Predictions:")
for true, pred in list(zip(y_test, y_pred))[:10]:
    print(f"Actual: {true:.1f}  |  Predicted: {pred:.1f}")

Mean Squared Error: 13.76
R² Score: 0.11

Sample Predictions:
Actual: 11.0  |  Predicted: 11.1
Actual: 8.0  |  Predicted: 10.2
Actual: 13.0  |  Predicted: 9.6
Actual: 11.0  |  Predicted: 12.6
Actual: 12.0  |  Predicted: 13.1
Actual: 11.0  |  Predicted: 11.5
Actual: 0.0  |  Predicted: 2.4
Actual: 11.0  |  Predicted: 12.6
Actual: 15.0  |  Predicted: 11.7
Actual: 13.0  |  Predicted: 14.1


Save the machine learning model in its own file

In [53]:
joblib.dump(model, 'model.pkl')

#loaded_model = joblib.load('model.pkl')

['model.pkl']