## Random Forest Regressor

In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
import pickle

#### Lectura de train y test

In [92]:
# lo voy a traer directamente del dataset procesado, para dividirlo en X train y test
# e y train y test

df = pd.read_csv(r'C:\Users\plaza\Desktop\Documentos_Clase\ONLINE_DS_THEBRIDGE_Alejandro_Plaza\Proyecto_ML\data\processed\estudiantes.csv')
df.head()

Unnamed: 0,Hours_Studied,Attendance,Previous_Scores,Tutoring_Sessions,Exam_Score,PI_High,PI_Low,AtR_High,AtR_Low
0,23,84,73,0,67,0,1,1,0
1,19,64,59,2,61,0,1,0,0
2,24,98,91,2,74,0,0,0,0
3,29,89,98,1,71,0,1,0,0
4,19,92,65,3,70,0,0,0,0


In [93]:
df[df['Exam_Score'] < 50]#['Exam_Score'].value_counts().sort_values(ascending = False)

Unnamed: 0,Hours_Studied,Attendance,Previous_Scores,Tutoring_Sessions,Exam_Score,PI_High,PI_Low,AtR_High,AtR_Low
6607,9,39,35,1,21,1,0,1,0
6608,12,36,53,5,32,0,0,0,1
6609,5,55,32,1,22,0,1,0,1
6610,12,60,35,0,26,0,0,0,0
6611,13,59,40,0,24,0,0,1,0
...,...,...,...,...,...,...,...,...,...
9602,0,51,18,2,18,0,0,1,0
9603,10,47,34,1,22,0,1,1,0
9604,4,51,48,1,18,0,1,0,0
9605,6,54,22,4,21,1,0,0,1


#### Dividimos en train y test

In [94]:
X = df.drop(columns = 'Exam_Score')
y = df['Exam_Score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 30)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(9285, 8)
(2322, 8)
(9285,)
(2322,)


In [95]:
param = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

#### Entrenamos al modelo

In [96]:
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                    param_grid = param,
                    cv = 5,
                    scoring = 'r2',  
                    verbose = 1,
                    n_jobs = -1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [97]:
print("Best parameters:", grid_search.best_params_)
print("Best (negative) MAE:", grid_search.best_score_)
print("Best MAE:", -grid_search.best_score_)

Best parameters: {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best (negative) MAE: 0.9425459546451103
Best MAE: -0.9425459546451103


#### Hacemos predicciones con los mejores parámetros y vemos los resultados

In [98]:
h_rf = grid_search.best_estimator_
h_rf.fit(X_train, y_train)

y_pred = h_rf.predict(X_test)

print("MAE", mean_absolute_error(y_test, y_pred))
print("MSE", mean_squared_error(y_test, y_pred))
print("RMSE", mean_squared_error(y_test, y_pred) ** (1/2))
print("MAPE", mean_absolute_percentage_error(y_test, y_pred))
print("r2_score", r2_score(y_test, y_pred))

MAE 3.4110350201936743
MSE 27.924855858252656
RMSE 5.284397397835694
MAPE 389819503313481.1
r2_score 0.9446852706205352


Tras comprobar qué variables ha usado más, nos damos cuenta, como era de esperar, que han sido horas estudiadas y el porcentaje de atención en clase

In [99]:
feature_names = ['Hours_Studied', 'Attendance', 'Previous_Scores', 'Tutoring_Sessions',
       'PI_High', 'PI_Low', 'AtR_High', 'AtR_Low']
importances = h_rf.feature_importances_

In [100]:
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance:.4f}")

Hours_Studied: 0.0293
Attendance: 0.8021
Previous_Scores: 0.1635
Tutoring_Sessions: 0.0042
PI_High: 0.0002
PI_Low: 0.0003
AtR_High: 0.0003
AtR_Low: 0.0002


In [101]:
h_rf.predict([[0, 0, 73, 3, 1, 0, 0, 0]])



array([26.40901949])

In [102]:
df[df['Exam_Score'] == 100]

Unnamed: 0,Hours_Studied,Attendance,Previous_Scores,Tutoring_Sessions,Exam_Score,PI_High,PI_Low,AtR_High,AtR_Low
94,18,89,73,3,100,1,0,0,0


#### Guardamos el modelo con pickle

In [103]:
with open(r"C:\Users\plaza\Desktop\Documentos_Clase\ONLINE_DS_THEBRIDGE_Alejandro_Plaza\Proyecto_ML\models\modelo_final.pkl", "wb") as f:
    pickle.dump(h_rf, f)