## **Preprocessing**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
# Preprocessing: collapse the raw recordings into one feature row per 'Source'
# (the code below treats each Source as a student), then hold out a fixed
# number of students for the final test.
df = pd.read_csv("combined_dataset.csv")

# Every signal gets the same four summary statistics; the target
# ('Performance') is reduced to its mean only.
summary_stats = ['mean', 'std', 'min', 'max']
signal_columns = [
    'CognitiveLoad',
    'Attention',
    'Boredom',
    'Frustration',
    'Confusion',
    'Stress',
    'Engagement',
]
aggregation_spec = {signal: list(summary_stats) for signal in signal_columns}
aggregation_spec['Performance'] = ['mean']

aggregated_data = df.groupby('Source').agg(aggregation_spec).reset_index()

# Flatten the two-level column index to "<column>_<stat>". The grouping key has
# an empty second level, so it ends up named 'Source_' (strip() only removes
# whitespace, not the trailing underscore); the rest of the notebook relies on
# that exact name.
aggregated_data.columns = [
    f"{column}_{stat}".strip() for column, stat in aggregated_data.columns
]

students = aggregated_data['Source_'].unique()

# Hold out exactly 20 students for testing (the original note says the dataset
# has 80 students — not verifiable from this cell). The seed keeps the split
# stable across runs.
train_students, test_students = train_test_split(
    students,
    test_size=20,
    random_state=42,
)

in_train = aggregated_data['Source_'].isin(train_students)
in_test = aggregated_data['Source_'].isin(test_students)
train_data = aggregated_data[in_train]
test_data = aggregated_data[in_test]

# Separate the target from the features.
y_train = train_data['Performance_mean']
X_train = train_data.drop(columns=['Performance_mean'])
y_test = test_data['Performance_mean']
X_test = test_data.drop(columns=['Performance_mean'])

# The student identifier is kept in X_train / X_test (it is used later as the
# CV group label) but must not be fed to the models as a feature.
X_train_no_source = X_train.drop(columns=["Source_"])
X_test_no_source = X_test.drop(columns=["Source_"])

### **Inner evaluation**

In [None]:
# Inner evaluation: tune every candidate regressor with a grouped 5-fold
# grid search on the training students and compare their cross-validated RMSE.

#All models to test
# Estimators whose training involves randomness are given a fixed seed so that
# re-running the notebook reproduces the same scores and the same selected
# hyper-parameters. 42 matches the seed used for the train/test split.
models = [
    KNeighborsRegressor(),
    SVR(),
    RandomForestRegressor(random_state=42),
    LinearRegression(),
    Ridge(random_state=42),  # only relevant for the stochastic 'saga' solver
    Lasso(),
    GradientBoostingRegressor(random_state=42),
    MLPRegressor(random_state=42),
    XGBRegressor(random_state=42)
]
#Parameter optimization for each model
# Keyed by estimator class name; an empty dict means "fit with defaults only".
param_grid = {
    'KNeighborsRegressor': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    },
    'SVR': {
        'kernel': ['linear', 'poly', 'rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto']
    },
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'LinearRegression': {
    },
    'Ridge': {
        'alpha': [0.1, 1, 10, 100],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'saga'],
    },
    'Lasso': {
        'alpha': [0.1, 1, 10, 100],
        'max_iter': [1000, 5000, 10000],
    },
    'GradientBoostingRegressor': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
    },
    'MLPRegressor': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh', 'logistic'],
        'solver': ['adam', 'lbfgs'],
        'max_iter': [1000, 2000],
    },
    'XGBRegressor': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 0.1, 0.2]
    }
}
#For the cv part of the grid search
# Folds are formed on the student identifier, so no student contributes rows
# to both the fitting and the validation part of a fold.
cv = GroupKFold(n_splits=5)
results = []     # one {'Model', 'Best RMSE'} record per estimator
parameters = []  # one {'Model', 'Best Params'} record per estimator
#Optimization for each model
for model in models:
    model_name = model.__class__.__name__
    param_grid_for_model = param_grid.get(model_name, {})
    grid_search = GridSearchCV(model, param_grid_for_model, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_no_source, y_train, groups=X_train["Source_"])
    # best_score_ is the mean negated MSE over the folds, so this is the square
    # root of the mean CV MSE (not the mean of the per-fold RMSEs).
    best_rmse = sqrt(-grid_search.best_score_)
    best_params = grid_search.best_params_

    print(model_name)
    print(best_params)
    print("-" * 40)

    #Storage of the results
    results.append({
        'Model': model_name,
        'Best RMSE': best_rmse
    })
    # Keep the tuned hyper-parameters as data as well, not only in the printed
    # log, so later cells can read them instead of re-typing them.
    parameters.append({
        'Model': model_name,
        'Best Params': best_params
    })

results_df = pd.DataFrame(results)
print(results_df)

KNeighborsRegressor
{'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'uniform'}
----------------------------------------
SVR
{'C': 100, 'gamma': 'scale', 'kernel': 'poly'}
----------------------------------------
RandomForestRegressor
{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
----------------------------------------
LinearRegression
{}
----------------------------------------
Ridge
{'alpha': 0.1, 'solver': 'lsqr'}
----------------------------------------
Lasso
{'alpha': 0.1, 'max_iter': 1000}
----------------------------------------
GradientBoostingRegressor
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
----------------------------------------
MLPRegressor
{'activation': 'tanh', 'hidden_layer_sizes': (50, 50), 'max_iter': 2000, 'solver': 'lbfgs'}
----------------------------------------
XGBRegressor
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
------------------

## **Outer evaluation**

In [None]:
# Outer evaluation of the MLP: fit on the training students with the
# hyper-parameters reported by the grid search and score on the held-out
# students.
mlp_model = MLPRegressor(
    activation='tanh',
    hidden_layer_sizes=(50, 50),
    max_iter=2000,
    solver='lbfgs',
    # Weight initialisation is random; without a seed the reported test RMSE
    # changes on every run. 42 matches the seed used for the train/test split.
    random_state=42,
)
mlp_model.fit(X_train_no_source, y_train)
# X_test_no_source is already built in the preprocessing cell, so it is reused
# here rather than recomputed.
y_pred = mlp_model.predict(X_test_no_source)

test_rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE MLP: {test_rmse}")

Test RMSE MLP: 0.03250053787269609


In [None]:
# Outer evaluation of k-nearest-neighbours: same protocol as for the MLP, using
# the hyper-parameters reported by the grid search (3 neighbours, uniform
# weighting, automatic choice of the neighbour-search algorithm).
# NOTE(review): the recorded run of this cell printed an RMSE of exactly 0.0,
# which is unusual for held-out data — worth checking for duplicated feature
# rows between train and test students or a (near-)constant target.
knn_settings = dict(n_neighbors=3, weights='uniform', algorithm='auto')
knn_model = KNeighborsRegressor(**knn_settings).fit(X_train_no_source, y_train)

y_pred = knn_model.predict(X_test_no_source)

knn_test_mse = mean_squared_error(y_test, y_pred)
test_rmse = sqrt(knn_test_mse)
print(f"Test RMSE KNN: {test_rmse}")


Test RMSE KNN: 0.0


## **Final model**

In [None]:
import joblib

# Final model: refit the selected MLP configuration on every available student
# (train + test) and persist it to disk.
mlp_model = MLPRegressor(
    activation='tanh',
    hidden_layer_sizes=(50, 50),
    max_iter=2000,
    solver='lbfgs',
    # Fixed seed so the saved artefact can be regenerated identically;
    # 42 matches the seed used for the train/test split.
    random_state=42,
)
X = pd.concat([X_train_no_source, X_test_no_source])
Y = pd.concat([y_train, y_test])
mlp_model.fit(X, Y)

model_filename = "mlp_model.joblib"
joblib.dump(mlp_model, model_filename)
print(f"Model saved as {model_filename}")

# Round-trip check: the model read back from disk must predict exactly like the
# in-memory one, otherwise the saved file is not usable.
loaded_model = joblib.load(model_filename)
assert np.allclose(loaded_model.predict(X), mlp_model.predict(X)), \
    "reloaded model does not reproduce the in-memory predictions"

Model saved as mlp_model.joblib
