## **Inner evaluation (default hyperparameters) and preprocessing**

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from math import sqrt
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
# Load the combined dataset and discard the 'Performance' column so it is
# never offered to the models as a feature.
df = pd.read_csv("combined_dataset.csv").drop(columns=['Performance'])

# One-hot encode the categorical 'Source' column; drop_first=True omits the
# column for the first category.
df_one_hot = pd.get_dummies(df, columns=['Source'], drop_first=True)

# 'Attention' is the regression target; every remaining column is a feature.
y = df_one_hot['Attention']
X = df_one_hot.drop(columns=['Attention'])

# Ordered (unshuffled) hold-out split: the first 75% of the rows are used for
# training and the remaining 25% for testing.
split_index = int(len(X) * 0.75)
X_train, y_train = X.iloc[:split_index], y.iloc[:split_index]
X_test, y_test = X.iloc[split_index:], y.iloc[split_index:]

# Fixed seed so the stochastic estimators (random forest, gradient boosting,
# MLP, XGBoost) produce the same cross-validation scores on every run.
RANDOM_STATE = 42

# Candidate regressors. Apart from the seed and the MLP iteration budget,
# every model uses its library-default hyperparameters.
models = [
    KNeighborsRegressor(),
    RandomForestRegressor(random_state=RANDOM_STATE),
    LinearRegression(),
    Ridge(),
    Lasso(),
    GradientBoostingRegressor(random_state=RANDOM_STATE),
    MLPRegressor(max_iter=1000, random_state=RANDOM_STATE),
    XGBRegressor(random_state=RANDOM_STATE)
]

# Mean cross-validated RMSE per model, in the same order as `models`.
scores = []

# Scorer that returns the plain (positive) RMSE. It is only used for
# reporting here, so the "greater is better" sign convention does not matter.
rmse = make_scorer(lambda y_true, y_pred: sqrt(mean_squared_error(y_true, y_pred)))

# Each fold trains on an initial segment of the rows and validates on the
# rows that follow it, so validation data always comes after training data.
# NOTE(review): this assumes the rows are in chronological order -- confirm.
tscv = TimeSeriesSplit(n_splits=5)

for model in models:
    model_name = model.__class__.__name__
    # Only the training portion is used; the test rows stay untouched.
    cv_rmse_scores = cross_val_score(model, X_train, y_train, cv=tscv, scoring=rmse, n_jobs=-1)

    mean_rmse = np.mean(cv_rmse_scores)
    std_rmse = np.std(cv_rmse_scores)
    scores.append(mean_rmse)
    print(f"Cross-Validated RMSE for model {model_name}: {mean_rmse:.4f} ± {std_rmse:.4f}")
    print("-" * 40)

Cross-Validated RMSE for model KNeighborsRegressor: 0.2533 ± 0.0542
----------------------------------------
Cross-Validated RMSE for model RandomForestRegressor: 0.2384 ± 0.0561
----------------------------------------
Cross-Validated RMSE for model LinearRegression: 0.2597 ± 0.0267
----------------------------------------
Cross-Validated RMSE for model Ridge: 0.2602 ± 0.0268
----------------------------------------
Cross-Validated RMSE for model Lasso: 0.2937 ± 0.0265
----------------------------------------
Cross-Validated RMSE for model GradientBoostingRegressor: 0.2336 ± 0.0477
----------------------------------------
Cross-Validated RMSE for model MLPRegressor: 0.2942 ± 0.0375
----------------------------------------
Cross-Validated RMSE for model XGBRegressor: 0.2580 ± 0.0695
----------------------------------------


In [None]:
# Fixed seed so the stochastic estimators give the same tuning results on
# every run.
RANDOM_STATE = 42

# (display name, estimator, hyperparameter grid) for every candidate model.
models = [
    ('KNeighborsRegressor', KNeighborsRegressor(), {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    }),
    ('RandomForestRegressor', RandomForestRegressor(random_state=RANDOM_STATE), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }),
    # `normalize` is intentionally not searched: it was removed from
    # LinearRegression in scikit-learn 1.2, and listing it makes GridSearchCV
    # fail with an invalid-parameter error.
    ('LinearRegression', LinearRegression(), {
        'fit_intercept': [True, False]
    }),
    ('Ridge', Ridge(), {
        'alpha': [0.1, 1.0, 10.0],
        'fit_intercept': [True, False]
    }),
    ('Lasso', Lasso(), {
        'alpha': [0.1, 0.5, 1.0],
        'fit_intercept': [True, False]
    }),
    ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=RANDOM_STATE), {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }),
    ('MLPRegressor', MLPRegressor(max_iter=1000, random_state=RANDOM_STATE), {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01]
    }),
    # `use_label_encoder` is not passed: it was a classifier-only option that
    # newer XGBoost releases no longer use, so it only produced warnings here.
    ('XGBRegressor', XGBRegressor(eval_metric="rmse", random_state=RANDOM_STATE), {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    })
]

# Each fold trains on an initial segment of the rows and validates on the
# rows that follow it.
# NOTE(review): this assumes the rows are in chronological order -- confirm.
tscv = TimeSeriesSplit(n_splits=5)

for model_name, model, param_grid in models:
    print(f"Optimizing {model_name}...")

    # Exhaustive search over the grid. Scores are negated MSE, so the
    # configuration with the highest score has the lowest error.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)

    grid_search.fit(X_train, y_train)

    # With the default refit=True this is the best configuration refitted on
    # the whole training set.
    best_model = grid_search.best_estimator_

    # Square root of the mean cross-validated MSE of the best configuration.
    # Compare models on this number; the test RMSE below is only an estimate
    # of generalisation error and should not drive model selection.
    cv_rmse = sqrt(-grid_search.best_score_)

    y_pred = best_model.predict(X_test)

    test_rmse = sqrt(mean_squared_error(y_test, y_pred))

    print(f"Best params for {model_name}: {grid_search.best_params_}")
    print(f"Best CV RMSE for {model_name}: {cv_rmse}")
    print(f"Test RMSE for {model_name}: {test_rmse}")
    print("-" * 40)

Optimizing KNeighborsRegressor...


KeyboardInterrupt: 

## **Outer evaluation**

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from math import sqrt

# Ordinary least squares with an intercept, fitted on the training rows.
# `fit` returns the estimator itself, so construction and fitting are chained.
linear = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# Score the fitted model on the held-out test rows.
y_pred = linear.predict(X_test)
linear_rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"Test RMSE: {linear_rmse}")

Test RMSE: 0.1278345194192675


In [None]:
# Ridge regression (alpha=0.1, with intercept), fitted on the training rows.
# `fit` returns the estimator itself, so construction and fitting are chained.
ridge = Ridge(alpha=0.1, fit_intercept=True).fit(X_train, y_train)

# Score the fitted model on the held-out test rows.
y_pred = ridge.predict(X_test)
ridge_rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"Test RMSE: {ridge_rmse}")

Test RMSE: 0.12783465815612816


## **Final model**

In [None]:
import joblib
# File the final model is written to and read back from.
model_filename = "linear_model.joblib"

# Stitch the train and test partitions back together and refit the linear
# model on every available row.
X = pd.concat([X_train, X_test])
Y = pd.concat([y_train, y_test])
linear.fit(X, Y)

# Persist the refitted model to disk.
joblib.dump(linear, model_filename)
print(f"Model saved as {model_filename}")

# Read the file back into a separate object.
loaded_model = joblib.load(model_filename)

Model saved as linear_model.joblib
