## Imports

In [31]:
import pandas as pd
import numpy as np
import altair as alt
from ast import literal_eval

from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC

from sklearn.metrics import make_scorer, mean_squared_error, r2_score

from sklearn.base import TransformerMixin


In [43]:
alt.renderers.enable('mimetype')
alt.data_transformers.enable('data_server')

DataTransformerRegistry.enable('data_server')

In [2]:
# Solution adapted from StackOverFlow solution: https://tinyurl.com/59hmeesh
class MyMultiLabelBinarizer(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = MultiLabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        return self.encoder.transform(x)

## Reading and Splitting Model Data

In [3]:
train_df = pd.read_csv("../data/processed/training_split.csv")
test_df = pd.read_csv("../data/processed/testing_split.csv")

categorical_list_features = ["boardgamecategory", "boardgamemechanic", "boardgamefamily", "boardgamedesigner", "boardgameartist", "boardgamepublisher"]
# for feat in categorical_list_features:
#     train_df[feat] = train_df[feat].apply(literal_eval)
#     test_df[feat] = test_df[feat].apply(literal_eval)
#     train_df[feat] = train_df[feat].apply(lambda x: set(x))
#     test_df[feat] = test_df[feat].apply(lambda x: set(x))

X_train, y_train = train_df.drop(columns="average"), train_df["average"]
X_test, y_test = test_df.drop(columns="average"), test_df["average"]

## Setting Up the Column Transformers

In [4]:
numerical_features = ["yearpublished", "minplayers", "maxplayers", "playingtime", "minplaytime", "maxplaytime", "minage"]
text_feature = "description"
categorical_features = ["boardgamecategory", "boardgamemechanic", "boardgamefamily", "boardgamedesigner", "boardgameartist", "boardgamepublisher"]

preprocessor = make_column_transformer(
    (StandardScaler(), numerical_features),
    (CountVectorizer(stop_words="english", max_features=1000), text_feature),
    (OneHotEncoder(handle_unknown="ignore"), categorical_features)
)

## Define Scoring Methods and Results

In [5]:
scoring_dict = {
    "r2": "r2",
    "MAPE": "neg_mean_absolute_percentage_error",
    "neg_rmse": "neg_root_mean_squared_error",
    "neg_mse": "neg_mean_squared_error",
}


## Trying Dummy Regressor Baseline

In [6]:
cross_val_results = {}
dummy_regressor = DummyRegressor()
cross_val_results['dummy_regressor'] = pd.DataFrame(cross_validate(dummy_regressor, X_train, y_train, return_train_score = True, scoring=scoring_dict)).agg(['mean', 'std']).round(3).T
cross_val_results['dummy_regressor']

Unnamed: 0,mean,std
fit_time,0.002,0.001
score_time,0.001,0.001
test_r2,-0.001,0.001
train_r2,0.0,0.0
test_MAPE,-0.102,0.003
train_MAPE,-0.102,0.001
test_neg_rmse,-0.826,0.019
train_neg_rmse,-0.827,0.005
test_neg_mse,-0.683,0.032
train_neg_mse,-0.683,0.008


## Ridge Model Optimization

In [7]:
pipe_ridge = make_pipeline(
    preprocessor,
    Ridge()
)

param_dist_ridge = {
    "ridge__alpha": 10.0 ** np.arange(-6, 6, 1)
}

ridge_search = RandomizedSearchCV(
    pipe_ridge, param_dist_ridge, n_iter=12, n_jobs=-1, return_train_score=True
)

ridge_search.fit(X_train, y_train)
ridge_search.best_score_

0.26555599794366336

## Defining New Optimized Ridge Pipe

In [8]:
pipe_ridge_opt = make_pipeline(
    preprocessor,
    Ridge(alpha=ridge_search.best_params_["ridge__alpha"])
)

## Ridge Model Cross-Validation

In [9]:
cross_val_results["ridge"] = pd.DataFrame(cross_validate(pipe_ridge_opt, X_train, y_train, return_train_score=True, scoring=scoring_dict)).agg(['mean', 'std']).round(3).T
cross_val_results["ridge"]

Unnamed: 0,mean,std
fit_time,0.87,0.009
score_time,0.213,0.005
test_r2,0.266,0.027
train_r2,0.512,0.005
test_MAPE,-0.086,0.003
train_MAPE,-0.07,0.001
test_neg_rmse,-0.708,0.021
train_neg_rmse,-0.577,0.005
test_neg_mse,-0.501,0.03
train_neg_mse,-0.333,0.005


## Testing SVC Model

In [10]:
# pipe_svc = make_pipeline(
#     preprocessor,
#     SVC()
# )

# param_dist_svc = {
#     "svc__gamma": np.round(np.logspace(1, 2, 6), 1)
# }

# svc_search = RandomizedSearchCV(
#     pipe_svc, param_dist_ridge, n_iter=10, n_jobs=-1, return_train_score=True
# )

# svc_search.fit(X_train.iloc[0:1000], y_train.iloc[0:1000])

## Testing Random Forest Regressor Model

In [11]:
pipe_rfr = make_pipeline(
    preprocessor,
    RandomForestRegressor(n_jobs=-1)
)

param_dist_rfr = {
    "randomforestregressor__max_depth": np.arange(20, 100, 2),
    "randomforestregressor__max_features": ['auto', 'sqrt'],
    "randomforestregressor__bootstrap": [True, False],
    "randomforestregressor__min_samples_leaf": [1, 2, 4],
    "randomforestregressor__min_samples_split": [2, 5, 10]
}

rfr_search = RandomizedSearchCV(
    pipe_rfr, param_dist_rfr, n_iter=20, n_jobs=-1, return_train_score=True
)

rfr_search.fit(X_train.iloc[0:2000], y_train.iloc[0:2000])
rfr_search.best_score_

  warn(


0.38716632600811396

In [12]:
pipe_rfr_opt = make_pipeline(
    preprocessor,
    RandomForestRegressor(max_depth=rfr_search.best_params_["randomforestregressor__max_depth"],
                          bootstrap=rfr_search.best_params_["randomforestregressor__bootstrap"],
                          min_samples_leaf=rfr_search.best_params_["randomforestregressor__min_samples_leaf"],
                          min_samples_split=rfr_search.best_params_["randomforestregressor__min_samples_split"])
)

In [13]:
cross_val_results["random_forest"] = pd.DataFrame(cross_validate(pipe_rfr_opt, X_train.iloc[0:2000], y_train.iloc[0:2000], return_train_score=True, scoring=scoring_dict)).agg(['mean', 'std']).round(3).T
cross_val_results["random_forest"]

Unnamed: 0,mean,std
fit_time,16.186,0.302
score_time,0.082,0.002
test_r2,0.387,0.046
train_r2,0.769,0.004
test_MAPE,-0.077,0.004
train_MAPE,-0.045,0.001
test_neg_rmse,-0.645,0.021
train_neg_rmse,-0.398,0.003
test_neg_mse,-0.417,0.027
train_neg_mse,-0.158,0.003


In [14]:
cross_val_results["dummy_regressor"] = cross_val_results["dummy_regressor"].drop(columns="std").rename(columns={"mean": "Dummy_Regressor"})
cross_val_results["random_forest"] = cross_val_results["random_forest"].drop(columns="std").rename(columns={"mean": "Random_Forest"})
cross_val_results["ridge"] = cross_val_results["ridge"].drop(columns="std").rename(columns={"mean": "Ridge"})

In [15]:
cross_val_results_df = cross_val_results["dummy_regressor"].join(cross_val_results["ridge"], how="inner").join(cross_val_results["random_forest"], how="inner")
cross_val_results_df

Unnamed: 0,Dummy_Regressor,Ridge,Random_Forest
fit_time,0.002,0.87,16.186
score_time,0.001,0.213,0.082
test_r2,-0.001,0.266,0.387
train_r2,0.0,0.512,0.769
test_MAPE,-0.102,-0.086,-0.077
train_MAPE,-0.102,-0.07,-0.045
test_neg_rmse,-0.826,-0.708,-0.645
train_neg_rmse,-0.827,-0.577,-0.398
test_neg_mse,-0.683,-0.501,-0.417
train_neg_mse,-0.683,-0.333,-0.158


In [18]:
pipe_rfr_opt.fit(X_train, y_train)

In [19]:
pipe_rfr_opt.score(X_test, y_test)

0.44705448040292595

In [20]:
pipe_rfr_opt.predict(X_test)

array([6.9090758 , 7.30222252, 7.29238633, ..., 7.4977171 , 6.41909947,
       6.47112679])

In [75]:
# actual_scores_dict = {
#     "Score": y_test,
#     "Type": "Actual Score"
# }

# prediction_scores_dict = {
#     "Score": pipe_rfr_opt.predict(X_test),
#     "Type": "Predicted Score"
# }

# act_df = pd.DataFrame(actual_scores_dict)
# pred_df = pd.DataFrame(prediction_scores_dict)

# results_df = pd.concat([act_df, pred_df])
# results_df

results_dict = {
    "Actual Scores": y_test,
    "Predicted Scores": pipe_rfr_opt.predict(X_test)
}

results_df = pd.DataFrame(results_dict)
results_df

Unnamed: 0,Actual Scores,Predicted Scores
0,6.56,6.909076
1,7.77,7.302223
2,7.36,7.292386
3,5.88,6.279644
4,8.55,7.772736
...,...,...
6236,5.77,6.158014
6237,5.97,6.288487
6238,7.53,7.497717
6239,6.67,6.419099


In [96]:
prediction_results_points = alt.Chart(results_df).mark_circle(opacity = 1, size = 3, color = "#f75402").encode(
    x = alt.X("Actual Scores", scale=alt.Scale(domain=[2, 10])),
    y = alt.Y("Predicted Scores", scale=alt.Scale(domain=[2, 10]))
).properties(
    width = 1000,
    height = 1000
)

line_plot = alt.Chart(line_df).mark_line(color = "#1049ad").encode(
    x = "Actual Scores",
    y = "Predicted Scores"
)

x = prediction_results_points + line_plot + prediction_results_points.transform_loess('Actual Scores', 'Predicted Scores', bandwidth = 0.9).mark_line()
x

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html
