## Imports

In [191]:
import pandas as pd
import numpy as np
from ast import literal_eval

from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC

from sklearn.base import TransformerMixin


In [175]:
# Solution adapted from StackOverFlow solution: https://tinyurl.com/59hmeesh
class MyMultiLabelBinarizer(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = MultiLabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        return self.encoder.transform(x)

## Reading and Splitting Model Data

In [176]:
train_df = pd.read_csv("../data/processed/training_split.csv")
test_df = pd.read_csv("../data/processed/testing_split.csv")

categorical_list_features = ["boardgamecategory", "boardgamemechanic", "boardgamefamily", "boardgamedesigner", "boardgameartist", "boardgamepublisher"]
# for feat in categorical_list_features:
#     train_df[feat] = train_df[feat].apply(literal_eval)
#     test_df[feat] = test_df[feat].apply(literal_eval)
#     train_df[feat] = train_df[feat].apply(lambda x: set(x))
#     test_df[feat] = test_df[feat].apply(lambda x: set(x))

X_train, y_train = train_df.drop(columns="average"), train_df["average"]
X_test, y_test = test_df.drop(columns="average"), test_df["average"]

## Setting Up the Column Transformers

In [187]:
numerical_features = ["yearpublished", "minplayers", "maxplayers", "playingtime", "minplaytime", "maxplaytime", "minage"]
text_feature = "description"
categorical_features = ["boardgamecategory", "boardgamemechanic", "boardgamefamily", "boardgamedesigner", "boardgameartist", "boardgamepublisher"]

preprocessor = make_column_transformer(
    (StandardScaler(), numerical_features),
    (CountVectorizer(stop_words="english", max_features=1000), text_feature),
    (OneHotEncoder(handle_unknown="ignore", sparse="False"), categorical_features)
)

## Testing Ridge Model

In [188]:
pipe_ridge = make_pipeline(
    preprocessor,
    Ridge()
)

param_dist_ridge = {
    "ridge__alpha": 10.0 ** np.arange(-6, 6, 1)
}

ridge_alpha_search = RandomizedSearchCV(
    pipe_ridge, param_dist_ridge, n_iter=12, n_jobs=-1, return_train_score=True
)

ridge_alpha_search.fit(X_train, y_train)

In [189]:
ridge_alpha_search.best_score_

0.26555599794366336

## Testing SVC Model

In [193]:
# pipe_svc = make_pipeline(
#     preprocessor,
#     SVC()
# )

# param_dist_svc = {
#     "svc__gamma": np.round(np.logspace(1, 2, 6), 1)
# }

# svc_search = RandomizedSearchCV(
#     pipe_svc, param_dist_ridge, n_iter=10, n_jobs=-1, return_train_score=True
# )

# svc_search.fit(X_train.iloc[0:1000], y_train.iloc[0:1000])

ValueError: Invalid parameter 'ridge' for estimator Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['yearpublished',
                                                   'minplayers', 'maxplayers',
                                                   'playingtime', 'minplaytime',
                                                   'maxplaytime', 'minage']),
                                                 ('countvectorizer',
                                                  CountVectorizer(max_features=1000,
                                                                  stop_words='english'),
                                                  'description'),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse='False'),
                                                  ['boardgamecategory',
                                                   'boardgamemechanic',
                                                   'boardgamefamily',
                                                   'boardgamedesigner',
                                                   'boardgameartist',
                                                   'boardgamepublisher'])])),
                ('svc', SVC())]). Valid parameters are: ['memory', 'steps', 'verbose'].

## Testing Random Forest Regressor Model

In [202]:
pipe_rfr = make_pipeline(
    preprocessor,
    RandomForestRegressor()
)

param_dist_rfr = {
    "randomforestregressor__max_depth": np.arange(20, 100, 2),
    "randomforestregressor__max_features": ['auto', 'sqrt'],
    "randomforestregressor__bootstrap": [True, False],
    "randomforestregressor__min_samples_leaf": [1, 2, 4],
    "randomforestregressor__min_samples_split": [2, 5, 10]
}

rfr_search = RandomizedSearchCV(
    pipe_rfr, param_dist_rfr, n_iter=25, n_jobs=-1, return_train_score=True
)

rfr_search.fit(X_train.iloc[0:4000], y_train.iloc[0:4000])

  warn(


In [203]:
rfr_search.best_score_

0.41924670352192683

In [204]:
rfr_search.score(X_test.iloc[0:4000], y_test.iloc[0:4000])

0.44005020764942804

In [162]:
# cross_val_results = {}
# dummy_regressor = DummyRegressor()
# cross_val_results['dummy_regressor'] = pd.DataFrame(cross_validate(dummy_regressor, X_train, y_train, return_train_score = True)).agg(['mean', 'std']).round(3).T
# cross_val_results['dummy_regressor']

Unnamed: 0,mean,std
fit_time,0.002,0.001
score_time,0.0,0.0
test_score,-0.001,0.001
train_score,0.0,0.0
