<a href="https://colab.research.google.com/github/Otobi1/Board-Game-Prediction-End-to-End/blob/master/04_Board_Game_Prediction_Production_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Import the necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, LabelBinarizer
from sklearn.metrics import confusion_matrix
import pickle


import warnings
warnings.filterwarnings('ignore')

In [11]:
# Mount the drive, because the data is already on the GDrive

from google.colab import drive # for more on writing and reading files through colab https://towardsdatascience.com/reading-and-writing-files-with-google-colaboratory-f0c234683946
drive.mount("/drive") 

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [13]:
# Loading the dataset from the GDrive

df = pd.read_csv("/drive/My Drive/Colab Notebooks/final_game_board_dataset.csv")

In [14]:
# Getting the list of feature names

df.keys()

Index(['name', 'Year Published', 'Minimum Players', 'Maximum Players',
       'Minimum Playtime', 'Maximum Playtime', 'Minimum Age',
       'Language Dependence', 'Number of Categories', 'Number of Mechanics',
       ...
       'Category Transportation', 'Category Travel', 'Category Trivia',
       'Category Video Game Theme', 'Category Vietnam War', 'Category Wargame',
       'Category Word Game', 'Category World War I', 'Category World War II',
       'Category Zombies'],
      dtype='object', length=200)

In [15]:
# Creating the list of least impact mechanics and categories based on the previous notebooks

mech_list = [
    'Mechanic Action Drafting', 'Mechanic Action Queue',
    'Mechanic Action Retrieval', 'Mechanic Action Timer',
    'Mechanic Action/Event', 'Mechanic Advantage Token',
    'Mechanic Area Movement', 'Mechanic Area-Impulse',
    'Mechanic Auction/Bidding', 'Mechanic Auction: Sealed Bid',
    'Mechanic Auction: Turn Order Until Pass',
    'Mechanic Automatic Resource Growth', 'Mechanic Bias', 'Mechanic Bribery',
    'Mechanic Card Drafting', 'Mechanic Catch the Leader', 'Mechanic Chaining',
    'Mechanic Chit-Pull System', 'Mechanic Command Cards',
    'Mechanic Commodity Speculation', 'Mechanic Communication Limits',
    'Mechanic Connections', 'Mechanic Critical Hits and Failures',
    'Mechanic Dice Rolling', 'Mechanic Different Dice Movement',
    'Mechanic Drafting', 'Mechanic Elapsed Real Time Ending',
    'Mechanic Enclosure', 'Mechanic Events', 'Mechanic Flicking',
    'Mechanic Follow', 'Mechanic Grid Coverage', 'Mechanic Grid Movement',
    'Mechanic Hand Management', 'Mechanic Hidden Movement',
    'Mechanic Hidden Roles', 'Mechanic Highest-Lowest Scoring',
    'Mechanic Increase Value of Unchosen Resources', 'Mechanic Investment',
    'Mechanic Legacy Game', 'Mechanic Line Drawing', 'Mechanic Mancala',
    'Mechanic Market', 'Mechanic Measurement Movement',
    'Mechanic Melding and Splaying', 'Mechanic Minimap Resolution',
    'Mechanic Modular Board', 'Mechanic Move Through Deck',
    'Mechanic Multiple Maps', 'Mechanic Negotiation', 'Mechanic Ownership',
    'Mechanic Pattern Building', 'Mechanic Physical Removal',
    'Mechanic Pick-up and Deliver', 'Mechanic Player Judge',
    'Mechanic Point to Point Movement', 'Mechanic Race', 'Mechanic Real-Time',
    'Mechanic Resource to Move', 'Mechanic Role Playing',
    'Mechanic Roles with Asymmetric Information', 'Mechanic Rondel',
    'Mechanic Score-and-Reset Game', 'Mechanic Secret Unit Deployment',
    'Mechanic Set Collection', 'Mechanic Simulation',
    'Mechanic Simultaneous Action Selection', 'Mechanic Singing',
    'Mechanic Solo / Solitaire Game', 'Mechanic Square Grid',
    'Mechanic Stacking and Balancing', 'Mechanic Stock Holding',
    'Mechanic Take That', 'Mechanic Team-Based Game',
    'Mechanic Tile Placement', 'Mechanic Time Track', 'Mechanic Trading',
    'Mechanic Traitor Game', 'Mechanic Trick-taking', 'Mechanic Voting',
    'Mechanic Worker Placement'
]

cat_list = [
    'Category Abstract Strategy', 'Category Adventure',
    'Category American Civil War', 'Category American Indian Wars',
    'Category American West', 'Category Ancient', 'Category Animals',
    'Category Arabian', 'Category Aviation / Flight', 'Category Bluffing',
    'Category Book', 'Category City Building', 'Category Civil War',
    'Category Collectible Components', 'Category Comic Book / Strip',
    'Category Dice', 'Category Educational', 'Category Electronic',
    'Category Environmental', 'Category Exploration', 'Category Fantasy',
    'Category Farming', 'Category Fighting', 'Category Game System',
    'Category Horror', 'Category Industry / Manufacturing', 'Category Mafia',
    'Category Maze', 'Category Memory', 'Category Modern Warfare',
    'Category Movies / TV / Radio theme', 'Category Music',
    'Category Mythology', 'Category Nautical', 'Category Negotiation',
    'Category Novel-based', 'Category Number', 'Category Pirates',
    'Category Political', 'Category Post-Napoleonic', 'Category Prehistoric',
    'Category Print & Play', 'Category Real-time', 'Category Religious',
    'Category Science Fiction', 'Category Space Exploration',
    'Category Spies/Secret Agents', 'Category Sports',
    'Category Transportation', 'Category Travel', 'Category Trivia',
    'Category Video Game Theme', 'Category World War I',
    'Category World War II', 'Category Zombies'
]

In [16]:
# Summing the less impactful mechanics into a single column

df['Mechanic Other'] = df[mech_list].sum(axis = 1)

In [17]:
df['Mechanic Other'].value_counts()

1    10365
0     8671
Name: Mechanic Other, dtype: int64

In [18]:
# Summing the least impactful categories into a single column

df['Category Other'] = df[cat_list].sum(axis = 1)

In [19]:
df['Category Other'].value_counts()

0    9646
1    9390
Name: Category Other, dtype: int64

In [20]:
# dropping the less impactful mechanics and categories

df.drop(columns = mech_list, inplace = True) 
df.drop(columns = cat_list, inplace = True)
df.shape

(19036, 66)

In [21]:
# converting the avgweight feature to categorical data for better modeling
# using the categories found here: https://boardgamegeek.com/wiki/page/Weight

conversion_dict = {1: 'light',
                   2: 'medium light',
                   3: 'medium',
                   4: 'medium heavy',
                   5: 'heavy'}

In [22]:
df['Weight'] = np.round(df['Weight']) # rounding avgweight
df['Weight'] = df['Weight'].map(conversion_dict) # mapping the conversion dictionary to avgweight

In [23]:
df.drop(columns=[
    'Year Published', 'boardgamecategory', 'boardgamemechanic', 'Language Dependence',
    'description', 'Number of Categories', 'Number of Mechanics'],
        inplace=True)

In [24]:
df = df[[
    'name', 'Minimum Players', 'Maximum Players', 'Minimum Playtime',
    'Maximum Playtime', 'Minimum Age', 'Weight', 'Mechanic Acting',
    'Mechanic Action Points', 'Mechanic Alliances',
    'Mechanic Area Majority / Influence', 'Mechanic Betting and Bluffing',
    'Mechanic Campaign / Battle Card Driven', 'Mechanic Contracts',
    'Mechanic Cooperative Game', 'Mechanic Crayon Rail System',
    'Mechanic End Game Bonuses', 'Mechanic Hexagon Grid', 'Mechanic Memory',
    'Mechanic Network and Route Building', 'Mechanic Paper-and-Pencil',
    'Mechanic Pattern Recognition', 'Mechanic Player Elimination',
    'Mechanic Push Your Luck', 'Mechanic Rock-Paper-Scissors',
    'Mechanic Roll / Spin and Move', 'Mechanic Storytelling',
    'Mechanic Variable Phase Order', 'Mechanic Variable Player Powers',
    'Mechanic Other', 'Category Action / Dexterity', 'Category Age of Reason',
    'Category American Revolutionary War', 'Category Card Game',
    'Category Civilization', 'Category Deduction', 'Category Economic',
    'Category Expansion for Base-game', 'Category Humor',
    'Category Korean War', 'Category Math', 'Category Mature / Adult',
    'Category Medical', 'Category Medieval', 'Category Miniatures',
    'Category Murder/Mystery', 'Category Napoleonic', 'Category Party Game',
    'Category Pike and Shot', 'Category Puzzle', 'Category Racing',
    'Category Renaissance', 'Category Territory Building', 'Category Trains',
    'Category Vietnam War', 'Category Wargame', 'Category Word Game',
    'Category Other'
]]

In [25]:
# Splitting the dataset

X = df._get_numeric_data()
y = df['Weight']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 1919)

In [26]:
X.keys() ###### confirm if this list tallies with the index.html

Index(['Minimum Players', 'Maximum Players', 'Minimum Playtime',
       'Maximum Playtime', 'Minimum Age', 'Mechanic Acting',
       'Mechanic Action Points', 'Mechanic Alliances',
       'Mechanic Area Majority / Influence', 'Mechanic Betting and Bluffing',
       'Mechanic Campaign / Battle Card Driven', 'Mechanic Contracts',
       'Mechanic Cooperative Game', 'Mechanic Crayon Rail System',
       'Mechanic End Game Bonuses', 'Mechanic Hexagon Grid', 'Mechanic Memory',
       'Mechanic Network and Route Building', 'Mechanic Paper-and-Pencil',
       'Mechanic Pattern Recognition', 'Mechanic Player Elimination',
       'Mechanic Push Your Luck', 'Mechanic Rock-Paper-Scissors',
       'Mechanic Roll / Spin and Move', 'Mechanic Storytelling',
       'Mechanic Variable Phase Order', 'Mechanic Variable Player Powers',
       'Mechanic Other', 'Category Action / Dexterity',
       'Category Age of Reason', 'Category American Revolutionary War',
       'Category Card Game', 'Category Civil

In [27]:
y.value_counts(normalize = True)

medium light    0.441007
light           0.299170
medium          0.204245
medium heavy    0.053215
heavy           0.002364
Name: Weight, dtype: float64

In [28]:
# Logistic Regression Model 

pipe = Pipeline(steps = [("lr", LogisticRegression())]) # Running a pipeline of logistic regression 

params = {"lr__penalty": ["l1"], 
          "lr__C": [1], 
          "lr__solver": ["liblinear"]} # Setting parameters

gs_lr = GridSearchCV(pipe, param_grid = params, cv = 5, scoring = "accuracy", n_jobs = -2)

gs_lr.fit(X_train, y_train)
gs_lr.best_estimator_

Pipeline(memory=None,
         steps=[('lr',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l1', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [29]:
# scoring the model 

gs_lr.score(X_train, y_train) 

0.5862576171464593

In [30]:
# cross validation scoring 

cross_val_score(gs_lr.best_estimator_, X, y, cv = 5).mean() 

0.57690669057221

In [31]:
# Scoring the test data

gs_lr.score(X_test, y_test)

# Inference: this model already performs better than the base model but there is room for improvement

0.5828955662954403

In [32]:
# KNN Model

# - Setting the KNN Model with a StandardScaler

pipe = Pipeline(steps = [("sc", StandardScaler()), ("knn", KNeighborsClassifier())])

# Setting Model Params 

params = {"knn__n_neighbors": [21], "knn__p": [1]}

gs_knn = GridSearchCV(pipe, param_grid = params, cv = 5, scoring = "accuracy")

gs_knn.fit(X_train, y_train)
gs_knn.best_estimator_

Pipeline(memory=None,
         steps=[('sc',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=21, p=1,
                                      weights='uniform'))],
         verbose=False)

In [33]:
# Scoring the model

gs_knn.score(X_train, y_train)

0.6497863696855082

In [34]:
# Cross validation scoring 

cross_val_score(gs_knn.best_estimator_, X, y, cv = 5).mean() 

0.6087393743943599

In [35]:
# Scoring the test data split

gs_knn.score(X_test, y_test)

#  Inference: this model performs almost as well as the Logistic Regression

0.6104223576381593

In [36]:
# Decision Tree Model 

# - Setting the decision tree model

pipe = Pipeline(steps = [("tree", DecisionTreeClassifier())])

# - Setting the model parameters 

params = {"tree__max_depth": [6, 8]}

gs_tree = GridSearchCV(pipe, param_grid = params, cv = 5, scoring = "accuracy")

gs_tree.fit(X_train, y_train)
gs_tree.best_estimator_

Pipeline(memory=None,
         steps=[('tree',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=8,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=None,
                                        splitter='best'))],
         verbose=False)

In [37]:
# Scoring the model 

gs_tree.score(X_train, y_train)

0.6662464103102893

In [38]:
# Cross validation score 

cross_val_score(gs_tree.best_estimator_, X, y, cv = 5).mean()

0.6215034004145393

In [39]:
# Scoring the test data split

gs_tree.score(X_test, y_test)

# Inference: The decision tree performs a bit better than the logistic regression model

0.6299642782097079

In [40]:
# Bagging Classifier Model 

# - Setting up a Bagging Classifier Model 

pipe = Pipeline(steps = [("bag", BaggingClassifier())])

# - Setting up model parameters 

params = {"bag__n_estimators": [200]}

gs_bag = GridSearchCV(pipe, param_grid = params, cv = 5, scoring = "accuracy")

gs_bag.fit(X_train, y_train)
gs_bag.best_estimator_

Pipeline(memory=None,
         steps=[('bag',
                 BaggingClassifier(base_estimator=None, bootstrap=True,
                                   bootstrap_features=False, max_features=1.0,
                                   max_samples=1.0, n_estimators=200,
                                   n_jobs=None, oob_score=False,
                                   random_state=None, verbose=0,
                                   warm_start=False))],
         verbose=False)

In [41]:
# Scoring the model 

gs_bag.score(X_train, y_train)

# Inference: looks overfit 

0.8724521958394621

In [42]:
# Cross validation score 

cross_val_score(gs_bag.best_estimator_, X, y, cv = 5).mean()

0.5986524988245889

In [43]:
# Scoring the test data

gs_bag.score(X_test, y_test) 

# Inference: although with the best performance on the test data so far, the train set seems overfit 

0.6062197940743854

In [44]:
# Random Forest Model 

# - Setting up the model

pipe = Pipeline(steps = [("forest", RandomForestClassifier())])

# Setting the model parameters 

params = {"forest__n_estimators": [150], "forest__max_depth": [15]}

gs_forest = GridSearchCV(pipe, param_grid = params, cv = 5, scoring = "accuracy")

gs_forest.fit(X_train, y_train)

gs_forest.best_estimator_

Pipeline(memory=None,
         steps=[('forest',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=15, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=150, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [45]:
# Scoring the model 

gs_forest.score(X_train, y_train)

0.7510681515724592

In [46]:
# Cross validation scoring 
cross_val_score(gs_forest.best_estimator_, X, y, cv = 5).mean()

0.6458787080632095

In [47]:
# Scoring the test data split

gs_forest.score(X_test, y_test)

# Inference: Random Forest as good as the Bagging Classifier, lets continue up the bias-variance trade off

0.6438327379701618

In [48]:
# Extra Trees Model 

# - Setting up the model

pipe = Pipeline(steps = [("extra", ExtraTreesClassifier())])

# Setting up the model parameters 

params = {"extra__n_estimators": [600], "extra__max_depth": [None]}

gs_extra = GridSearchCV(pipe, param_grid = params, cv = 5, scoring = "accuracy")

gs_extra.fit(X_train, y_train)
gs_extra.best_estimator_

Pipeline(memory=None,
         steps=[('extra',
                 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                      class_weight=None, criterion='gini',
                                      max_depth=None, max_features='auto',
                                      max_leaf_nodes=None, max_samples=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=600, n_jobs=None,
                                      oob_score=False, random_state=None,
                                      verbose=0, warm_start=False))],
         verbose=False)

In [49]:
# Scoring the model 

gs_extra.score(X_train, y_train)

0.8724521958394621

In [50]:
# Cross validation score 

cross_val_score(gs_extra.best_estimator_, X, y, cv = 5).mean()

0.5924539575483464

In [51]:
# Scoring the test data split

gs_extra.score(X_test, y_test)

# Inference: About as strong as the Bagging Classifier, lets try the boosting models next

0.6047488968270646

In [52]:
# AdaBoost Model 

# - Setting up the model 

pipe = Pipeline(steps = [("ada", AdaBoostClassifier())])

# - Setting the model parameters 

params = {"ada__n_estimators": [10]}

gs_ada = GridSearchCV(pipe, param_grid = params, cv = 5, scoring = "accuracy")

gs_ada.fit(X_train, y_train)
gs_ada.best_estimator_

Pipeline(memory=None,
         steps=[('ada',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=10,
                                    random_state=None))],
         verbose=False)

In [53]:
# Scoring the model 

gs_ada.score(X_train, y_train)

0.5640540729845206

In [54]:
# Cross validation scoring 

cross_val_score(gs_ada.best_estimator_, X, y, cv = 5).mean()

0.5344075928243638

In [55]:
# Scoring the test data split 

gs_ada.score(X_test, y_test)

# Inference: did not perform as well, even lower than KNN

0.5578903130909855

In [56]:
# Gradient Boost Model 

# - Setting up the model 

pipe = Pipeline(steps = [("grad", GradientBoostingClassifier())])

# - Setting up the model parameters 

params = {"grad__n_estimators": [300], "grad__max_depth": [3]}

gs_grad = GridSearchCV(pipe, param_grid = params, cv = 5, scoring = "accuracy")

gs_grad.fit(X_train, y_train)
gs_grad.best_estimator_

Pipeline(memory=None,
         steps=[('grad',
                 GradientBoostingClassifier(ccp_alpha=0.0,
                                            criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=300,
                                            n_iter_no_change=None,
                                            presort='deprecated',
                                            random_sta

In [57]:
# Scoring the model

gs_grad.score(X_train, y_train)

0.7007774742592981

In [58]:
# Cross validation scoring 

cross_val_score(gs_grad.best_estimator_, X, y, cv = 5).mean()

0.6444597578984311

In [59]:
# Scoring the test data split

gs_grad.score(X_test, y_test)

# Inference: Best model so far, but lets try SVM

0.6486656860685018

In [60]:
# SVC Model 

# - Setting up the model 

pipe = Pipeline(steps = [("svc", SVC())])

# - Setting the model parameters 

params = {"svc__C": [3]}

gs_svc = GridSearchCV(pipe, param_grid = params, cv = 5, scoring = "accuracy")

gs_svc.fit(X_train, y_train)

gs_svc.best_estimator_

Pipeline(memory=None,
         steps=[('svc',
                 SVC(C=3, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [61]:
# Scoring the model 

gs_svc.score(X_train, y_train)

0.5585907403516145

In [62]:
# Cross validation scoring 

cross_val_score(gs_svc.best_estimator_, X, y, cv = 5).mean()

0.5688668099233389

In [63]:
# Scoring the test data split 

gs_svc.score(X_test, y_test)

# Inference: the added dimensionality did not help much, this is the poorest performing model yet.

0.5543181340617777

In [64]:
# Voting Classifier Model

# - Setting up the model 

knn_pipe = Pipeline([("ss", StandardScaler()), ("knn", KNeighborsClassifier())])

# = Setting up the Voting Classifier Model  with multiple models inside

vote = VotingClassifier([("rand", RandomForestClassifier()), 
                         ("grad", GradientBoostingClassifier()), 
                         ("lr", LogisticRegression()), 
                         ("tree", DecisionTreeClassifier()), 
                         ("bag", BaggingClassifier()), 
                         ("ada", AdaBoostClassifier()), 
                         ("extra", ExtraTreesClassifier()), 
                         ("knn_pipe", knn_pipe)], voting = "soft")

# Setting up the model parameters for each model in the voting classifier 

vote_params = {"rand__n_estimators": [150], 
               "rand__max_depth": [15], 
               "grad__n_estimators": [300], 
               "tree__max_depth": [8], 
               "bag__n_estimators": [200], 
               "ada__n_estimators": [10], 
               "extra__n_estimators": [600], 
               "knn_pipe__knn__n_neighbors": [21],
               "lr__penalty": ["l1"], 
               "lr__C": [1], 
               "lr__solver": ["liblinear"]}

gs_vc = GridSearchCV(vote, param_grid = vote_params, cv = 5, scoring = "accuracy")

gs_vc.fit(X_train, y_train)

gs_vc.best_estimator_

VotingClassifier(estimators=[('rand',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=15,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
        

In [65]:
# Scoring the model 

gs_vc.score(X_train, y_train)

0.8120753659732437

In [66]:
# Cross validation scoring 

cross_val_score(gs_vc.best_estimator_, X, y, cv = 5).mean()

0.6426741677758574

In [67]:
# Scoring the test data split

gs_vc.score(X_test, y_test)


# Model Selection
# inference: the voting classifier performed best of all the models tested but they are hard to evaluate because they are ensemble models. 
# So while we will use the voting classifier to predict the complexity of the game, we'll evaluate the Logistic Regression model as it is only a few points of the voting classifier

0.6469846606429922

In [69]:
# Pickling 

with open("prod_model.pkl", "wb") as f_out:
  pickle.dump(gs_grad, f_out)
  f_out.close()