In [1]:
%matplotlib inline

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from functools import reduce
from pathlib import Path
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv(Path("training_data_ml") / '2018_10_18_trump.csv', header=None)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,0,0,0,1,1,0,1,1,0,0,...,1,0,0,0,1,0,0,0,53248,6
1,0,0,0,0,0,0,0,0,1,1,...,0,1,0,0,0,1,0,0,4613,5
2,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,68780,6
3,0,0,0,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,24555,5
4,0,1,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,0,1,8392,4


In [3]:
cards = [
# Diamonds
'DA','DK','DQ','DJ','D10','D9','D8','D7','D6',
# Hearts
'HA','HK','HQ','HJ','H10','H9','H8','H7','H6',
# Spades
'SA','SK','SQ','SJ','S10','S9','S8','S7','S6',
# Clubs
'CA','CK','CQ','CJ','C10','C9','C8','C7','C6'
]

# Forehand (yes = 1, no = 0)
forehand = ['FH']

user  = ['user']
trump = ['trump']

feature_columns = cards + forehand

data.columns = feature_columns + user + trump
data.drop('user', axis='columns', inplace=True)

data.head()

Unnamed: 0,DA,DK,DQ,DJ,D10,D9,D8,D7,D6,HA,...,CK,CQ,CJ,C10,C9,C8,C7,C6,FH,trump
0,0,0,0,1,1,0,1,1,0,0,...,0,1,0,0,0,1,0,0,0,6
1,0,0,0,0,0,0,0,0,1,1,...,0,0,1,0,0,0,1,0,0,5
2,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,0,6
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,1,0,0,0,0,5
4,0,1,0,0,0,0,0,0,1,1,...,0,0,1,0,0,0,0,0,1,4


In [4]:
data[cards + forehand] = data[cards + forehand].astype(bool)
data.trump = data.trump.astype('category')
data.trump.cat.rename_categories({0: 'DIAMONDS', 1: 'HEARTS', 2: 'SPADES', 3:'CLUBS',
                                  4: 'OBE_ABE', 5: 'UNE_UFE', 6: 'PUSH', 10: 'PUSH'}, inplace=True)
data.head()

Unnamed: 0,DA,DK,DQ,DJ,D10,D9,D8,D7,D6,HA,...,CK,CQ,CJ,C10,C9,C8,C7,C6,FH,trump
0,False,False,False,True,True,False,True,True,False,False,...,False,True,False,False,False,True,False,False,False,PUSH
1,False,False,False,False,False,False,False,False,True,True,...,False,False,True,False,False,False,True,False,False,UNE_UFE
2,True,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,True,True,False,PUSH
3,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,True,False,False,False,False,UNE_UFE
4,False,True,False,False,False,False,False,False,True,True,...,False,False,True,False,False,False,False,False,True,OBE_ABE


In [5]:
X_train, X_test, y_train, y_test = train_test_split(data[feature_columns], data.trump, test_size=0.2, stratify=data.trump, random_state=42)

In [6]:
def add_interaction(data, feature_cols, interaction):
    for color in "DHSC":
        new_col = f"{color}_{interaction}"
        data[new_col] = reduce(lambda a, b: a & b, [data[f"{color}{feature}"] for feature in interaction])
        feature_cols.append(new_col)

X_train_interactions = X_train.copy()
X_test_interactions = X_test.copy()
feature_columns_interactions = list(feature_columns)

for dataframe in [X_train_interactions, X_test_interactions]:
    add_interaction(dataframe, feature_columns_interactions, "J9")
    add_interaction(dataframe, feature_columns_interactions, "AKQ")

print(X_train.head())
print(X_train_interactions.head())

           DA     DK     DQ     DJ    D10     D9     D8     D7     D6     HA  \
207217  False  False  False  False   True  False  False  False   True   True   
18232   False  False  False  False  False  False   True   True  False  False   
226960  False   True  False  False  False  False  False  False  False  False   
128112  False   True  False  False  False  False  False  False  False  False   
126859   True  False  False   True  False  False   True  False  False  False   

        ...     CA     CK     CQ     CJ    C10     C9     C8     C7     C6  \
207217  ...  False  False  False  False  False  False  False  False   True   
18232   ...  False   True  False   True  False  False  False   True  False   
226960  ...   True  False  False  False  False  False  False  False  False   
128112  ...  False   True   True  False  False  False   True  False  False   
126859  ...  False  False   True  False  False  False  False  False  False   

           FH  
207217  False  
18232   False  
22

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

classifier_logisticRegression = LogisticRegression()
classifier_sgd = SGDClassifier()
classifier_randomForest = RandomForestClassifier()
# classifier_svc = SVC()
classifier_kNeighbors = KNeighborsClassifier()
classifier_gradientBoosting = GradientBoostingClassifier()

linear_classifiers = [classifier_logisticRegression, classifier_sgd]
nonlinear_classifiers = [
    classifier_randomForest,
    # classifier_svc,
    classifier_kNeighbors,
    classifier_gradientBoosting
]

names = [
    "Logistic Regression",
    "Stochastic Gradient Descent",
    "Random Forest",
    # "Support Vector Classification",
    "K Neighbors",
    "Gradient Boosting"
]
classifiers = linear_classifiers + nonlinear_classifiers
parameters = [
    { # Logistic Regression
        # "penalty": ["l1", "l2", "elasticnet", "none"],
        "C": [i / 10 for i in range(5, 21)],
        # "fit_intercept": [True, False],
        # "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
    },
    { # SGD
        # "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron", "squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"],
        # "penalty": ["l1", "l2", "elasticnet"],
        "alpha": [i / 100000 for i in range(5, 21)],
        # "fit_intercept": [True, False],
        # "learning_rate": ["constant", "optimal", "invscaling", "adaptive"]
    },
    { # Random Forest
        "n_estimators": [i * 10 for i in range(8, 13)],
        # "criterion": ["gini", "entropy"],
        # "max_depth": [i for i in range(2, 6)] + [None],
        # "min_samples_split": [i for i in range(1, 4)],
        # "min_samples_leaf": [i for i in range(1, 3)],
        # "max_features": ["sqrt", "log2", None]
    },
    # { # SVC
    # },
    { # K Means
        "n_neighbors": [i for i in range(2, 11)],
        # "weights": ["uniform", "distance"]
    },
    { # Gradient Boosting
        # "loss": ["deviance", "exponential"],
        # "learning_rate": [i / 10 for i in range(1, 6)],
        "n_estimators": [i * 100 for i in range(1, 4)],
        # "min_samples_split": [i for i in range(1, 4)],
        # "min_samples_leaf": [i for i in range(1, 3)],
        # "max_depth": [i for i in range(2, 6)],
        # "max_features": ["sqrt", "log2", None]
    }
]

results = []
for name, classifier, params in zip(names, classifiers, parameters):
    print(f"Grid search for {name}")
    gs = GridSearchCV(classifier, param_grid=params, cv=5, scoring="accuracy", n_jobs=-1)
    gs.fit(X_train_interactions if classifier in linear_classifiers else X_train, y_train)
    print(f"Best accuracy score found: {gs.best_score_:.3f}\n".format(gs.best_score_))
    results.append([name, gs.best_score_, gs.best_estimator_])

results

Grid search for Logistic Regression
Best accuracy score found: 0.643

Grid search for Stochastic Gradient Descent
Best accuracy score found: 0.624

Grid search for Random Forest
Best accuracy score found: 0.640

Grid search for K Neighbors
Best accuracy score found: 0.580

Grid search for Gradient Boosting
Best accuracy score found: 0.659



[['Logistic Regression',
  0.643159869380949,
  LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)],
 ['Stochastic Gradient Descent',
  0.6235461682762454,
  SGDClassifier(alpha=5e-05, average=False, class_weight=None,
                early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
                l1_ratio=0.15, learning_rate='optimal', loss='hinge',
                max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
                power_t=0.5, random_state=None, shuffle=True, tol=0.001,
                validation_fraction=0.1, verbose=0, warm_start=False)],
 ['Random Forest',
  0.6400541930104913,
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
        

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

names = [
    "Logistic Regression",
    "Stochastic Gradient Descent",
    "Random Forest",
    # "Support Vector Classification",
    "K Neighbors",
    "Gradient Boosting"
]

# Based on GridSearch results
classifier_logisticRegression = LogisticRegression(C=0.5)
classifier_sgd = SGDClassifier(alpha=5e-05)
classifier_randomForest = RandomForestClassifier(n_estimators=120)
# classifier_svc = SVC()
classifier_kNeighbors = KNeighborsClassifier(n_neighbors=10)
classifier_gradientBoosting = GradientBoostingClassifier(n_estimators=200)

linear_classifiers = [classifier_logisticRegression, classifier_sgd]
nonlinear_classifiers = [
    classifier_randomForest,
    # classifier_svc,
    classifier_kNeighbors,
    classifier_gradientBoosting
]
classifiers = linear_classifiers + nonlinear_classifiers

for name, classifier in zip(names, classifiers):
    print(f"Getting score for {name}:")
    classifier.fit(X_train_interactions if classifier in linear_classifiers else X_train, y_train)
    print(classifier.score(X_test_interactions if classifier in linear_classifiers else X_test, y_test))

Getting score for Logistic Regression:
0.6402278885569374
Getting score for Stochastic Gradient Descent:
0.6250121586882512
Getting score for Random Forest:
0.6380045855624262
Getting score for K Neighbors:
0.5820746196067533
Getting score for Gradient Boosting:
0.6563329396234281


In [9]:
import pickle

# Gradient Boosting chosen, because it has the highest score on unseen data.
with open(Path("trained_model") / "gradient_boosting.pkl", "wb") as file:
    pickle.dump(classifier_gradientBoosting, file)

In [10]:
# check if save & load works
with open(Path("trained_model") / "gradient_boosting.pkl", "rb") as file:
    model = pickle.load(file)

model.score(X_test, y_test) # this must return the same score as above

0.6563329396234281

In [12]:
# Check if model works well and how it responds

#           Diamonds     Rest           Forehand  D J&9    Rest J&9      D A&K&Q  Rest AKQ
pred_data = [True] * 9 + [False] * 27 + [False] #+ [True] + [False] * 3 + [True] + [False] * 3
print(model.predict([pred_data])) # Should choose DIAMONDS

#           Rest           Clubs        Forehand  Rest J&9      C J&9    Rest A&K&Q    C AKQ
pred_data = [False] * 27 + [True] * 9 + [False] #+ [False] * 3 + [True] + [False] * 3 + [True]
print(model.predict([pred_data])) # Should choose CLUBS

#           D 9-A         D 6-8        H 8-A         H 6&7        S 8-A         S 6&7        C 8-A         C 6&7        Forehand  J&9           AKQ
pred_data = [False] * 6 + [True] * 3 + [False] * 7 + [True] * 2 + [False] * 7 + [True] * 2 + [False] * 7 + [True] * 2 + [False] #+ [False] * 4 + [False] * 4
print(model.predict([pred_data])) # Should choose UNE_UFE

['DIAMONDS']
['CLUBS']
['UNE_UFE']
