# 03 - Baseline Model

## Setup

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown

sns.set(style="darkgrid")
pd.set_option('display.max_columns', None)  

import sys, os, yaml

DATASET = "Tic-Tac-Toe"
COLAB = 'google.colab' in sys.modules

DEBUG = False
SEED = 666

In [17]:
COLAB = 'google.colab' in sys.modules

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)
else:
  ROOT = "./"

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Load Dataset

In [18]:
df = pd.read_pickle(f"{ROOT}/data/data.pkl")
print(df.shape)
df.head()

(958, 10)


Unnamed: 0,Top-left-square,Top-middle-square,Top-right-square,Middle-left-square,Middle-middle-square,Middle-right-square,Bottom-left-square,Bottom-middle-square,Bottom-right-square,Score
0,1,1,1,1,-1,-1,1,-1,-1,1
1,1,1,1,1,-1,-1,-1,1,-1,1
2,1,1,1,1,-1,-1,-1,-1,1,1
3,1,1,1,1,-1,-1,-1,0,0,1
4,1,1,1,1,-1,-1,0,-1,0,1


## Preprocessing Data

In [19]:
target = "Score"
print(f"target = {target}")

cat_features = [c for c in df.select_dtypes("category").columns if c!= target]
print(f"\nCategorical features ({len(cat_features)}): {cat_features}")


target = Score

Categorical features (9): ['Top-left-square', 'Top-middle-square', 'Top-right-square', 'Middle-left-square', 'Middle-middle-square', 'Middle-right-square', 'Bottom-left-square', 'Bottom-middle-square', 'Bottom-right-square']


In [20]:
X = df[cat_features]
y = df[target]

## Train/Test Split

In [21]:
y.value_counts(normalize=True)

Score
1     0.653445
-1    0.346555
Name: proportion, dtype: float64

In [22]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y, train_size=0.80, stratify=y, random_state=SEED)

print(X_train.shape,X_test.shape)
y.value_counts(normalize=True)

(766, 9) (192, 9)


Score
1     0.653445
-1    0.346555
Name: proportion, dtype: float64

## Eval Models

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import OneHotEncoder

#ohe = OneHotEncoder()
#ohe.fit(X_train)
#X_train = ohe.transform(X_train)
#X_test = ohe.transform(X_test)

classifiers = {
    "KNN" : KNeighborsClassifier(),
    "KNN(3)" : KNeighborsClassifier(3),
    "DT" : DecisionTreeClassifier(),
    "DT(max_depth=5)" : DecisionTreeClassifier(max_depth=5),
    "Perceptron" : Perceptron(),
}

In [24]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score

metrics = {
    'accuracy' : accuracy_score,
    'precision' : precision_score,
    'recall' : recall_score,
    'f1' : f1_score,
    'roc_auc' : roc_auc_score,
}

In [25]:
def generate_metrics():
    
    data = []
    
    for model_name, model in classifiers.items():
        
        print (f"{model_name} ...")
        
        row = {'Model': model_name}
        model.fit(X_train, y_train)
        
        for metric_name, metric in metrics.items():
            # Scoring on SEEN data - effectively "useless"
            y_pred = model.predict(X_train)
            row['train_'+metric_name] = metric(y_train, y_pred)
        
            # Scoring on UNSEEN data - important
            y_pred = model.predict(X_test)
            row['test_'+metric_name] = metric(y_test, y_pred)
            
        data.append(row)
    return pd.DataFrame(data)

df_results = generate_metrics()
print(df_results.shape)

KNN ...


KNN(3) ...
DT ...
DT(max_depth=5) ...
Perceptron ...
(5, 11)


In [26]:
def highlight_col(x):
    model_color = 'background-color: lightgreen'
    alt_color = ['background-color: lightblue','background-color: lightyellow']
    
    df1 = pd.DataFrame('', index=x.index, columns=x.columns)  
    df1.iloc[:, 0] = model_color
    for k in range (1,df.shape[1],2):
        df1.iloc[:,k:k+2] = alt_color[(k//2)%2] 
    return df1 
   
df_results.style.apply(highlight_col, axis=None)

Unnamed: 0,Model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,KNN,0.998695,0.994792,0.998008,0.992063,1.0,1.0,0.999003,0.996016,0.998113,0.992537
1,KNN(3),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,DT,1.0,0.96875,1.0,0.968504,1.0,0.984,1.0,0.97619,1.0,0.962149
3,DT(max_depth=5),0.916449,0.869792,0.937876,0.884615,0.934132,0.92,0.936,0.901961,0.908575,0.84806
4,Perceptron,0.983029,0.989583,0.974708,0.984252,1.0,1.0,0.987192,0.992063,0.975472,0.985075


## Save Best Model

### KNN(3)

In [27]:
import joblib

In [28]:
knn3_model = KNeighborsClassifier(3)
knn3_model.fit(X_train, y_train)

joblib.dump(knn3_model,f"{ROOT}/output/tic-tac-toe-model.joblib")


['.//output/tic-tac-toe-model.joblib']

In [30]:
joblib.load(f"{ROOT}/output/tic-tac-toe-model.joblib")