# 03 - Baseline Model

## Setup

In [408]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown

# Plot styling and pandas display options applied to the whole notebook.
sns.set(style="darkgrid")
pd.set_option('display.max_columns', None)  # None lifts the cap on displayed columns

import sys, os, yaml

# Dataset name; the Colab branch below derives its storage folder from it.
DATASET = "Tic-Tac-Toe"
# True when the google.colab module is already loaded in this interpreter.
COLAB = 'google.colab' in sys.modules

# Debug switch (not read by any of the cells shown in this notebook).
DEBUG = False
# Seed handed to train_test_split as random_state.
SEED = 666

In [409]:
# Decide where the project files live: Google Drive on Colab, the working directory otherwise.
COLAB = 'google.colab' in sys.modules

if COLAB:
  from google.colab import drive
  # Mount Drive only once per session.
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  # ROOT keeps its trailing "/" because makedirs() below concatenates ROOT+d.
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  # Creates the parent "datasets" folder as well when it is missing; the previous
  # separate step for that folder read ROOT before it was assigned (NameError).
  os.makedirs(ROOT, exist_ok=True)
else:
  ROOT = "./"

def makedirs(d):
  """Create the folder ROOT+d unless a directory with that path already exists."""
  target = ROOT + d
  if os.path.isdir(target):
    return
  if COLAB:
    os.makedirs(target)
  else:
    os.makedirs(target, mode=0o777, exist_ok=True)

# Working folders the notebook expects underneath ROOT.
for d in ('orig', 'data', 'output'):
  makedirs(d)

## Load Dataset

In [410]:
# NOTE(review): unpickling can execute arbitrary code; data.pkl is presumably written by an
# earlier notebook of this project - only load pickles from a trusted source.
# ROOT already ends with a separator, so os.path.join builds the path without doubling the "/".
df = pd.read_pickle(os.path.join(ROOT, "data", "data.pkl"))
print(df.shape)
df.head()

(958, 10)


Unnamed: 0,Top-left-square,Top-middle-square,Top-right-square,Middle-left-square,Middle-middle-square,Middle-right-square,Bottom-left-square,Bottom-middle-square,Bottom-right-square,Score
0,1,1,1,1,-1,-1,1,-1,-1,1
1,1,1,1,1,-1,-1,-1,1,-1,1
2,1,1,1,1,-1,-1,-1,-1,1,1
3,1,1,1,1,-1,-1,-1,0,0,1
4,1,1,1,1,-1,-1,0,-1,0,1


## Preprocessing Data

In [411]:
target = "Score"
print(f"target = {target}")

# Names of all columns stored with the pandas "category" dtype, leaving out the label column.
cat_features = df.select_dtypes("category").columns.drop(target, errors="ignore").tolist()
print(f"\nCategorical features ({len(cat_features)}): {cat_features}")


target = Score

Categorical features (9): ['Top-left-square', 'Top-middle-square', 'Top-right-square', 'Middle-left-square', 'Middle-middle-square', 'Middle-right-square', 'Bottom-left-square', 'Bottom-middle-square', 'Bottom-right-square']


In [412]:
# Feature matrix (the categorical columns) and label vector.
X, y = df[cat_features], df[target]

## Train/Test Split

In [413]:
# Relative frequency of each label value in the full dataset.
y.value_counts(normalize=True)

Score
1     0.653445
-1    0.346555
Name: proportion, dtype: float64

In [414]:
from sklearn.model_selection import train_test_split, GridSearchCV

# 80/20 hold-out split, seeded so it can be repeated (no stratification is requested).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=SEED,
)

print(X_train.shape, X_test.shape)
# Label proportions of the full label vector (not of the individual splits).
y.value_counts(normalize=True)

(766, 9) (192, 9)


Score
1     0.653445
-1    0.346555
Name: proportion, dtype: float64

## Eval Models

In [415]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import OneHotEncoder

#ohe = OneHotEncoder()
#ohe.fit(X_train)
#X_train = ohe.transform(X_train)
#X_test = ohe.transform(X_test)

# Baseline candidates, keyed by the label shown in the results table.
classifiers = {
    "KNN" : KNeighborsClassifier(n_jobs=-1),
    "KNN(3)" : KNeighborsClassifier(3,n_jobs=-1),
    # Tree fitting draws random numbers (feature order, ties between equally good splits),
    # so random_state is pinned to make reruns of the notebook give the same trees.
    "DT" : DecisionTreeClassifier(max_features=9, random_state=SEED),
    "DT(max_depth=5)" : DecisionTreeClassifier(max_depth=5,max_features=9, random_state=SEED),
    "Perceptron" : Perceptron(n_jobs=-1),
}

In [416]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score

# Scoring functions keyed by the suffix used for the result columns.
# Each one is later called as metric(y_true, y_pred) with its default arguments.
metrics = dict(
    accuracy=accuracy_score,
    precision=precision_score,
    recall=recall_score,
    f1=f1_score,
    roc_auc=roc_auc_score,
)

In [417]:
def generate_metrics(models=None, scorers=None, splits=None):
    """Fit every classifier on the training split and score it on both splits.

    Parameters
    ----------
    models : dict, optional
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Defaults to the notebook-level ``classifiers`` dict.
    scorers : dict, optional
        Maps a metric name to a callable ``metric(y_true, y_pred)``.
        Defaults to the notebook-level ``metrics`` dict.
    splits : tuple, optional
        ``(X_train, X_test, y_train, y_test)``. Defaults to the notebook-level
        variables of the same names.

    Returns
    -------
    pandas.DataFrame
        One row per model: a ``Model`` column followed by a
        ``train_<metric>`` / ``test_<metric>`` pair for every metric.

    Notes
    -----
    Every scorer receives the hard labels returned by ``predict``; this
    includes ``roc_auc``, which therefore is not computed from scores or
    probabilities.
    """
    if models is None:
        models = classifiers
    if scorers is None:
        scorers = metrics
    if splits is None:
        splits = (X_train, X_test, y_train, y_test)
    X_tr, X_te, y_tr, y_te = splits

    rows = []

    for model_name, model in models.items():

        print (f"{model_name} ...")

        model.fit(X_tr, y_tr)

        # Predictions do not depend on the metric, so compute them once per
        # model instead of once per metric.
        pred_tr = model.predict(X_tr)   # seen data - mostly an overfitting indicator
        pred_te = model.predict(X_te)   # unseen data - the figure that matters

        row = {'Model': model_name}
        for metric_name, metric in scorers.items():
            row['train_'+metric_name] = metric(y_tr, pred_tr)
            row['test_'+metric_name] = metric(y_te, pred_te)

        rows.append(row)
    return pd.DataFrame(rows)

# Fit and score every entry of `classifiers`, then print the size of the resulting table.
df_results = generate_metrics()
print(df_results.shape)

KNN ...


KNN(3) ...
DT ...
DT(max_depth=5) ...
Perceptron ...
(5, 11)


In [418]:
def highlight_col(x):
    """Return a frame of CSS strings that colours a results table column-wise.

    Written for ``Styler.apply(highlight_col, axis=None)``, which passes the
    whole DataFrame in and expects a same-shaped DataFrame of CSS back.

    Parameters
    ----------
    x : pandas.DataFrame
        Table to style; column 0 is treated as the model-name column and the
        remaining columns as consecutive pairs.

    Returns
    -------
    pandas.DataFrame
        Same index/columns as ``x``: light green for column 0, then light blue
        and light yellow alternating for each following pair of columns.
    """
    model_color = 'background-color: lightgreen'
    alt_color = ['background-color: lightblue','background-color: lightyellow']

    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    styles.iloc[:, 0] = model_color
    # Walk the columns of the frame being styled (x), not of the unrelated
    # notebook-level dataset frame `df`.
    for k in range(1, x.shape[1], 2):
        styles.iloc[:, k:k+2] = alt_color[(k//2)%2]
    return styles
   
# axis=None hands the whole table to highlight_col in a single call.
df_results.style.apply(highlight_col, axis=None)

Unnamed: 0,Model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,KNN,0.998695,1.0,0.998012,1.0,1.0,1.0,0.999005,1.0,0.998106,1.0
1,KNN(3),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,DT,1.0,0.963542,1.0,0.97561,1.0,0.967742,1.0,0.97166,1.0,0.961812
3,DT(max_depth=5),0.915144,0.916667,0.94501,0.935484,0.924303,0.935484,0.934542,0.935484,0.911015,0.908918
4,Perceptron,0.979112,0.989583,0.969112,0.984127,1.0,1.0,0.984314,0.992,0.969697,0.985294


ROC-AUC (roc_auc_score): 
- Área sob a curva ROC (Receiver Operating Characteristic). Adequado para conjuntos de dados desbalanceados e `problemas de classificação binária`. Mede a capacidade do modelo de `distinguir entre instâncias positivas e negativas`.

Com o KNN(3) já obtivemos 100%. Não há necessidade de Hyperparameter Tuning (com GridSearch, por exemplo).

## Save Best Model

### KNN(3)

In [419]:
import joblib

In [420]:
# Rebuild the 3-neighbour KNN on its own so it can be saved and reused later.
knn3_model = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
knn3_model.fit(X_train, y_train)

In [421]:
# ROOT already ends with a separator; os.path.join avoids the doubled "/" in the saved path.
joblib.dump(knn3_model, os.path.join(ROOT, "output", "tic-tac-toe-model-data.joblib"))

['.//output/tic-tac-toe-model-data.joblib']

# Treino no Dataset Modificado

## Load Dataset

In [422]:
# NOTE(review): unpickling can execute arbitrary code; data_3.pkl is presumably produced by
# another notebook of this project - only load pickles from a trusted source.
# ROOT already ends with a separator, so os.path.join builds the path without doubling the "/".
# From here on `df` refers to the modified dataset, replacing the frame loaded earlier.
df = pd.read_pickle(os.path.join(ROOT, "data", "data_3.pkl"))
print(df.shape)
df.head()

(5478, 10)


Unnamed: 0,Top-left-square,Top-middle-square,Top-right-square,Middle-left-square,Middle-middle-square,Middle-right-square,Bottom-left-square,Bottom-middle-square,Bottom-right-square,Score
0,1,-1,1,1,0,0,0,0,-1,-1
1,1,1,-1,0,-1,-1,1,-1,1,1
2,0,1,0,0,0,0,1,-1,0,-1
3,1,0,-1,1,0,0,0,0,0,1
4,-1,1,-1,-1,1,-1,1,0,1,1


## Preprocessing Data

In [423]:
target = "Score"
print(f"target = {target}")

# Recompute the feature list from the newly loaded frame: all "category" columns except the label.
cat_features = df.select_dtypes("category").columns.drop(target, errors="ignore").tolist()
print(f"\nCategorical features ({len(cat_features)}): {cat_features}")


target = Score

Categorical features (9): ['Top-left-square', 'Top-middle-square', 'Top-right-square', 'Middle-left-square', 'Middle-middle-square', 'Middle-right-square', 'Bottom-left-square', 'Bottom-middle-square', 'Bottom-right-square']


In [424]:
# Feature matrix and label vector of the modified dataset.
X, y = df[cat_features], df[target]

## Train/Test Split

In [425]:
# Relative frequency of each label value in the modified dataset.
y.value_counts(normalize=True)

Score
1     0.535962
-1    0.464038
Name: proportion, dtype: float64

In [426]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Same seeded 80/20 hold-out split, now on the modified dataset (no stratification is requested).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    test_size=0.20,
    random_state=SEED,
)

print(X_train.shape, X_test.shape)
# Label proportions of the full label vector (not of the individual splits).
y.value_counts(normalize=True)

(4382, 9) (1096, 9)


Score
1     0.535962
-1    0.464038
Name: proportion, dtype: float64

#### Avaliando a performance do Model treinado com o outro dataset

In [427]:
# Only the KNN(3) fitted on the first dataset is evaluated in the next cell.
classifiers = dict(knn_3=knn3_model)

In [428]:
# Score the already-fitted model(s) on the modified dataset WITHOUT refitting.
# The model was trained on the other dataset, so both splits used here are
# data it was not fitted on.
data = []

for model_name, model in classifiers.items():

    print (f"{model_name} ...")

    # Predictions do not depend on the metric, so compute them once per model
    # instead of once per metric.
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    row = {'Model': model_name}

    for metric_name, metric in metrics.items():
        row['train_'+metric_name] = metric(y_train, y_pred_train)
        row['test_'+metric_name] = metric(y_test, y_pred_test)

    data.append(row)
df_results = pd.DataFrame(data)
print(df_results.shape)

knn_3 ...


(1, 11)


In [429]:
# Colour-code the scores of the pre-trained model (axis=None passes the whole table at once).
df_results.style.apply(highlight_col, axis=None)

Unnamed: 0,Model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,knn_3,0.613647,0.607664,0.620513,0.613139,0.720544,0.717949,0.666798,0.661417,0.605225,0.599679


## Eval Models

In [430]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import OneHotEncoder

#ohe = OneHotEncoder()
#ohe.fit(X_train)
#X_train = ohe.transform(X_train)
#X_test = ohe.transform(X_test)

# Fresh, unfitted baseline candidates for the modified dataset, keyed by display label.
classifiers = {
    "KNN" : KNeighborsClassifier(n_jobs=-1),
    "KNN(3)" : KNeighborsClassifier(3,n_jobs=-1),
    # Tree fitting draws random numbers (feature order, ties between equally good splits),
    # so random_state is pinned to make reruns of the notebook give the same trees.
    "DT" : DecisionTreeClassifier(max_features=9, random_state=SEED),
    "DT(max_depth=5)" : DecisionTreeClassifier(max_depth=5,max_features=9, random_state=SEED),
    "Perceptron" : Perceptron(n_jobs=-1),
}

In [431]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score

# Scoring functions keyed by the suffix used for the result columns.
# Each one is later called as metric(y_true, y_pred) with its default arguments.
metrics = dict(
    accuracy=accuracy_score,
    precision=precision_score,
    recall=recall_score,
    f1=f1_score,
    roc_auc=roc_auc_score,
)

In [432]:
def generate_metrics(models=None, scorers=None, splits=None):
    """Fit every classifier on the training split and score it on both splits.

    Parameters
    ----------
    models : dict, optional
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Defaults to the notebook-level ``classifiers`` dict.
    scorers : dict, optional
        Maps a metric name to a callable ``metric(y_true, y_pred)``.
        Defaults to the notebook-level ``metrics`` dict.
    splits : tuple, optional
        ``(X_train, X_test, y_train, y_test)``. Defaults to the notebook-level
        variables of the same names.

    Returns
    -------
    pandas.DataFrame
        One row per model: a ``Model`` column followed by a
        ``train_<metric>`` / ``test_<metric>`` pair for every metric.

    Notes
    -----
    Every scorer receives the hard labels returned by ``predict``; this
    includes ``roc_auc``, which therefore is not computed from scores or
    probabilities.
    """
    if models is None:
        models = classifiers
    if scorers is None:
        scorers = metrics
    if splits is None:
        splits = (X_train, X_test, y_train, y_test)
    X_tr, X_te, y_tr, y_te = splits

    rows = []

    for model_name, model in models.items():

        print (f"{model_name} ...")

        model.fit(X_tr, y_tr)

        # Predictions do not depend on the metric, so compute them once per
        # model instead of once per metric.
        pred_tr = model.predict(X_tr)   # seen data - mostly an overfitting indicator
        pred_te = model.predict(X_te)   # unseen data - the figure that matters

        row = {'Model': model_name}
        for metric_name, metric in scorers.items():
            row['train_'+metric_name] = metric(y_tr, pred_tr)
            row['test_'+metric_name] = metric(y_te, pred_te)

        rows.append(row)
    return pd.DataFrame(rows)

# Fit and score every entry of `classifiers` on the modified dataset, then print the table size.
df_results = generate_metrics()
print(df_results.shape)

KNN ...
KNN(3) ...
DT ...
DT(max_depth=5) ...
Perceptron ...
(5, 11)


In [433]:
def highlight_col(x):
    """Return a frame of CSS strings that colours a results table column-wise.

    Written for ``Styler.apply(highlight_col, axis=None)``, which passes the
    whole DataFrame in and expects a same-shaped DataFrame of CSS back.

    Parameters
    ----------
    x : pandas.DataFrame
        Table to style; column 0 is treated as the model-name column and the
        remaining columns as consecutive pairs.

    Returns
    -------
    pandas.DataFrame
        Same index/columns as ``x``: light green for column 0, then light blue
        and light yellow alternating for each following pair of columns.
    """
    model_color = 'background-color: lightgreen'
    alt_color = ['background-color: lightblue','background-color: lightyellow']

    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    styles.iloc[:, 0] = model_color
    # Walk the columns of the frame being styled (x), not of the unrelated
    # notebook-level dataset frame `df`.
    for k in range(1, x.shape[1], 2):
        styles.iloc[:, k:k+2] = alt_color[(k//2)%2]
    return styles
   
# axis=None hands the whole table to highlight_col in a single call.
df_results.style.apply(highlight_col, axis=None)

Unnamed: 0,Model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,KNN,0.715655,0.639599,0.73601,0.666667,0.73288,0.649573,0.734442,0.658009,0.714298,0.638876
1,KNN(3),0.788681,0.62865,0.810187,0.661818,0.791578,0.622222,0.800775,0.64141,0.788453,0.629115
2,DT,1.0,0.731752,1.0,0.762162,1.0,0.723077,1.0,0.742105,1.0,0.73238
3,DT(max_depth=5),0.672752,0.668796,0.678613,0.67236,0.740961,0.740171,0.708418,0.704638,0.667379,0.663628
4,Perceptron,0.534003,0.514599,0.566854,0.547748,0.55721,0.519658,0.561991,0.533333,0.532175,0.514232


## Hyperparameter Tuning

DecisionTreeClassifier

In [434]:
# Seed the tree so that the cross-validated scores, and with them the chosen
# parameters, are the same on every run of the notebook.
dt_model = DecisionTreeClassifier(random_state=SEED)

# Integer min_samples_split values are absolute sample counts, float values are fractions.
param_grid = {
    'max_depth': [7,11,12,15,16,20,None],
    'min_samples_split': [2, 5, 10, 0.1, 0.2, 0.5],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [9],
    'criterion': ['gini', 'entropy']
}

# Exhaustive search over the grid with 5-fold cross-validation on the training split only.
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': 15,
 'max_features': 9,
 'min_samples_leaf': 2,
 'min_samples_split': 2}

In [435]:
# Keep the tuned tree now: `grid_search` is rebound by the KNN search in the next cell.
best_dt = grid_search.best_estimator_

In [436]:
knn_model = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [3,5,7,15,20,25,30],
    'weights': ['uniform', 'distance'],
    # 'minkowski' is left out: with the estimator's default p=2 it is the same
    # distance as 'euclidean', so it only repeated a third of the grid.
    'metric': ['euclidean', 'manhattan'],
    'n_jobs':[-1]
}

# Exhaustive search over the grid with 5-fold cross-validation on the training split only.
grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

grid_search.best_params_

{'metric': 'euclidean', 'n_jobs': -1, 'n_neighbors': 25, 'weights': 'distance'}

In [437]:
# Best estimator found by the KNN grid search above.
best_knn = grid_search.best_estimator_

In [438]:
# The two tuned estimators, keyed by the names that the model-selection cells compare against.
classifiers = dict(
    best_knn=best_knn,
    best_DT=best_dt,
)

In [439]:
# Refit and score the two tuned estimators, then show the colour-coded table.
df_results = generate_metrics()
df_results.style.apply(highlight_col, axis=None)

best_knn ...
best_DT ...


Unnamed: 0,Model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,best_knn,1.0,0.898723,1.0,0.896321,1.0,0.916239,1.0,0.906171,1.0,0.897454
1,best_DT,0.910543,0.768248,0.941415,0.820116,0.888558,0.724786,0.914223,0.76951,0.912275,0.771395


## Save Best Model

### Best model

In [440]:
import joblib

In [441]:
# Index label of the row with the highest hold-out ROC AUC, then the model name stored in it.
best_model_index = df_results['test_roc_auc'].idxmax()
best_model = df_results.at[best_model_index, 'Model']

In [442]:
# Swap the winning name for the fitted estimator; any name other than 'best_knn' resolves to the tuned tree.
best_model = best_knn if best_model == 'best_knn' else best_dt

In [443]:
# ROOT already ends with a separator; os.path.join avoids the doubled "/" in the saved path.
joblib.dump(best_model, os.path.join(ROOT, "output", "tic-tac-toe-model-data2.joblib"))

['.//output/tic-tac-toe-model-data2.joblib']