## Dataset

In [None]:
import pandas as pd

# NOTE: the path below stops at the data directory and names no file; the CSV
# file name still has to be appended before this cell can load anything.
games = pd.read_csv('../data/') # TODO: add dataset name

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns of `games` used as model inputs (order is kept as-is downstream).
features = [
    'elo_difference',
    'WIN_PCT_10_home',
    'WIN_PCT_10_away',
    'FG_PCT_avg_10_HOME',
    'FG_PCT_avg_10_AWAY',
    'FT_PCT_avg_10_HOME',
    'FT_PCT_avg_10_AWAY',
    'FG3_PCT_avg_10_HOME',
    'FG3_PCT_avg_10_AWAY',
    'REB_avg_10_HOME',
    'REB_avg_10_AWAY',
]

# Input matrix and target column
X = games[features]
y = games['HOME_TEAM_WINS']

# Standardize every feature: learn the scaling parameters and apply them in one step
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Split the *raw* features first, then fit the scaler on the training rows only.
# Fitting the scaler on the full dataset before splitting lets test-set
# statistics (mean/std) leak into the training data.
# The row split depends only on the number of samples and random_state, so the
# same rows end up in train/test as when splitting the pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling parameters from the training rows and apply them to both sets
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

## Models

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded like the train/test split so that the fitted forest (and every metric
# derived from it) is reproducible across runs.
rdf = RandomForestClassifier(n_estimators=100, random_state=42)
# n_estimators=1000 gave better accuracy but took too long to train
rdf.fit(X_train, y_train)

#### Feature Importance

In [None]:
# Get the feature importances (one value per column, in the order of X.columns)
importances = rdf.feature_importances_

# Pair each feature name with its own importance *before* sorting; sorting the
# values alone and zipping them with the unsorted names mislabels every row.
sorted_importances = sorted(
    zip(X.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the feature names and importances, most important first
for feature, importance in sorted_importances:
    print(f"{feature}: {importance:.2f}")

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, trained on the training split
gnb = GaussianNB()
gnb.fit(X=X_train, y=y_train)

#### Feature Importance

In [None]:
# GaussianNB does not expose `feature_importances_`, so use model-agnostic
# permutation importance instead: the mean drop in the model's score on the
# held-out set when a single feature column is shuffled.
from sklearn.inspection import permutation_importance

permutation = permutation_importance(gnb, X_test, y_test, n_repeats=10, random_state=42)
importances = permutation.importances_mean

# Pair each feature name with its own importance before sorting
# (columns of X_test follow the order of X.columns)
sorted_importances = sorted(
    zip(X.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the feature names and importances, most important first.
# Score drops are typically small, hence four decimals instead of two.
for feature, importance in sorted_importances:
    print(f"{feature}: {importance:.4f}")

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression without a regularization penalty
lrg = LogisticRegression(penalty=None)

A `GridSearchCV` comparison shows that `penalty='l2'` performs very close to `penalty=None`, but slightly worse on both test metrics:

|params|None|l2|
|------|----|--|
|mean_test_AUC|0.69534876|0.69481222|
|mean_train_AUC|0.69622475|0.69620209|
|mean_test_Accuracy|0.66015328|0.65864726|
|mean_train_Accuracy|0.65961871|0.65985651|

In [None]:
# Train the logistic regression on the training split
lrg.fit(X_train, y_train)

#### Feature Importance

In [None]:
# LogisticRegression does not expose `feature_importances_`; its learned
# weights live in `coef_`. Row 0 holds one signed coefficient per feature
# (assumes a binary target, so `coef_` has a single row - TODO confirm).
# The model was trained on standardized features, so magnitudes are comparable.
importances = lrg.coef_[0]

# Pair each feature name with its own coefficient before sorting, and rank by
# absolute value: a strongly negative weight is as influential as a positive one.
sorted_importances = sorted(
    zip(X.columns, importances),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)

# Print the feature names and signed coefficients, most influential first
for feature, importance in sorted_importances:
    print(f"{feature}: {importance:.2f}")

### Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent class seen during training
dmc = DummyClassifier(random_state=42, strategy='most_frequent')
dmc.fit(X=X_train, y=y_train)


## Assessment

### Cross Validation

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

def cross_validation(model, X, y, cvn=10):
    """
    Thin wrapper around ``cross_val_score`` with stratified folds.

    Scores ``model`` on ``X``/``y`` using ``cvn`` stratified splits and
    returns the mean and the standard deviation of the per-fold scores.
    """
    folds = StratifiedKFold(n_splits=cvn)
    fold_scores = cross_val_score(model, X, y, cv=folds)
    return fold_scores.mean(), fold_scores.std()

### Confusion Matrices

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def confusion_matrix_data(model, X_test, y_test, name):
    """
    Build the confusion matrix of a fitted model on the given test data.

    The matrix is also plotted and the figure is saved under
    ``../iterations_evaluation/`` with ``name`` as file prefix.
    Returns the confusion matrix array (tn, fp, fn and tp for a binary target).
    """
    y_pred = model.predict(X_test)
    class_labels = model.classes_
    matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

    # draw the matrix, then export the resulting figure
    ConfusionMatrixDisplay(confusion_matrix=matrix,
                           display_labels=class_labels).plot()
    target = '../iterations_evaluation/' + name + '_confusion_matrix'
    plt.savefig(fname=target)

    return matrix

### ROC Curves

In [None]:
from sklearn.metrics import auc, roc_curve, RocCurveDisplay

def roc_data(y, pred, name):
    """
    Compute ROC information from true labels and predicted scores.

    Parameters:
        y: true binary labels.
        pred: predicted scores/probabilities of the positive class, aligned with ``y``.
        name: prefix of the exported image file.

    Export a graph for the curve to ``../iterations_evaluation/<name>_roc``.
    Return the AUC value.
    """
    # thresholds are not needed for the curve or the area
    fpr, tpr, _ = roc_curve(y, pred)
    roc_auc = auc(fpr, tpr)
    display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                              estimator_name='wins')
    # export graph
    display.plot()
    plt.savefig(fname='../iterations_evaluation/'+name+'_roc')

    return roc_auc

### FNR, FPR

In [None]:
def fpr_fnr_data(cm):
    """
    Compute False Positive Rate and False Negative Rate from a confusion matrix.

    Parameters:
        cm: 2x2 confusion matrix array laid out as [[tn, fp], [fn, tp]].

    Returns:
        (fpr, fnr) as Python floats. A rate is NaN when the matrix contains no
        samples of the corresponding true class (the rate is undefined).
    """
    # the confusion matrix has True Negative, False Positive, False Negative
    # and True Positive info.
    tn, fp, fn, tp = cm.ravel() # .ravel() flattens the array to a 1-Dimension array

    # The denominators must come from the same samples the matrix was built on,
    # so derive them from the matrix itself instead of counting labels elsewhere.
    negatives = tn + fp     # N: all truly negative samples
    positives = fn + tp     # P: all truly positive samples

    fpr = fp / negatives if negatives else float('nan')     # FPR = FP/N
    fnr = fn / positives if positives else float('nan')     # FNR = FN/P

    return float(fpr), float(fnr)

In [None]:
import json
from os import path

def indexes(model, y, X_test, y_test, name):
    """
    Compute accuracy, FPR, FNR, confusion matrix and ROC curve.
    Exports two images, one for the confusion matrix and the other for ROC;
    all other indexes are added to a json file.

    Parameters:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        y: target values of the full dataset; cross-validated together with
            the notebook-level raw feature matrix ``X``.
        X_test, y_test: held-out features and labels. ``X_test`` is expected
            to be preprocessed the same way as the data ``model`` was fit on.
        name: key of this run in the json file and prefix of the image files.

    Returns:
        (per-fold accuracy scores, confusion matrix, FPR, FNR, AUC)
    """
    # Local imports keep this cell self-contained
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Indexes computation
    # Cross-validation refits clones of the model on the raw features, so the
    # scaling is done inside the pipeline: each fold is standardized with its
    # own training statistics, matching how the models are trained.
    accuracy = cross_val_score(make_pipeline(StandardScaler(), model), X, y,
                               cv = StratifiedKFold(n_splits = 10))    # accuracy
    cm = confusion_matrix_data(model, X_test, y_test, name)             # confusion matrix
    fpr, fnr = fpr_fnr_data(cm)                                         # False Positive Rate, False Negative Rate
    # ROC/AUC on the held-out set only: scoring rows the model was trained on
    # (or features it was not scaled for) would give a misleading curve.
    area_under_curve = roc_data(y_test, model.predict_proba(X_test)[:, 1], name)

    # Indexes export handling
    json_filename = '../iterations_evaluation/iterations.json'

    # Start from the previously stored results, if any. A missing or empty file
    # simply means nothing has been recorded yet; the file is (re)written below.
    if path.isfile(json_filename) and path.getsize(json_filename) > 0:
        with open(file=json_filename, mode='r') as f:
            all_indexes = json.load(f)  # json file is translated to a python dictionary
    else:
        all_indexes = {}

    # add new indexes to dictionary (converted to native Python types for JSON)
    all_indexes.update({
        name: {
            'accuracy': [float(accuracy.mean()), float(accuracy.std())],
            'confusion_matrix': cm.ravel().tolist(),
            'FPR': float(fpr),
            'FNR': float(fnr),
            'AUC': float(area_under_curve)
        }
    })
    # rewrite file with new indexes
    with open(file=json_filename, mode='w') as f:
        json.dump(all_indexes, f, indent=4, separators=(',',': '))

    return accuracy, cm, fpr, fnr, area_under_curve

In [None]:
# Evaluate every fitted model and store its indexes under a dedicated key.
# The dummy classifier is named `dmc` where it is created and fitted.
dummy = indexes(dmc, y, X_test, y_test, 'dummy_final')
randomFor = indexes(rdf, y, X_test, y_test, 'RndFor_final')
NaiveBayes = indexes(gnb, y, X_test, y_test, 'NaiveBayes_final')
LogisticRegres = indexes(lrg, y, X_test, y_test, 'LogiRegre_final')