In [15]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import math

In [4]:
#Testing MLFlow
#Commenting out to not run it again

#from sklearn.linear_model import LogisticRegression

#if __name__ == "__main__":
#    X = np.array([-2, -1, 0, 1, 2, 1]).reshape(-1, 1)
#    y = np.array([0, 0, 1, 1, 1, 0])
#    lr = LogisticRegression()
#    lr.fit(X, y)
#    score = lr.score(X, y)
#    print("Score: %s" % score)
#    mlflow.log_metric("score", score)
#    mlflow.sklearn.log_model(lr, "model")
#    print("Model saved in run %s" % mlflow.active_run().info.run_uuid)
#    mlflow.end_run()

## Testing Random Forest Model

In [8]:
from sklearn.model_selection import train_test_split

In [None]:
# Select, by name, the MLflow experiment that this section works under
experiment_name = "RF 100 Games"
mlflow.set_experiment(experiment_name=experiment_name)

In [None]:
# Read the games CSV into a DataFrame (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='csvs/Games_1_100.csv')

In [None]:
# Target is the 'home_win' column; the features are every remaining column
# except the 'game_id' identifier.
y = df['home_win']
X = df.drop(columns=['game_id', 'home_win'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted scaler is then applied to both the training and the test rows.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(x_train)
x_train2 = sc.transform(x_train)
x_test2 = sc.transform(x_test)

In [None]:
# Random forest on the standardised features, evaluated on the hold-out rows
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Only one model is evaluated, so it is built directly instead of iterating over
# a one-element list. `name` and `method` are still bound, as the loop bound them.
name = 'RandomForestClassifier'
method = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=10)

method.fit(x_train2, y_train)
predict = method.predict(x_test2)

# Display names for the report rows; assumes the first class label means a
# home loss and the second a home win — TODO confirm against the data.
target_names = ['loss', 'win']
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict, target_names=target_names))

## Trial 2 - Testing different accuracies

In [None]:
# Drop the 'game_id' identifier column.
# The frame is rebound rather than mutated with inplace=True, and
# errors='ignore' tolerates the column already being absent, so this cell can be
# re-run without raising KeyError.
df = df.drop(columns=['game_id'], errors='ignore')

In [None]:
# Target is the 'home_win' column; every other column is a feature
y = df['home_win']
X = df.drop(columns=['home_win'])

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Random forest on the unscaled features: hold-out metrics plus repeated
# stratified 10-fold cross-validation over the full data set.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean
from numpy import std

# Only one model is evaluated, so it is built directly instead of iterating over
# a one-element list. `name` and `method` are still bound, as the loop bound them.
name = 'RandomForestClassifier'
method = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=10)

method.fit(x_train, y_train)
predict = method.predict(x_test)
target_names = ['loss', 'win']

# Absolute difference between prediction and truth for every hold-out row
errors = np.abs(predict - y_test)

# 10 folds x 3 repeats, scored once for accuracy and once for (negated) MAE
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(method, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
n2_scores = cross_val_score(method, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')

# The 'neg_mean_absolute_error' scorer yields negated errors; flip the sign so
# the number printed under "MAE" really is a mean absolute error.
mae_scores = -n2_scores
print('MAE: %.3f (%.3f)' % (mean(mae_scores), std(mae_scores)))
# Accuracy performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# Hold-out mean absolute error (a plain error rate-style number, so no unit is printed)
print('Mean Absolute Error:', round(np.mean(errors), 2))
print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict, target_names=target_names))

## Testing RF Model with MLFlow

In [None]:
# Train the baseline random forest and record its metrics and model in MLflow.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Launch the experiment on mlflow
experiment_name = "100 Games Test"
mlflow.set_experiment(experiment_name)

# Read from csvs/ like the other cells of this notebook, and drop the
# identifier column without mutating in place.
df = pd.read_csv('csvs/Games_1_100.csv').drop(columns=['game_id'])

# Defining X and y variables
X = df.drop(columns=['home_win'])
y = df['home_win']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Only one model is evaluated, so it is built directly instead of iterating over
# a one-element list. `name` and `method` are still bound, as the loop bound them.
name = 'RandomForestClassifier'
method = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=10)

# The context manager closes the run even when a step below raises, so a failed
# execution cannot leave an active run for later cells to log into.
with mlflow.start_run():
    # Fit and predict on the split made in this cell, not on the scaled arrays
    # (x_train2 / x_test2) left behind by an earlier cell.
    method.fit(x_train, y_train)
    predict = method.predict(x_test)
    target_names = ['loss', 'win']

    metrics = {'Accuracy': accuracy_score(y_test, predict)}
    cm = confusion_matrix(y_test, predict)
    # ravel() of a 2x2 confusion matrix yields tn, fp, fn, tp in that order;
    # the unpacking fails if the matrix is not 2x2.
    t_n, f_p, f_n, t_p = cm.ravel()

    # Log in mlflow (metrics)
    mlflow.log_metric("tn", t_n)
    mlflow.log_metric("fp", f_p)
    mlflow.log_metric("fn", f_n)
    mlflow.log_metric("tp", t_p)
    mlflow.log_metrics(metrics)

    # Logging model
    mlflow.sklearn.log_model(method, "RF-model")

#### Adding to the model

In [None]:
# Train a random forest with explicit hyper-parameters and record the
# parameters, metrics and fitted model in MLflow.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Launch the experiment on mlflow
experiment_name = "100 Games Test"
mlflow.set_experiment(experiment_name)

# Load the data and drop the identifier column without mutating in place
df = pd.read_csv('csvs/Games_1_100.csv').drop(columns=['game_id'])

# Defining X and y variables
X = df.drop(columns=['home_win'])
y = df['home_win']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Hyper-parameters of the forest (also logged to MLflow below)
n_estimators = 10
max_depth = 6
max_features = 3

# Only one model is evaluated, so it is built directly instead of iterating over
# a one-element list. `name` and `method` are still bound, as the loop bound them.
name = 'RandomForestClassifier'
method = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    random_state=10,
)

# The context manager closes the run even when a step below raises, so a failed
# execution cannot leave an active run for later cells to log into.
with mlflow.start_run():
    method.fit(x_train, y_train)
    predict = method.predict(x_test)
    target_names = ['loss', 'win']

    # Log parameters
    mlflow.log_param("num_trees", n_estimators)
    mlflow.log_param("maxdepth", max_depth)
    mlflow.log_param("max_feat", max_features)

    # Creating metrics
    metrics = {'Accuracy': accuracy_score(y_test, predict)}
    cm = confusion_matrix(y_test, predict)
    # ravel() of a 2x2 confusion matrix yields tn, fp, fn, tp in that order;
    # the unpacking fails if the matrix is not 2x2.
    t_n, f_p, f_n, t_p = cm.ravel()

    # Log in mlflow (metrics)
    mlflow.log_metric("tn", t_n)
    mlflow.log_metric("fp", f_p)
    mlflow.log_metric("fn", f_n)
    mlflow.log_metric("tp", t_p)
    mlflow.log_metrics(metrics)

    # Logging model
    mlflow.sklearn.log_model(method, "RF-model")

## Testing Logistic Regression Model

In [31]:
# Train a logistic regression on the first 100 games and record its solver,
# metrics and fitted model in MLflow.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Launch the experiment on mlflow
experiment_name = "100 Games Test"
mlflow.set_experiment(experiment_name)

# Load the data and drop the identifier column without mutating in place
df = pd.read_csv('csvs/Games_1_100.csv').drop(columns=['game_id'])

# Defining X and y variables
X = df.drop(columns=['home_win'])
y = df['home_win']

# Splitting data for training and testing
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# `model` is an ordinary notebook-level name that later cells read; no
# `global` statement is needed at this level.
solver = 'liblinear'
model = LogisticRegression(solver=solver, random_state=10)

# The context manager closes the run even when a step below raises, so a failed
# execution cannot leave an active run for later cells to log into.
with mlflow.start_run():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    target_names = ['loss', 'win']

    # Log parameters
    mlflow.log_param("solver", solver)

    # Creating metrics. The confusion matrix is built from this model's own
    # predictions (y_pred), not from `predict`, which holds the random-forest
    # predictions of an earlier cell.
    metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
    cm = confusion_matrix(y_test, y_pred)
    # ravel() of a 2x2 confusion matrix yields tn, fp, fn, tp in that order;
    # the unpacking fails if the matrix is not 2x2.
    t_n, f_p, f_n, t_p = cm.ravel()

    # Log in mlflow (metrics)
    mlflow.log_metric("tn", t_n)
    mlflow.log_metric("fp", f_p)
    mlflow.log_metric("fn", f_n)
    mlflow.log_metric("tp", t_p)
    mlflow.log_metrics(metrics)

    # Logging model
    mlflow.sklearn.log_model(model, "LR-model")

#print(accuracy_score(y_test, y_pred))
# Report training set score
#train_score = model.score(x_train, y_train) * 100
# Report test set score
#test_score = model.score(x_test, y_test) * 100
#print(confusion_matrix(y_test,y_pred))  
#print(classification_report(y_test,y_pred,target_names=target_names))

#### Looking into feature importance

In [22]:
# Pull the fitted parameters out of the logistic regression:
# w0 is the first intercept entry, w is the first row of the coefficient matrix.
w0 = model.intercept_[0]
w = model.coef_[0]
# Each coefficient is also exposed under its own name; this unpacking requires
# the coefficient vector to have exactly 12 entries.
w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12 = w

array([ 0.09078132,  0.14094186,  0.20727365,  0.47025203,  0.43889043,
        0.90348524,  0.6187065 , -0.19073833,  0.0078641 , -0.0482597 ,
        0.076516  , -0.64830143])

In [None]:
# Rank the logistic-regression features by exp(coefficient) and save a bar chart.
import os

# Feature names are taken from the current `df`; they must line up one-to-one
# with the coefficient vector `w`. Fail early with a clear message if `df` is
# not in the state the model was trained on (for example it still holds 'game_id').
labels = df.drop('home_win', axis=1).columns
if len(labels) != len(w):
    raise ValueError(
        "Feature/coefficient mismatch: %d feature columns but %d coefficients"
        % (len(labels), len(w))
    )

# exp(w) is computed element-wise with numpy.
# NOTE(review): sorting by exp(coefficient) ranks strongly negative
# coefficients last even though they may matter as much as positive ones —
# confirm this is the intended notion of "importance".
feature_importance = pd.DataFrame({"feature": labels, "importance": np.exp(w)})
feature_importance = feature_importance.sort_values(by=["importance"], ascending=False)

# Image formatting
axis_fs = 18  # fontsize
title_fs = 22  # fontsize
sns.set(style="whitegrid")

# Plotting on an explicitly created figure/axes
fig, ax = plt.subplots()
sns.barplot(x="importance", y="feature", data=feature_importance, ax=ax)
ax.set_xlabel('Importance', fontsize=axis_fs)
ax.set_ylabel('Feature', fontsize=axis_fs)
ax.set_title('Logistic Regression\nfeature importance', fontsize=title_fs)

fig.tight_layout()
# Make sure the output directory exists before writing the image
os.makedirs("images", exist_ok=True)
fig.savefig("images/LR_feature_importance.png", dpi=120)