In [1]:
from flask import Flask, render_template, redirect, jsonify

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func
import json
from datetime import datetime
from pathlib import Path
import pandas as pd

import time

  
# Database Setup
connection_string = "postgres:Golfer7!@localhost:5432/afl_statistics_DB"
engine = create_engine(f'postgresql://{connection_string}')

# reflect an existing database into a new model
Base = automap_base()

# reflect the tables
Base.prepare(engine, reflect=True)

# Save reference to the table
afl_table = Base.classes.afl_team_performance

In [2]:
# Sklearn Packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Sklearn Evaluation Metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error, precision_score, confusion_matrix, accuracy_score

In [9]:
df = pd.read_sql_query('select * from "afl_team_performance_last_5_games"',con=engine)

In [10]:
# Visualizes all the columns
pd.set_option('display.max_columns', None)

In [11]:
df.head()

Unnamed: 0,team,gameid,date,year,round,venue,starttime,home_away,team_score,rainfall,team_points,opposing_team_score,win_loss_margin,win_loss_margin_percent,win_loss,disposals,kicks,marks,handballs,goals,behinds,hitouts,tackles,rebounds,inside50s,clearances,clangers,frees,frees_against,contested_possessions,uncontested_possessions,contested_marks,marks_inside50,one_percenters,bounces,goal_assists
0,Adelaide,2012PF02,2012-09-22,2012,PF,M.C.G.,17:15:00,0,92.0,1.0,0.0,97.0,-5.0,-5.0,0.0,363.0,213.0,115.0,150.0,14.0,7.0,45.0,39.0,47.0,38.0,33.0,52.0,22.0,24.0,126.0,241.0,15.0,12.0,47.0,18.0,12.0
1,Adelaide,2012QF02,2012-09-08,2012,QF,Football Park,14:45:00,1,42.0,0.2,0.0,71.0,-29.0,-69.0,0.0,334.0,201.0,79.0,133.0,5.0,10.0,42.0,50.0,26.0,59.0,33.0,50.0,23.0,17.0,146.0,185.0,11.0,6.0,55.0,17.0,2.0
2,Adelaide,2012R1004,2012-06-02,2012,R10,Subiaco,14:40:00,0,111.0,0.0,4.0,82.0,29.0,35.0,1.0,351.0,215.0,85.0,136.0,17.0,6.0,34.0,58.0,29.0,48.0,52.0,41.0,20.0,14.0,155.0,201.0,11.0,12.0,57.0,4.0,14.0
3,Adelaide,2012R1202,2012-06-15,2012,R12,Football Park,20:10:00,1,115.0,0.0,4.0,111.0,4.0,4.0,1.0,373.0,224.0,87.0,149.0,17.0,9.0,63.0,59.0,35.0,56.0,36.0,46.0,16.0,16.0,166.0,207.0,13.0,15.0,39.0,9.0,12.0
4,Adelaide,2012R1306,2012-06-24,2012,R13,Docklands,16:40:00,0,89.0,0.0,0.0,121.0,-32.0,-36.0,0.0,286.0,192.0,96.0,94.0,13.0,9.0,36.0,42.0,36.0,43.0,31.0,39.0,13.0,16.0,115.0,171.0,11.0,14.0,55.0,4.0,9.0


In [12]:
index = df.index
number_of_rows = len(index)
print(number_of_rows)

3976


In [13]:
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,team,gameid,date,year,round,venue,starttime,home_away,team_score,rainfall,team_points,opposing_team_score,win_loss_margin,win_loss_margin_percent,win_loss,disposals,kicks,marks,handballs,goals,behinds,hitouts,tackles,rebounds,inside50s,clearances,clangers,frees,frees_against,contested_possessions,uncontested_possessions,contested_marks,marks_inside50,one_percenters,bounces,goal_assists
0,Adelaide,2012PF02,2012-09-22,2012,PF,M.C.G.,17:15:00,0,92.0,1.0,0.0,97.0,-5.0,-5.0,0.0,363.0,213.0,115.0,150.0,14.0,7.0,45.0,39.0,47.0,38.0,33.0,52.0,22.0,24.0,126.0,241.0,15.0,12.0,47.0,18.0,12.0
1,Adelaide,2012QF02,2012-09-08,2012,QF,Football Park,14:45:00,1,42.0,0.2,0.0,71.0,-29.0,-69.0,0.0,334.0,201.0,79.0,133.0,5.0,10.0,42.0,50.0,26.0,59.0,33.0,50.0,23.0,17.0,146.0,185.0,11.0,6.0,55.0,17.0,2.0
2,Adelaide,2012R1004,2012-06-02,2012,R10,Subiaco,14:40:00,0,111.0,0.0,4.0,82.0,29.0,35.0,1.0,351.0,215.0,85.0,136.0,17.0,6.0,34.0,58.0,29.0,48.0,52.0,41.0,20.0,14.0,155.0,201.0,11.0,12.0,57.0,4.0,14.0
3,Adelaide,2012R1202,2012-06-15,2012,R12,Football Park,20:10:00,1,115.0,0.0,4.0,111.0,4.0,4.0,1.0,373.0,224.0,87.0,149.0,17.0,9.0,63.0,59.0,35.0,56.0,36.0,46.0,16.0,16.0,166.0,207.0,13.0,15.0,39.0,9.0,12.0
4,Adelaide,2012R1306,2012-06-24,2012,R13,Docklands,16:40:00,0,89.0,0.0,0.0,121.0,-32.0,-36.0,0.0,286.0,192.0,96.0,94.0,13.0,9.0,36.0,42.0,36.0,43.0,31.0,39.0,13.0,16.0,115.0,171.0,11.0,14.0,55.0,4.0,9.0


In [23]:
# Set features. This will also be used as your x values.
features = df.drop(columns=['date','team','venue', 'gameid','round','starttime', 'team_score', 'team_points', 'opposing_team_score', 'win_loss_margin', 'win_loss_margin_percent','goals','behinds'])
target = df.win_loss.values.reshape(-1,1)

In [25]:
print(features.shape, target.shape)

(3976, 23) (3976, 1)


In [26]:
#X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1, test_size=.40)

In [27]:
X_train = features[features['year'] < 2018]
X_test = features[features['year'] > 2017]

#y_train = X_train.win_loss.values.reshape(-1,1)
#y_test = X_test.win_loss.values.reshape(-1,1)
y_train = X_train.win_loss.values
y_test = X_test.win_loss.values
                  
X_train = X_train.drop(columns='win_loss')     
X_test = X_test.drop(columns='win_loss')    

In [28]:
# Checking if train test split ran correclty
for dataset in [y_train, y_test]:
    print(round(len(dataset)/len(target), 2))

0.61
0.39


In [29]:
y_test.shape

(1566,)

In [30]:
X_train.head()

Unnamed: 0,year,home_away,rainfall,disposals,kicks,marks,handballs,hitouts,tackles,rebounds,inside50s,clearances,clangers,frees,frees_against,contested_possessions,uncontested_possessions,contested_marks,marks_inside50,one_percenters,bounces,goal_assists
0,2012,0,1.0,363.0,213.0,115.0,150.0,45.0,39.0,47.0,38.0,33.0,52.0,22.0,24.0,126.0,241.0,15.0,12.0,47.0,18.0,12.0
1,2012,1,0.2,334.0,201.0,79.0,133.0,42.0,50.0,26.0,59.0,33.0,50.0,23.0,17.0,146.0,185.0,11.0,6.0,55.0,17.0,2.0
2,2012,0,0.0,351.0,215.0,85.0,136.0,34.0,58.0,29.0,48.0,52.0,41.0,20.0,14.0,155.0,201.0,11.0,12.0,57.0,4.0,14.0
3,2012,1,0.0,373.0,224.0,87.0,149.0,63.0,59.0,35.0,56.0,36.0,46.0,16.0,16.0,166.0,207.0,13.0,15.0,39.0,9.0,12.0
4,2012,0,0.0,286.0,192.0,96.0,94.0,36.0,42.0,36.0,43.0,31.0,39.0,13.0,16.0,115.0,171.0,11.0,14.0,55.0,4.0,9.0


In [31]:
X_test.head()

Unnamed: 0,year,home_away,rainfall,disposals,kicks,marks,handballs,hitouts,tackles,rebounds,inside50s,clearances,clangers,frees,frees_against,contested_possessions,uncontested_possessions,contested_marks,marks_inside50,one_percenters,bounces,goal_assists
137,2018,0,0.0,345.0,201.0,90.0,144.0,26.0,66.0,38.0,38.0,27.0,49.0,13.0,22.0,117.0,225.0,7.0,6.0,32.0,7.0,5.0
138,2018,0,0.0,387.0,204.0,83.0,183.0,37.0,50.0,43.0,53.0,40.0,48.0,12.0,19.0,142.0,242.0,11.0,10.0,60.0,10.0,7.0
139,2018,1,0.0,377.0,214.0,65.0,163.0,49.0,76.0,34.0,57.0,52.0,41.0,27.0,13.0,185.0,197.0,6.0,6.0,59.0,7.0,9.0
140,2018,0,21.8,360.0,205.0,81.0,155.0,42.0,54.0,35.0,50.0,34.0,43.0,19.0,18.0,150.0,215.0,12.0,7.0,49.0,13.0,8.0
141,2018,0,2.2,349.0,220.0,115.0,129.0,32.0,46.0,42.0,35.0,27.0,58.0,21.0,22.0,123.0,221.0,6.0,6.0,44.0,3.0,1.0


In [32]:
# Evaluation function

def evaluation(y_true, y_pred):
    
# Print Accuracy, Recall, F1 Score, and Precision metrics.
    print('Evaluation Metrics:')
    print('Accuracy: ' + str(metrics.accuracy_score(y_test, y_pred)))
    print('Recall: ' + str(metrics.recall_score(y_test, y_pred)))
    print('F1 Score: ' + str(metrics.f1_score(y_test, y_pred)))
    print('Precision: ' + str(metrics.precision_score(y_test, y_pred)))
    
# Print Confusion Matrix
    print('\nConfusion Matrix:')
    print(' TN,  FP, FN, TP')
    print(confusion_matrix(y_true, y_pred).ravel())
    
# Function Prints best parameters for GridSearchCV
def print_results(results):
    print('Best Parameters: {}\n'.format(results.best_params_)) 

In [33]:
# Logistic Regression Model
lr = LogisticRegression(C=100, max_iter=400, class_weight='balanced')

# Fitting Model to the train set
lr.fit(X_train, y_train)

# Predicting on the test set
y_pred = lr.predict(X_test)

# Evaluating model
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.7586206896551724
Recall: 0.6422136422136422
F1 Score: 0.7252906976744187
Precision: 0.8330550918196995

Confusion Matrix:
 TN,  FP, FN, TP
[689 100 278 499]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
