# Develop an initial model

In [1]:
# Python modules
import dateutil
from datetime import datetime
import re

# Data science packages
import pandas as pd
import numpy as np

# Scikit Learn utility classes & functions
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# Scikit Learn models
from sklearn.linear_model import Lasso, ElasticNet, Ridge
from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.svm import SVR, LinearSVR

# Our own code
from src.data.preprocessing import betting_model_df, team_betting_model_df, cum_team_df
from src.model.metrics import measure_estimators

np.random.seed(42)

  from numpy.core.umath_tests import inner1d


In [2]:
# Set up data and create cumulative features

model_df = betting_model_df()
team_df = team_betting_model_df(model_df)
cum_df = cum_team_df(team_df)
cum_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,team,oppo_team,score,oppo_score,win_odds,line_odds,oppo_win_odds,oppo_line_odds,round_number,year,at_home,ladder_position,cum_percent,cum_win_points,last_week_score,oppo_ladder_position,oppo_cum_percent,oppo_cum_win_points,oppo_last_week_score
team,year,round_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adelaide,2010,2,Adelaide,Sydney,75.0,118.0,1.54,-12.5,2.49,12.5,2,2010,1.0,15,0.525424,0.0,62.0,9,0.916667,0.0,88.0
Adelaide,2010,3,Adelaide,Melbourne,41.0,57.0,1.52,-13.5,2.55,13.5,3,2010,1.0,14,0.580508,0.0,75.0,12,0.719212,0.0,85.0
Adelaide,2010,4,Adelaide,Carlton,55.0,103.0,1.81,-2.5,2.01,2.5,4,2010,1.0,15,0.607509,0.0,41.0,10,1.063910,4.0,75.0
Adelaide,2010,5,Adelaide,Western Bulldogs,72.0,121.0,7.50,40.5,1.09,-40.5,5,2010,1.0,15,0.588384,0.0,55.0,8,1.082873,8.0,79.0
Adelaide,2010,6,Adelaide,Port Adelaide,74.0,97.0,2.55,12.5,1.52,-12.5,6,2010,1.0,15,0.589942,0.0,72.0,10,0.801670,12.0,54.0
Adelaide,2010,7,Adelaide,Richmond,104.0,54.0,1.20,-29.5,4.66,29.5,7,2010,1.0,15,0.617264,0.0,74.0,16,0.491413,0.0,53.0
Adelaide,2010,8,Adelaide,North Melbourne,75.0,84.0,2.38,9.5,1.59,-9.5,8,2010,1.0,15,0.723054,4.0,104.0,11,0.780952,12.0,91.0
Adelaide,2010,9,Adelaide,Brisbane,93.0,81.0,2.14,5.5,1.72,-5.5,9,2010,1.0,15,0.742021,4.0,75.0,9,0.920792,16.0,74.0
Adelaide,2010,10,Adelaide,St Kilda,76.0,123.0,4.25,26.5,1.23,-26.5,10,2010,1.0,15,0.781513,8.0,93.0,5,1.172144,24.0,91.0
Adelaide,2010,11,Adelaide,Fremantle,105.0,82.0,2.89,17.5,1.42,-17.5,11,2010,1.0,15,0.760460,8.0,76.0,2,1.278221,32.0,139.0


In [3]:
# Set up & split data for models

team_features = pd.get_dummies(cum_df.drop(['score', 'oppo_score'], axis=1))
team_labels = pd.Series(cum_df['score'] - cum_df['oppo_score'], name='score_diff')
data = train_test_split(team_features, team_labels)

In [4]:
# Pass data to models & measure performance

linear = (Lasso(), ElasticNet(), Ridge(), LinearSVR())
measure_estimators(linear, data, model_type='regression')

ensemble = (AdaBoostRegressor(),
            BaggingRegressor(),
            ExtraTreesRegressor(),
            GradientBoostingRegressor(),
            RandomForestRegressor(),
            SVR(kernel='rbf'))
measure_estimators(ensemble, data, model_type='regression')



Lasso
Mean CV accuracy: 0.7188143778635837
Test accuracy: 0.732574679943101

Mean CV negative error score: -28.791164269733315
Test error score: 29.129133698565298


ElasticNet
Mean CV accuracy: 0.7178698877644065
Test accuracy: 0.7297297297297297

Mean CV negative error score: -28.992197233861855
Test error score: 29.583121157136382


Ridge
Mean CV accuracy: 0.7183381927480271
Test accuracy: 0.7311522048364154

Mean CV negative error score: -29.07693136463123
Test error score: 29.486379909946386


LinearSVR
Mean CV accuracy: 0.7202339273451835
Test accuracy: 0.7240398293029872

Mean CV negative error score: -29.284470143886058
Test error score: 29.952657286270316


AdaBoostRegressor
Mean CV accuracy: 0.7207157411264086
Test accuracy: 0.7254623044096729

Mean CV negative error score: -29.46279381196043
Test error score: 30.46100049979439


BaggingRegressor
Mean CV accuracy: 0.6941664509011493
Test accuracy: 0.701280227596017

Mean CV negative error score: -30.72708288773064
Test erro