In [154]:
import pandas as pd
import numpy as np
from flaml import AutoML
import yaml


%load_ext autoreload
%autoreload 2
%reload_ext autoreload




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [168]:
# Read YAML file
with open("config.yaml", "r") as stream:
    config_input = yaml.safe_load(stream)

# FLAML
FLAML_SETTINGS = config_input["flaml"]["settings"]

# Data
FEATURE_SET = config_input["data"]["feature_set"]
TEST_YEAR = config_input["data"]["training"]["test_year"]
TARGET = config_input["data"]["training"]["target"]
TRAIN_COLUMNS = config_input["data"]["training"]["train_cols"]


In [169]:
df = pd.read_csv(FEATURE_SET)
df["Date"] = pd.to_datetime(df["Date"])
# df = df.loc[df['Round']==1]
df_test = df.loc[df["Date"].dt.year == TEST_YEAR]
df_train = df.loc[df["Date"].dt.year != TEST_YEAR]


In [161]:
df.head(5)


Unnamed: 0.1,Unnamed: 0,Date,Round,Game number,Venue,Home Team,Away Team,Home team score,Away team score,Home team score detail,...,Home_L_away,Away_W_away,Away_D_away,Away_L_away,Stk_away,Pts_away,%_away,Stkn_away,Stkd_away,target
0,0,2000-03-16 18:40:00,2,1,M.C.G.,Essendon,Richmond,130,87,"[5, 4, 10, 7, 15, 7, 20, 10]",...,0.0,0.0,0.0,0.0,1W,4.0,102.17,1.0,W,1
1,1,2000-03-17 18:40:00,2,2,M.C.G.,Melbourne,North Melbourne,145,120,"[7, 0, 13, 2, 18, 4, 23, 7]",...,1.0,0.0,0.0,0.0,1L,0.0,72.08,-1.0,L,1
2,2,2000-03-18 13:10:00,2,3,Kardinia Park,Geelong,St Kilda,139,99,"[7, 4, 11, 7, 15, 13, 21, 13]",...,1.0,0.0,0.0,0.0,1L,0.0,74.63,-1.0,L,1
3,3,2000-03-18 18:40:00,2,4,Football Park,Port Adelaide,Fremantle,87,125,"[3, 5, 7, 9, 9, 13, 12, 15]",...,1.0,0.0,0.0,0.0,1L,0.0,82.95,-1.0,L,0
4,4,2000-03-19 13:10:00,2,5,M.C.G.,Collingwood,Adelaide,103,92,"[5, 5, 7, 10, 10, 14, 14, 19]",...,1.0,0.0,0.0,0.0,1L,0.0,82.44,-1.0,L,1


In [137]:

numerical_cols = [
    "Pos_home",
    "P_home",
    "W_home",
    "D_home",
    "L_home",
    "For_home",
    "Agn_home",
    "Max_home",
    "Min_home",
    "Home_W_home",
    "Home_D_home",
    "Home_L_home",
    "Away_W_home",
    "Away_D_home",
    "Away_L_home",
    "Stk_home",
    "Pts_home",
    "%_home",
    "Stkn_home",
    "Stkd_home",
    "Pos_away",
    "P_away",
    "W_away",
    "D_away",
    "L_away",
    "For_away",
    "Agn_away",
    "Max_away",
    "Min_away",
    "Home_W_away",
    "Home_D_away",
    "Home_L_away",
    "Away_W_away",
    "Away_D_away",
    "Away_L_away",
    "Stk_away",
    "Pts_away",
    "%_away",
    "Stkn_away",
    "Stkd_away",
]

numerical_cols = [
    # "%_home",
    # "%_away",
    # "For_home",
    # "For_away",
    # "Agn_home",
    # "Agn_away",
    "Pts_home",
    "Pts_away",
    "Round"
]

target = "target"


In [138]:
settings = {
    "time_budget": 300,  # total running time in seconds
    "metric": "log_loss",  # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
    # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'
    "task": "classification",  # task type
    "log_file_name": 'airlines_experiment.log',  # flaml log file
    "seed": 7654321,  # random seed
    "verbose": 0,
    "ensemble": True
}


In [139]:


automl = AutoML()
automl.fit(df_train[numerical_cols], df_train[target], **settings)




In [140]:
print("Best ML leaner:", automl.best_estimator)
print("Best hyperparmeter config:", automl.best_config)
print("Best log_loss on validation data: {0:.4g}".format(1 - automl.best_loss))
print("Training duration of best run: {0:.4g} s".format(automl.best_config_train_time))



Best ML leaner: xgb_limitdepth
Best hyperparmeter config: {'n_estimators': 605, 'max_depth': 2, 'min_child_weight': 13.06946287629127, 'learning_rate': 0.02442816070393605, 'subsample': 0.3792778389216569, 'colsample_bylevel': 0.6315239528200942, 'colsample_bytree': 0.922760488427569, 'reg_alpha': 0.021678786878906026, 'reg_lambda': 1.7572615931426092}
Best log_loss on validation data: 0.3772
Training duration of best run: 1.485 s


In [124]:
# automl.model.estimator
#

In [141]:
"""compute predictions of testing dataset"""
# y_pred = automl.predict(X_test)
# print('Predicted labels', y_pred)
# print('True labels', y_test)
# df_score = df
df_test["score"] = automl.predict_proba(df_test[numerical_cols])[:, 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["score"] = automl.predict_proba(df_test[numerical_cols])[:, 1]


In [142]:
def score_function(dataf):
    dataf.loc[dataf["target"] == 1, "prob_score"] = 1 + np.log2(dataf["score"])
    dataf.loc[dataf["target"] == 0, "prob_score"] = 1 + np.log2(1 - dataf["score"])
    return dataf["prob_score"]


In [143]:
df_test["prob_score"] = score_function(df_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prob_score"] = score_function(df_test)


In [153]:
df_test[
    [
        "Date",
        "Round",
        "Game number",
        "Home Team",
        "Away Team",
        "score",
        "target",
        "prob_score",
        "For_home",
        "W_home"
    ]
].sort_values(by="prob_score", ascending=True).head(20)

Unnamed: 0,Date,Round,Game number,Home Team,Away Team,score,target,prob_score,For_home,W_home
4387,2022-07-10 13:10:00,17,7,Brisbane Lions,Essendon,0.866067,0,-1.90042,1499,11.0
4358,2022-06-17 19:50:00,14,2,St Kilda,Essendon,0.834837,0,-1.598033,1010,8.0
4393,2022-07-16 16:35:00,18,4,North Melbourne,Richmond,0.21123,1,-1.243111,895,1.0
4248,2022-03-18 18:50:00,1,3,St Kilda,Collingwood,0.783355,0,-1.206596,1644,10.0
4413,2022-07-30 19:30:00,20,6,Adelaide,Carlton,0.217997,1,-1.19762,1377,5.0
4305,2022-04-30 19:25:00,7,6,St Kilda,Port Adelaide,0.779913,0,-1.183856,573,5.0
4335,2022-05-22 17:20:00,10,9,Fremantle,Collingwood,0.767428,0,-1.104253,736,7.0
4374,2022-07-02 13:45:00,16,3,Essendon,Sydney,0.25077,1,-0.995561,1043,3.0
4254,2022-03-20 18:40:00,1,9,West Coast,Gold Coast,0.740037,0,-0.943623,1752,10.0
4312,2022-05-07 13:45:00,8,4,Sydney,Gold Coast,0.729445,0,-0.886005,684,5.0


In [145]:
df_test[
    [
        "Date",
        "Round",
        "Game number",
        "Home Team",
        "Away Team",
        "score",
        "target",
        "prob_score",
        "Pts_home",
        "Pts_away"
    ]
].sort_values(by="prob_score", ascending=True).head(20)

Unnamed: 0,Date,Round,Game number,Home Team,Away Team,score,target,prob_score,Pts_home,Pts_away
4387,2022-07-10 13:10:00,17,7,Brisbane Lions,Essendon,0.866067,0,-1.90042,44.0,16.0
4358,2022-06-17 19:50:00,14,2,St Kilda,Essendon,0.834837,0,-1.598033,32.0,8.0
4393,2022-07-16 16:35:00,18,4,North Melbourne,Richmond,0.21123,1,-1.243111,4.0,36.0
4248,2022-03-18 18:50:00,1,3,St Kilda,Collingwood,0.783355,0,-1.206596,40.0,24.0
4413,2022-07-30 19:30:00,20,6,Adelaide,Carlton,0.217997,1,-1.19762,20.0,48.0
4305,2022-04-30 19:25:00,7,6,St Kilda,Port Adelaide,0.779913,0,-1.183856,20.0,4.0
4335,2022-05-22 17:20:00,10,9,Fremantle,Collingwood,0.767428,0,-1.104253,28.0,16.0
4374,2022-07-02 13:45:00,16,3,Essendon,Sydney,0.25077,1,-0.995561,12.0,36.0
4254,2022-03-20 18:40:00,1,9,West Coast,Gold Coast,0.740037,0,-0.943623,40.0,28.0
4312,2022-05-07 13:45:00,8,4,Sydney,Gold Coast,0.729445,0,-0.886005,20.0,8.0


In [146]:
df_test[
    [
        "Date",
        "Round",
        "Game number",
        "Home Team",
        "Away Team",
        "score",
        "target",
        "prob_score",
        "For_home",
        "W_home"
    ]
].sort_values(by="prob_score", ascending=True).head(20)


Unnamed: 0,Date,Round,Game number,Home Team,Away Team,score,target,prob_score,For_home,W_home
4387,2022-07-10 13:10:00,17,7,Brisbane Lions,Essendon,0.866067,0,-1.90042,1499,11.0
4358,2022-06-17 19:50:00,14,2,St Kilda,Essendon,0.834837,0,-1.598033,1010,8.0
4393,2022-07-16 16:35:00,18,4,North Melbourne,Richmond,0.21123,1,-1.243111,895,1.0
4248,2022-03-18 18:50:00,1,3,St Kilda,Collingwood,0.783355,0,-1.206596,1644,10.0
4413,2022-07-30 19:30:00,20,6,Adelaide,Carlton,0.217997,1,-1.19762,1377,5.0
4305,2022-04-30 19:25:00,7,6,St Kilda,Port Adelaide,0.779913,0,-1.183856,573,5.0
4335,2022-05-22 17:20:00,10,9,Fremantle,Collingwood,0.767428,0,-1.104253,736,7.0
4374,2022-07-02 13:45:00,16,3,Essendon,Sydney,0.25077,1,-0.995561,1043,3.0
4254,2022-03-20 18:40:00,1,9,West Coast,Gold Coast,0.740037,0,-0.943623,1752,10.0
4312,2022-05-07 13:45:00,8,4,Sydney,Gold Coast,0.729445,0,-0.886005,684,5.0


In [147]:
df_test["prob_score"].sum()


33.48650654972192

In [90]:


'''pickle and save the automl object'''
import pickle
with open('automl.pkl', 'wb') as f:
    pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
'''load pickled automl object'''
with open('automl.pkl', 'rb') as f:
    automl = pickle.load(f)




In [73]:
current_round = pd.read_csv('current_round.csv')

In [148]:
current_round["score"] = automl.predict_proba(current_round[numerical_cols])[:, 1]
current_round[
    [
        "Date",
        "Round",
        "Game number",
        "Home Team",
        "Away Team",
        "score",
        "For_home",
        "W_home",
        "W_away"
    ]
].sort_values(by="score", ascending=True).head(20)

Unnamed: 0,Date,Round,Game number,Home Team,Away Team,score,For_home,W_home,W_away
5,2023-03-18 20:00:00,1,6,Gold Coast,Sydney,0.336915,1871,10.0,16.0
3,2023-03-18 16:35:00,1,4,Port Adelaide,Brisbane Lions,0.369296,1806,10.0,15.0
8,2023-03-19 16:40:00,1,9,St Kilda,Fremantle,0.433362,1703,11.0,15.0
6,2023-03-19 13:10:00,1,7,Greater Western Sydney,Adelaide,0.507303,1631,6.0,8.0
2,2023-03-18 13:45:00,1,3,North Melbourne,West Coast,0.577578,1337,2.0,2.0
7,2023-03-19 15:20:00,1,8,Hawthorn,Essendon,0.657099,1787,8.0,7.0
0,2023-03-16 19:20:00,1,1,Richmond,Carlton,0.677889,2165,13.0,12.0
1,2023-03-17 19:40:00,1,2,Geelong,Collingwood,0.713426,2146,18.0,16.0
4,2023-03-18 19:25:00,1,5,Melbourne,Western Bulldogs,0.764716,1936,16.0,12.0


In [118]:
current_round["score"] = automl.predict_proba(current_round[numerical_cols])[:, 1]
current_round[
    [
        "Date",
        "Round",
        "Game number",
        "Home Team",
        "Away Team",
        "score",
        "For_home",
        "W_home",
        "W_away"
    ]
].sort_values(by="score", ascending=True).head(20)

Unnamed: 0,Date,Round,Game number,Home Team,Away Team,score,For_home,W_home,W_away
5,2023-03-18 20:00:00,1,6,Gold Coast,Sydney,0.434582,1871,10.0,16.0
8,2023-03-19 16:40:00,1,9,St Kilda,Fremantle,0.517898,1703,11.0,15.0
3,2023-03-18 16:35:00,1,4,Port Adelaide,Brisbane Lions,0.589319,1806,10.0,15.0
6,2023-03-19 13:10:00,1,7,Greater Western Sydney,Adelaide,0.657994,1631,6.0,8.0
2,2023-03-18 13:45:00,1,3,North Melbourne,West Coast,0.68946,1337,2.0,2.0
0,2023-03-16 19:20:00,1,1,Richmond,Carlton,0.739468,2165,13.0,12.0
7,2023-03-19 15:20:00,1,8,Hawthorn,Essendon,0.739793,1787,8.0,7.0
4,2023-03-18 19:25:00,1,5,Melbourne,Western Bulldogs,0.825064,1936,16.0,12.0
1,2023-03-17 19:40:00,1,2,Geelong,Collingwood,0.886449,2146,18.0,16.0


In [149]:
current_round["score"] = automl.predict_proba(current_round[numerical_cols])[:, 1]
current_round[
    [
        "Date",
        "Round",
        "Game number",
        "Home Team",
        "Away Team",
        "score",
        "For_home",
        "W_home",
        "W_away"
    ]
].sort_values(by="score", ascending=True).head(20)

Unnamed: 0,Date,Round,Game number,Home Team,Away Team,score,For_home,W_home,W_away
5,2023-03-18 20:00:00,1,6,Gold Coast,Sydney,0.336915,1871,10.0,16.0
3,2023-03-18 16:35:00,1,4,Port Adelaide,Brisbane Lions,0.369296,1806,10.0,15.0
8,2023-03-19 16:40:00,1,9,St Kilda,Fremantle,0.433362,1703,11.0,15.0
6,2023-03-19 13:10:00,1,7,Greater Western Sydney,Adelaide,0.507303,1631,6.0,8.0
2,2023-03-18 13:45:00,1,3,North Melbourne,West Coast,0.577578,1337,2.0,2.0
7,2023-03-19 15:20:00,1,8,Hawthorn,Essendon,0.657099,1787,8.0,7.0
0,2023-03-16 19:20:00,1,1,Richmond,Carlton,0.677889,2165,13.0,12.0
1,2023-03-17 19:40:00,1,2,Geelong,Collingwood,0.713426,2146,18.0,16.0
4,2023-03-18 19:25:00,1,5,Melbourne,Western Bulldogs,0.764716,1936,16.0,12.0


In [150]:
mapper = pd.read_csv('submit_mapper.csv')
df_mapped = pd.merge(current_round, mapper, left_on='Home Team', right_on='Season_team')

In [151]:
df_mapped

Unnamed: 0,Date,Round,Game number,Venue,Home Team,Away Team,Home team score,Away team score,Home team score detail,Away team score detail,...,Away_D_away,Away_L_away,Stk_away,Pts_away,%_away,Stkn_away,Stkd_away,score,Submit_team,Season_team
0,2023-03-16 19:20:00,1,1,M.C.G.,Richmond,Carlton,,,,,...,0.0,7.0,4L,48.0,108.34,-4.0,L,0.677889,Richmond,Richmond
1,2023-03-17 19:40:00,1,2,M.C.G.,Geelong,Collingwood,,,,,...,0.0,3.0,1W,64.0,104.31,1.0,W,0.713426,Geelong,Geelong
2,2023-03-18 13:45:00,1,3,Docklands,North Melbourne,West Coast,,,,,...,0.0,10.0,8L,8.0,59.82,-8.0,L,0.577578,Kangaroos,North Melbourne
3,2023-03-18 16:35:00,1,4,Adelaide Oval,Port Adelaide,Brisbane Lions,,,,,...,0.0,5.0,1L,60.0,119.34,-1.0,L,0.369296,P_Adelaide,Port Adelaide
4,2023-03-18 19:25:00,1,5,M.C.G.,Melbourne,Western Bulldogs,,,,,...,0.0,6.0,2W,48.0,108.89,2.0,W,0.764716,Melbourne,Melbourne
5,2023-03-18 20:00:00,1,6,Carrara,Gold Coast,Sydney,,,,,...,0.0,4.0,7W,64.0,127.91,7.0,W,0.336915,Gold_Coast,Gold Coast
6,2023-03-19 13:10:00,1,7,Manuka Oval,Greater Western Sydney,Adelaide,,,,,...,0.0,8.0,1L,32.0,86.66,-1.0,L,0.507303,G_W_Sydney,Greater Western Sydney
7,2023-03-19 15:20:00,1,8,M.C.G.,Hawthorn,Essendon,,,,,...,0.0,9.0,3L,28.0,83.23,-3.0,L,0.657099,Hawthorn,Hawthorn
8,2023-03-19 16:40:00,1,9,Docklands,St Kilda,Fremantle,,,,,...,1.0,2.0,3W,62.0,117.03,3.0,W,0.433362,St_Kilda,St Kilda


In [152]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
browser = webdriver.Firefox()

browser.get("http://probabilistic-footy.monash.edu/~footy/tips.shtml")

time.sleep(3)
# username = browser.find_element_by_xpath("/html/body/form/table/tbody/tr[1]/td[2]/input")
# password = browser.find_element_by_xpath("/html/body/form/table/tbody/tr[2]/td[2]/input")
username = browser.find_element('name','name')
password = browser.find_element('name','passwd')
username.send_keys("")
password.send_keys("")
login_attempt = browser.find_element("xpath","//*[@type='submit']")
login_attempt.submit()

time.sleep(3)
main_table = browser.find_elements(By.TAG_NAME,'tbody')
rower = main_table[1].find_elements(By.TAG_NAME,'tr')
for rows in range(len(rower)-1):
    home_team = rower[rows+1].find_elements(By.TAG_NAME,'td')[2].text
    prediction = df_mapped.loc[df_mapped['Submit_team'] == home_team]['score'].values[0]
    #gamer = rower[rows+1].find_elements_by_tag_name('td')[4]
    gamer = browser.find_element('name','game'+str(rows+1))
    gamer.clear()
    gamer.send_keys(prediction.astype('str'))
   # print('game'+str(rows+1))
   # print(prediction)

time.sleep(3)
login_attempt = browser.find_element("xpath","//*[@type='submit']")
login_attempt.submit()
