# Get Dataset

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm

In [11]:
games_list = os.listdir('../data/curated/NormalisedData')
games_list = [file for file in games_list if file.endswith('.csv')]
games_list_no_2023 = [game for game in games_list if '2023' not in games_list]

In [12]:
train_games, valtest_games = train_test_split(
    games_list_no_2023, test_size=0.3, random_state=42)
val_games, test_games = train_test_split(
    valtest_games, test_size=0.5, random_state=42)

In [13]:
def get_dataset(games_list):

    df = pd.DataFrame()

    for game in tqdm(games_list):
        game_df = pd.read_csv(f'../data/curated/NormalisedData/{game}')
        df = pd.concat([df, game_df])
        
    df.fillna(0, inplace=True)

    return df

In [16]:
train_df = get_dataset(train_games)
val_df = get_dataset(val_games)
test_df = get_dataset(test_games)

100%|██████████| 1240/1240 [00:08<00:00, 154.26it/s]
100%|██████████| 266/266 [00:01<00:00, 152.04it/s]
100%|██████████| 266/266 [00:01<00:00, 201.44it/s]


In [32]:
train_df.to_csv('../data/curated/modelling/train.csv')
val_df.to_csv('../data/curated/modelling/val.csv')
test_df.to_csv('../data/curated/modelling/test.csv')

# Get Feature Importance

In [54]:
import pickle

In [48]:
FULL_FEATURES = ['Kicks', 'Handballs', 'Disposals', 'Marks', 'Goals',
                 'Behinds', 'Tackles', 'Hitouts', 'Goal Assists', 'Inside 50s',
                 'Clearances', 'Clangers', 'Rebound 50s', 'Frees For', 'Frees Agains',
                 'Contested Possessions', 'Uncontested Possessions',
                 'Effective Disposals', 'Disposal Efficiency %', 'Contested Marks',
                 'Marks Inside 50', 'One Percenters', 'Bounces', 'Centre Clearances',
                 'Stoppage Clearances', 'Score Involvements', 'Metres Gained',
                 'Turnovers', 'Intercepts', 'Tackles Inside 50', 'Time On Ground %',
                 'Winloss', 'Uncontested Marks',
                 'Marks Outside 50', 'Tackles Outside 50', 'Behind Assists',
                 'Ineffective Disposals']

train_x = train_df[FULL_FEATURES]
train_y = train_df['target']
val_x = train_df[FULL_FEATURES]
val_y = train_df['target']

In [49]:
from JXAutoML.NingXiang import NingXiang

In [50]:
feature_selector = NingXiang()
feature_selector.read_in_train_data(train_x, train_y, val_x, val_y)
feature_selector.set_model_type('Regression')

NingXiang Initialised
Read in Train X data
Read in Train y data
Read in Val X data
Read in Val y data
Successfully recorded model type: Regression


In [51]:
feature_order_dict = feature_selector.get_rf_based_feature_combinations(
    n_jobs=-1)

Begin fitting Random Forest
Finished fitting Random Forest


In [52]:
feature_selector.show_rf_stats()

{'Kicks': 0.032018868636446304, 'Handballs': 0.0025596342309811963, 'Disposals': 0.14181193595024147, 'Marks': 0.006182684992518879, 'Goals': 0.06093404120308137, 'Behinds': 0.001346870280264095, 'Tackles': 0.025934273867540082, 'Hitouts': 0.038609381932767396, 'Goal Assists': 0.0029296720917979923, 'Inside 50s': 0.0015084281763098409, 'Clearances': 0.0012309113448375584, 'Clangers': 0.01489551294412083, 'Rebound 50s': 0.0017479320346366492, 'Frees For': 0.0015092411277024898, 'Frees Agains': 0.002949367296404547, 'Contested Possessions': 0.09826809310426389, 'Uncontested Possessions': 0.0013103220713735096, 'Effective Disposals': 0.4530041135446561, 'Disposal Efficiency %': 0.001984288476661717, 'Contested Marks': 0.007967626789488655, 'Marks Inside 50': 0.0031080800157206262, 'One Percenters': 0.020520839772818596, 'Bounces': 0.0014837388948946782, 'Centre Clearances': 0.001376700613015881, 'Stoppage Clearances': 0.0011718185195943183, 'Score Involvements': 0.03579325603620612, 'Metr

In [55]:
with open('../models/feature_order_dict.pickle', 'wb') as f:
    pickle.dump(feature_order_dict, f)