# Get Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm

In [2]:
games_list = os.listdir('../data/curated/NormalisedData')
games_list = [file for file in games_list if file.endswith('.csv')]
games_list_no_2023 = [game for game in games_list if '2023' not in games_list]

In [3]:
train_games, valtest_games = train_test_split(
    games_list_no_2023, test_size=0.3, random_state=42)
val_games, test_games = train_test_split(
    valtest_games, test_size=0.5, random_state=42)

In [4]:
def get_dataset(games_list):

    df = pd.DataFrame()

    for game in tqdm(games_list):
        game_df = pd.read_csv(f'../data/curated/NormalisedData/{game}')
        df = pd.concat([df, game_df])

    df.fillna(0, inplace=True)

    return df

In [5]:
train_df = get_dataset(train_games)
val_df = get_dataset(val_games)
test_df = get_dataset(test_games)

100%|██████████| 1240/1240 [00:13<00:00, 95.13it/s]
100%|██████████| 266/266 [00:02<00:00, 114.10it/s]
100%|██████████| 266/266 [00:01<00:00, 178.90it/s]


In [6]:
train_df.to_csv('../data/curated/modelling/train.csv')
val_df.to_csv('../data/curated/modelling/val.csv')
test_df.to_csv('../data/curated/modelling/test.csv')

## Get Graph Dataset

In [7]:
def get_graph_dataset(games_list):

    df = pd.DataFrame()

    game_index = 0
    for game in tqdm(games_list):
        game_df = pd.read_csv(f'../data/curated/NormalisedData/{game}')
        game_df['idx'] = game_index
        df = pd.concat([df, game_df])

        game_index += 1

    df.fillna(0, inplace=True)

    return df

In [8]:
train_df = get_graph_dataset(train_games)
val_df = get_graph_dataset(val_games)
test_df = get_graph_dataset(test_games)

100%|██████████| 1240/1240 [00:14<00:00, 85.24it/s]
100%|██████████| 266/266 [00:01<00:00, 154.38it/s]
100%|██████████| 266/266 [00:01<00:00, 142.42it/s]


In [10]:
train_df.to_csv('../data/curated/modelling/graph_train.csv')
val_df.to_csv('../data/curated/modelling/graph_val.csv')
test_df.to_csv('../data/curated/modelling/graph_test.csv')

# Get Feature Importance

In [7]:
import pickle

In [8]:
FULL_FEATURES = ['Kicks', 'Handballs', 'Disposals', 'Marks', 'Goals',
                 'Behinds', 'Tackles', 'Hitouts', 'Goal Assists', 'Inside 50s',
                 'Clearances', 'Clangers', 'Rebound 50s', 'Frees For', 'Frees Agains',
                 'Contested Possessions', 'Uncontested Possessions',
                 'Effective Disposals', 'Disposal Efficiency %', 'Contested Marks',
                 'Marks Inside 50', 'One Percenters', 'Bounces', 'Centre Clearances',
                 'Stoppage Clearances', 'Score Involvements', 'Metres Gained',
                 'Turnovers', 'Intercepts', 'Tackles Inside 50', 'Time On Ground %',
                 'Winloss', 'Uncontested Marks',
                 'Marks Outside 50', 'Tackles Outside 50', 'Behind Assists',
                 'Ineffective Disposals']

train_x = train_df[FULL_FEATURES]
train_y = train_df['target']
val_x = train_df[FULL_FEATURES]
val_y = train_df['target']

In [9]:
from JXAutoML.NingXiang import NingXiang

In [10]:
feature_selector = NingXiang()
feature_selector.read_in_train_data(train_x, train_y, val_x, val_y)
feature_selector.set_model_type('Regression')

NingXiang Initialised
Read in Train X data
Read in Train y data
Read in Val X data
Read in Val y data
Successfully recorded model type: Regression


In [11]:
feature_order_dict = feature_selector.get_rf_based_feature_combinations(
    n_jobs=-1)

Begin fitting Random Forest
Finished fitting Random Forest


In [12]:
feature_selector.show_rf_stats()

{'Kicks': 0.03091068524063583, 'Handballs': 0.0024938955638784577, 'Disposals': 0.14373963665173684, 'Marks': 0.006534704402619065, 'Goals': 0.06889811212145383, 'Behinds': 0.001315932163125528, 'Tackles': 0.024498385936867262, 'Hitouts': 0.036861456141004334, 'Goal Assists': 0.002791286785315583, 'Inside 50s': 0.0015612958274024852, 'Clearances': 0.0016048564551753846, 'Clangers': 0.013478212410024884, 'Rebound 50s': 0.001816243232521673, 'Frees For': 0.0016746375542726695, 'Frees Agains': 0.0027744130929174774, 'Contested Possessions': 0.09817211701597632, 'Uncontested Possessions': 0.00148483878716943, 'Effective Disposals': 0.44653042996227005, 'Disposal Efficiency %': 0.0017440127912776507, 'Contested Marks': 0.007887635522744046, 'Marks Inside 50': 0.003435843808971593, 'One Percenters': 0.0193881552465415, 'Bounces': 0.0015301389875180692, 'Centre Clearances': 0.0013317930796239103, 'Stoppage Clearances': 0.001303891526667163, 'Score Involvements': 0.03924760631684517, 'Metres G

In [13]:
with open('../models/feature_importance_ordering.pickle', 'wb') as f:
    pickle.dump(feature_order_dict, f)