In [263]:
%%javascript
 // Override the output area's _should_scroll hook so it always answers false
 // (presumably this keeps long cell outputs from collapsing into a scroll box — confirm against the notebook front end in use).
 IPython.OutputArea.prototype._should_scroll = (_lines) => false;

<IPython.core.display.Javascript object>

# Imports and Loading Data

In [264]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import warnings

In [265]:
# import warnings filter
from warnings import simplefilter
# Installs an "ignore" filter for Warning, the base class of every warning category
simplefilter("ignore", category=Warning)

In [266]:
# Constant variables for splitting data into test and train datasets
# ALL_SEASONS is the season span interpolated into the input CSV filename.
ALL_SEASONS = "2014-23"
# Rows whose "season" equals this value form the test set; all other seasons are used for training.
TEST_TRAIN_SEASON_SPLIT = "2022-23"

In [267]:
# Loading match data
# The first CSV column is used as the DataFrame index.
matches_df = pd.read_csv(
    f"data/calculatedData/Calculated - {ALL_SEASONS}.csv",
    index_col=0,
)

# Formatting and Cleaning Data

In [268]:
# Parses the match date, keeps readable copies of both team names (team1Name/team2Name are
# consumed by the one hot encoding step) and renames the two points columns,
# all as one chained expression that rebinds matches_df to the resulting frame
matches_df = (
    matches_df
    .assign(
        date=pd.to_datetime(matches_df["date"]),
        team=matches_df["team1Name"],
        opponent=matches_df["team2Name"],
    )
    .rename(columns={'team1Points': 'teamPoints', 'team2Points': 'opponentPoints'})
)

In [269]:
# For filtering out smaller subset of columns that won't be used as features - for second iteration of prediction modelling
# (identifiers, outcomes and descriptive columns only)
extended_features_exclude = [
    "team", "opponent", "season", "date",
    "team1Name", "team2Name",
    "teamPoints", "opponentPoints", "totalPoints",
    "venue", "teamType", "referee", "result",
    "day", "month", "targetCode",
]

# For filtering out columns that won't be used as features:
# everything in the smaller list above, followed by the rolling bonus-point / tries /
# conversions / penalties / drop-goal averages (built as a new, independent list)
main_features_exclude = [
    *extended_features_exclude,
    "avgLast5BpsTeam1",
    "avgLast5BpsAgainstTeam1",
    "avgLast5BpsTeam2",
    "avgLast5BpsAgainstTeam2",
    "avgLast5TriesTeam1",
    "avgLast5ConversionsTeam1",
    "avgLast5PenaltiesTeam1",
    "avgLast5DropGoalsTeam1",
    "avgLast5TriesAgainstTeam1",
    "avgLast5ConversionsAgainstTeam1",
    "avgLast5PenaltiesAgainstTeam1",
    "avgLast5DropGoalsAgainstTeam1",
    "avgLast5TriesTeam2",
    "avgLast5ConversionsTeam2",
    "avgLast5PenaltiesTeam2",
    "avgLast5DropGoalsTeam2",
    "avgLast5TriesAgainstTeam2",
    "avgLast5ConversionsAgainstTeam2",
    "avgLast5PenaltiesAgainstTeam2",
    "avgLast5DropGoalsAgainstTeam2",
]

In [270]:
# Creates list of columns that will be used as features:
# keeps, in their original order, the columns that are not named in main_features_exclude
main_features = matches_df.columns[
    [column_name not in main_features_exclude for column_name in matches_df.columns]
]
main_features

Index(['avgLast5ScoresTeam1', 'avgLast5ScoresAgainstTeam1',
       'avgLast5ScoresTeam2', 'avgLast5ScoresAgainstTeam2', 'wonLastGameTeam1',
       'wonLastGameTeam2', 'team1LastSeasonStanding',
       'team2LastSeasonStanding'],
      dtype='object')

In [271]:
# Scales numeric columns
# The scaler is fitted first and then applied to the same columns (same outcome as one fit_transform call)
# NOTE(review): the fit uses every season in matches_df, including the season later held out for testing — confirm this is intended
scaler = MinMaxScaler().fit(matches_df[main_features])
matches_df[main_features] = scaler.transform(matches_df[main_features])

In [272]:
# One hot encode categorical columns
# The four listed columns go through the encoder; remainder='passthrough' keeps every other column as-is.
# NOTE(review): `sparse=` and `get_feature_names()` are older scikit-learn spellings — confirm they exist in the installed version.
transformer = make_column_transformer(
    (OneHotEncoder(sparse = False), ['team1Name', 'team2Name', 'venue', 'month']),
    remainder = 'passthrough')

# Creates new dataframe with one hot encoded columns
# Column labels come from the fitted transformer; no index is passed, so the matches_df index is not carried over.
transformed = transformer.fit_transform(matches_df)
matches_one_hot_encoded_df = pd.DataFrame(
    transformed, 
    columns = transformer.get_feature_names()
)

In [273]:
# Convert numeric columns back to numeric values
# Every column except the descriptive ones named below is passed through pd.to_numeric
numeric_columns = [
    column_name
    for column_name in matches_one_hot_encoded_df.columns
    if column_name not in ("date", "team", "opponent", "team1Name", "team2Name", "season",
                           "venue", "teamType", "day", "month", "referee", "result")
]
matches_one_hot_encoded_df[numeric_columns] = (
    matches_one_hot_encoded_df[numeric_columns].apply(pd.to_numeric)
)

In [274]:
# Gets columns to use as features - including one hot encoded columns
# Each list keeps the frame's column order and drops the names in the matching exclusion list
main_features = [
    column_name
    for column_name in matches_one_hot_encoded_df.columns
    if column_name not in main_features_exclude
]
extended_features = [
    column_name
    for column_name in matches_one_hot_encoded_df.columns
    if column_name not in extended_features_exclude
]

# Training Model and Generating Predictions

In [275]:
# Random forest used by the fit/predict cells and by run_modelling; random_state is fixed at 1
# NOTE(review): this is a classifier, yet it is fitted on the numeric "totalPoints" column — confirm a classifier (rather than a regressor) is intended
rf = RandomForestClassifier(random_state = 1, min_samples_split = 10, n_estimators = 100)

In [276]:
# Holds out the split season as the test set; rows from every other season form the training set
train = matches_one_hot_encoded_df.loc[matches_one_hot_encoded_df["season"].ne(TEST_TRAIN_SEASON_SPLIT)]
test = matches_one_hot_encoded_df.loc[matches_one_hot_encoded_df["season"].eq(TEST_TRAIN_SEASON_SPLIT)]

In [277]:
# Fits the model on the training rows: main_features are the inputs and "totalPoints" is the value to predict
rf.fit(X = train[main_features], y = train["totalPoints"])

RandomForestClassifier(min_samples_split=10, random_state=1)

In [278]:
# Predicts "totalPoints" for the test rows from the same main_features columns
predictions = rf.predict(X = test[main_features])

In [279]:
# Pairs the actual totalPoints of each test row with its predicted value, then pulls in
# some key columns of interest from matches_one_hot_encoded_df by matching on the row index
combined_target_codes_df = pd.DataFrame(
    {"actual": test["totalPoints"], "prediction": predictions}
).merge(
    matches_one_hot_encoded_df[
        ["date", "team", "opponent", "teamPoints", "opponentPoints", "result", "teamType"]
    ],
    left_index=True,
    right_index=True,
)

In [280]:
# Creates dataframe with predictions for both teams on the same row:
# each row is joined to the row with the same date whose opponent is this row's team
same_row_df = pd.merge(
    combined_target_codes_df,
    combined_target_codes_df,
    left_on=["date", "team"],
    right_on=["date", "opponent"],
)

# Filters to remove second row for each match (only rows whose left-hand teamType is "home" are kept)
same_row_reduced_df = same_row_df[same_row_df['teamType_x'] == "home"]

In [281]:
# Difference between the actual value and the predicted value on each remaining row.
# `assign` builds a new frame, so the column is never written onto a filtered slice of
# another DataFrame (the pattern pandas flags with SettingWithCopyWarning).
same_row_reduced_df = same_row_reduced_df.assign(
    points_difference_in_predictions=same_row_reduced_df['actual_x'] - same_row_reduced_df['prediction_x']
)

In [282]:
# Writes the reduced predictions frame to a CSV whose name carries the test season
same_row_reduced_df.to_csv(
    f"data/predictedResults/points/Predicted Total Points - {TEST_TRAIN_SEASON_SPLIT} - First Iteration.csv"
)

In [283]:
# Progress marker printed once the cells above have run
print("Prediction Modelling Complete")

Prediction Modelling Complete


# Separate Functions for Running Predictions on Multiple Datasets

In [284]:
# Runs prediction modelling
def run_modelling(target_df, features, season, model=None):
    """Train on every season except ``season`` and predict "totalPoints" for ``season``.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Must contain the columns "season", "totalPoints", "date", "team",
        "opponent", "teamPoints", "opponentPoints", "result" and "teamType",
        plus every column named in ``features``.
    features : list of str
        Columns passed to the model as inputs.
    season : str
        Value of the "season" column that is held out as the test set.
    model : object with ``fit(X, y)`` and ``predict(X)``, optional
        Estimator to train. When omitted, the notebook-level ``rf`` is used,
        which is the behaviour callers had before this parameter existed.
        The estimator is fitted in place.

    Returns
    -------
    pandas.DataFrame
        The test-season rows (actual value, prediction and key match columns)
        joined to themselves so that each row is paired with the row of the
        same date whose "opponent" equals its "team". Columns present on both
        sides carry pandas' default ``_x`` / ``_y`` suffixes.

    Raises
    ------
    ValueError
        If ``season`` leaves either the training set or the test set empty.
    """
    # Fall back to the shared notebook estimator only when no model is supplied
    estimator = rf if model is None else model

    train = target_df[target_df["season"] != season]
    test = target_df[target_df["season"] == season]

    # Fail early with a readable message instead of an error from inside the estimator
    if train.empty or test.empty:
        raise ValueError(
            f"Season {season!r} must leave both training and test rows "
            f"(training rows: {len(train)}, test rows: {len(test)})"
        )

    # Fits model using training dataset with the given features, with totalPoints as the goal to predict
    estimator.fit(train[features], train["totalPoints"])

    # Generates predictions using test dataset with features
    predictions = estimator.predict(test[features])

    # Combines the actual totalPoints values with the predicted values from test dataset
    combined_target_codes_df = pd.DataFrame(dict(actual = test["totalPoints"],
                                                 prediction = predictions), index = test.index)

    # Merges with target_df with a selection of key columns of interest (matched on the row index)
    combined_target_codes_df = combined_target_codes_df.merge(target_df[["date", "team", "opponent", "teamPoints",
                                                                         "opponentPoints", "result", "teamType"]],
                                                              left_index = True, right_index = True)

    # Creates dataframe with predictions for both teams on each row
    same_row_df = combined_target_codes_df.merge(combined_target_codes_df, left_on=["date", "team"],
                                                 right_on=["date", "opponent"])

    return same_row_df

In [285]:
# Main script for creating predictions
def make_predictions(first_season_start, first_season_end, final_season, run_second_iteration):
    """Load, prepare and model one season's data, returning one row per match.

    Parameters
    ----------
    first_season_start : str
        First part of the season label; combined with ``first_season_end``
        as ``"{first_season_start}-{first_season_end}"`` to select the
        held-out season.
    first_season_end : str
        Second part of the season label; also the suffix of the input file
        name ``Calculated - 2014-{first_season_end}.csv``.
    final_season : str
        Accepted for interface compatibility; not read by this function.
    run_second_iteration : bool
        Accepted for interface compatibility; not read by this function.

    Returns
    -------
    pandas.DataFrame
        The rows of ``run_modelling``'s result whose ``teamType_x`` is
        "home", with an added ``points_difference_in_predictions`` column
        (``actual_x - prediction_x``).

    Notes
    -----
    Refits the notebook-level ``scaler`` and ``transformer`` on the loaded
    data, and (through ``run_modelling``) refits the shared model.
    """
    # Season
    season = f"{first_season_start}-{first_season_end}"

    # Loading match data (the file name always starts at 2014; only the end suffix varies)
    matches_df = pd.read_csv(f"data/calculatedData/Calculated - 2014-{first_season_end}.csv", index_col = 0)

    # Formatting and Cleaning Data
    matches_df["date"] = pd.to_datetime(matches_df["date"])

    # Retains both team names after one hot encoding takes place
    matches_df["team"] = matches_df["team1Name"]
    matches_df["opponent"] = matches_df["team2Name"]

    matches_df = matches_df.rename(columns = {'team1Points': 'teamPoints', 'team2Points': 'opponentPoints'})

    # Columns to scale: everything not named in main_features_exclude
    columns_to_scale = matches_df.columns[~matches_df.columns.isin(main_features_exclude)].tolist()

    # Scales numeric columns
    matches_df[columns_to_scale] = scaler.fit_transform(matches_df[columns_to_scale])

    # Creates new dataframe with one hot encoded columns
    transformed = transformer.fit_transform(matches_df)
    matches_one_hot_encoded_df = pd.DataFrame(
        transformed,
        columns = transformer.get_feature_names()
    )

    # Convert numeric columns back to numeric values
    non_numeric_columns = ("date", "opponent", "team", "team1Name", "team2Name", "season",
                           "venue", "teamType", "day", "month", "referee", "result")
    numeric_columns = [i for i in matches_one_hot_encoded_df.columns if i not in non_numeric_columns]
    for column in numeric_columns:
        matches_one_hot_encoded_df[column] = pd.to_numeric(matches_one_hot_encoded_df[column])

    # Gets columns to use as features - including one hot encoded columns
    selected_columns = matches_one_hot_encoded_df.columns[~matches_one_hot_encoded_df.columns.isin(main_features_exclude)].tolist()

    # Runs prediction modelling on matches_one_hot_encoded_df
    first_modelling_df = run_modelling(matches_one_hot_encoded_df, selected_columns, season)

    # Filters to remove second row for each match; .copy() makes the result an independent
    # frame so the column added below is not written onto a slice of first_modelling_df
    same_row_reduced_df = first_modelling_df.loc[first_modelling_df['teamType_x'] == "home"].copy()

    same_row_reduced_df['points_difference_in_predictions'] = same_row_reduced_df['actual_x'] - same_row_reduced_df['prediction_x']

    return same_row_reduced_df

In [286]:
# Walks season by season from 2015-16 up to the season ending in final_season,
# writing one predictions CSV per season
first_season_start, first_season_end, final_season = 2015, 16, 23

while first_season_end <= final_season:
    combined_target_codes_df = make_predictions(
        first_season_start=str(first_season_start),
        first_season_end=str(first_season_end),
        final_season=str(final_season),
        run_second_iteration=False,
    )
    combined_target_codes_df.to_csv(
        f"data/predictedResults/points/Predicted Total Points {first_season_start}-{first_season_end}.csv"
    )
    # Advance both halves of the season label together
    first_season_start, first_season_end = first_season_start + 1, first_season_end + 1

In [287]:
# Progress marker printed once the per-season loop has finished
print("Prediction Modelling Complete")

Prediction Modelling Complete
