In [29]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
mainDF = pd.read_csv('rawData.csv')

# Data Cleaning

In [31]:
mainDF['Attendance'] = mainDF['Attendance'].str.replace(',', '', regex=True)  # Remove commas
mainDF['Attendance'] = pd.to_numeric(mainDF['Attendance'], errors='coerce') 

## Data Encoding
Label Encoding is used for Venues and Teams since one-hot encoding will introduce a lot of empty tables and space. However, drawback is that ML model might misinterpret the venues and teams as having an order. Label encoding could assign large numbers to certain categories, making it harder for some algorithms to interpret the values

In [32]:
for column in mainDF.columns:
    print(column)
    print(type(mainDF[column][0]))

GameId
<class 'str'>
Year
<class 'numpy.int64'>
Round
<class 'numpy.int64'>
Date
<class 'str'>
MaxTemp
<class 'numpy.float64'>
MinTemp
<class 'numpy.float64'>
Rainfall
<class 'numpy.float64'>
Venue
<class 'str'>
StartTime
<class 'str'>
Attendance
<class 'numpy.int64'>
HomeTeam
<class 'str'>
HomeTeamScoreQT
<class 'numpy.float64'>
HomeTeamScoreHT
<class 'numpy.float64'>
HomeTeamScore3QT
<class 'numpy.float64'>
HomeTeamScoreFT
<class 'numpy.float64'>
HomeTeamScore
<class 'numpy.int64'>
AwayTeam
<class 'str'>
AwayTeamScoreQT
<class 'numpy.float64'>
AwayTeamScoreHT
<class 'numpy.float64'>
AwayTeamScore3QT
<class 'numpy.float64'>
AwayTeamScoreFT
<class 'numpy.float64'>
AwayTeamScore
<class 'numpy.int64'>
homePosition
<class 'numpy.float64'>
homePoints
<class 'numpy.float64'>
homePercentage
<class 'numpy.float64'>
awayPosition
<class 'numpy.float64'>
awayPoints
<class 'numpy.float64'>
awayPercentage
<class 'numpy.float64'>


In [33]:
print(mainDF['HomeTeam'].unique())
print(mainDF['Venue'].unique())

['Greater Western Sydney' 'Richmond' 'Hawthorn' 'Melbourne' 'Gold Coast'
 'Fremantle' 'North Melbourne' 'Western Bulldogs' 'Port Adelaide'
 'Brisbane Lions' 'Essendon' 'Sydney' 'West Coast' 'Adelaide'
 'Collingwood' 'St Kilda' 'Geelong' 'Carlton']
['Stadium Australia' 'M.C.G.' 'Carrara' 'Subiaco' 'Docklands'
 'Football Park' 'Gabba' 'S.C.G.' 'Bellerive Oval' 'Blacktown'
 'Kardinia Park' 'Manuka Oval' 'York Park' 'Marrara Oval'
 'Sydney Showground' "Cazaly's Stadium" 'Wellington' 'Adelaide Oval'
 'Traeger Park' 'Jiangwan Stadium' 'Eureka Stadium' 'Perth Stadium'
 'Riverway Stadium' 'Norwood Oval' 'Summit Sports Park']


In [34]:
team_name_mapping = {
    'Adelaide': 1,
    'Brisbane Lions': 2,
    'Carlton': 3,
    'Collingwood': 4,
    'Essendon': 5,
    'Fremantle': 6,
    'Geelong': 7,
    'Gold Coast': 8,
    'Greater Western Sydney': 9,
    'Hawthorn': 10,
    'Melbourne': 11,
    'North Melbourne': 12,
    'Port Adelaide': 13,
    'Richmond': 14,
    'St Kilda': 15,
    'Sydney': 16,
    'West Coast': 17,
    'Western Bulldogs': 18
}

mainDF['HomeTeam'] = mainDF['HomeTeam'].map(team_name_mapping)
mainDF['AwayTeam'] = mainDF['AwayTeam'].map(team_name_mapping)

In [35]:
# Removing stadiums which have low capacity or low usage
stadiums_to_remove = [
    'Bellerive Oval', 'Manuka Oval', 'Stadium Australia', 'Marrara Oval', 
    "Cazaly's Stadium", 'Eureka Stadium', 'Traeger Park', 'Wellington', 
    'Jiangwan Stadium', 'Norwood Oval', 'Blacktown', 'Riverway Stadium', 
    'Summit Sports Park'
]


mainDF = mainDF[~mainDF['Venue'].isin(stadiums_to_remove)]

venue_name_mapping = {
    'M.C.G.': 1, 'Carrara': 2, 'Subiaco': 3, 'Docklands': 4, 
    'Football Park': 5, 'Gabba': 6, 'S.C.G.': 7, 'Kardinia Park': 8, 
    'York Park': 9, 'Sydney Showground': 10, 'Adelaide Oval': 11, 
    'Perth Stadium': 12
}

mainDF['Venue'] = mainDF['Venue'].map(venue_name_mapping)

In [36]:
# Dealing with Datetime Features
mainDF['Date'] = pd.to_datetime(mainDF['Date'], errors='coerce')

#Create Day Variable
mainDF['Day'] = mainDF['Date'].dt.dayofweek  # Extract the day of the week (Monday=0, Sunday=6)

#Create Month Variable
mainDF['Month'] = mainDF['Date'].dt.month

# Assuming StartTime is in 12-hour clock format (e.g., '7:20 PM')
mainDF['StartTime'] = pd.to_datetime(mainDF['StartTime'], format='%I:%M %p', errors='coerce')

# If StartTime is in 24-hour clock format (e.g., '19:20')
# mainDF['StartTime'] = pd.to_datetime(mainDF['StartTime'], format='%H:%M', errors='coerce')

# Extract minutes since midnight
mainDF['MinutesSinceMidnight'] = mainDF['StartTime'].dt.hour * 60 + mainDF['StartTime'].dt.minute

#Holiday/Seasonal Indicators


In [37]:

#Dealing with Null Values
mainDF[mainDF.isnull().any(axis=1)]

Unnamed: 0,GameId,Year,Round,Date,MaxTemp,MinTemp,Rainfall,Venue,StartTime,Attendance,...,AwayTeamScore,homePosition,homePoints,homePercentage,awayPosition,awayPoints,awayPercentage,Day,Month,MinutesSinceMidnight
198,2012QF01,2012,-4,2012-09-07,14.2,7.8,10.6,1,1900-01-01 19:50:00,85639,...,97,,,,,,,4,9,1190
199,2012EF01,2012,-5,2012-09-08,15.7,9.0,0.8,1,1900-01-01 19:45:00,44460,...,96,,,,,,,5,9,1185
200,2012QF02,2012,-4,2012-09-08,15.2,11.0,0.2,5,1900-01-01 14:45:00,44849,...,71,,,,,,,5,9,885
201,2012EF02,2012,-5,2012-09-09,29.3,8.8,0.0,3,1900-01-01 13:15:00,41790,...,66,,,,,,,6,9,795
202,2012SF01,2012,-3,2012-09-14,14.5,11.6,0.0,5,1900-01-01 20:15:00,31742,...,71,,,,,,,4,9,1215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2442,2023SF01,2023,-3,2023-09-15,24.5,14.4,0.0,1,1900-01-01 19:50:00,96412,...,73,,,,,,,4,9,1190
2443,2023SF02,2023,-3,2023-09-16,24.0,7.6,0.0,11,1900-01-01 19:10:00,45520,...,93,,,,,,,5,9,1150
2444,2023PF01,2023,-2,2023-09-22,15.3,5.7,0.0,1,1900-01-01 19:50:00,97665,...,57,,,,,,,4,9,1190
2445,2023PF02,2023,-2,2023-09-23,24.1,14.2,0.0,6,1900-01-01 17:15:00,36012,...,63,,,,,,,5,9,1035


In [38]:
# Forward-fill NaN values within each HomeTeam group
mainDF['homePosition'] = mainDF.groupby('HomeTeam')['homePosition'].ffill()
mainDF['homePercentage'] = mainDF.groupby('HomeTeam')['homePercentage'].ffill()
mainDF['homePoints'] = mainDF.groupby('HomeTeam')['homePoints'].ffill()

mainDF['awayPosition'] = mainDF.groupby('AwayTeam')['awayPosition'].ffill()
mainDF['awayPercentage'] = mainDF.groupby('AwayTeam')['awayPercentage'].ffill()
mainDF['awayPoints'] = mainDF.groupby('AwayTeam')['awayPoints'].ffill()

# Removing Outliers

In [39]:
#Removing Years affected by COVID-19
mainDF = mainDF[(mainDF.Year != 2020)& (mainDF.Year != 2021)]

In [40]:
mainDF.to_csv(r'C:\Users\raadr\OneDrive\Desktop\AflAnalysis-\data\interim\fullSesData.csv', index=False)

In [41]:
# Create a new DataFrame for Home and Away Season
HADF = mainDF[mainDF.Round > 0]
HADF

Unnamed: 0,GameId,Year,Round,Date,MaxTemp,MinTemp,Rainfall,Venue,StartTime,Attendance,...,AwayTeamScore,homePosition,homePoints,homePercentage,awayPosition,awayPoints,awayPercentage,Day,Month,MinutesSinceMidnight
1,2012R0102,2012,1,2012-03-29,25.7,9.7,0.0,1,1900-01-01 19:45:00,78285,...,125,15.0,0.0,64.8,4.0,4.0,154.3,3,3,1185
2,2012R0103,2012,1,2012-03-30,27.4,9.7,0.0,1,1900-01-01 19:50:00,78466,...,115,6.0,4.0,119.1,13.0,0.0,83.9,4,3,1190
3,2012R0104,2012,1,2012-03-31,29.1,15.1,0.6,1,1900-01-01 13:45:00,33473,...,119,14.0,0.0,65.5,5.0,4.0,152.6,5,3,825
4,2012R0105,2012,1,2012-03-31,28.2,19.7,0.0,2,1900-01-01 15:45:00,12790,...,137,17.0,0.0,49.6,2.0,4.0,201.5,5,3,945
5,2012R0106,2012,1,2012-03-31,24.0,9.0,0.0,3,1900-01-01 16:45:00,34601,...,101,8.0,4.0,104.0,11.0,0.0,96.2,5,3,1005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2433,2023R2405,2023,24,2023-08-26,13.9,9.4,0.0,8,1900-01-01 19:25:00,20533,...,104,12.0,42.0,112.6,9.0,48.0,108.7,5,8,1165
2434,2023R2406,2023,24,2023-08-26,24.3,9.5,0.0,12,1900-01-01 18:10:00,47027,...,123,18.0,12.0,53.0,10.0,44.0,116.8,5,8,1090
2435,2023R2407,2023,24,2023-08-27,16.0,9.1,0.0,11,1900-01-01 12:00:00,39860,...,63,3.0,68.0,112.7,13.0,42.0,93.6,6,8,720
2436,2023R2408,2023,24,2023-08-27,20.8,9.8,0.0,7,1900-01-01 15:20:00,41753,...,77,8.0,50.0,110.0,4.0,64.0,125.2,6,8,920


In [42]:
HADF.to_csv(r'C:\Users\raadr\OneDrive\Desktop\AflAnalysis-\data\interim\H&AsesData.csv', index=False)

In [54]:
team_name_mapping = {
    'AD' : 'Adelaide',
    'BL' : 'Brisbane Lions',
    'CA' : 'Carlton',
    'CW' : 'Collingwood',
    'ES' : 'Essendon',
    'FR' : 'Fremantle',
    'GE' : 'Geelong',
    'GC' : 'Gold Coast',
    'GW' : 'Greater Western Sydney',
    'HW' : 'Hawthorn',
    'ME' : 'Melbourne',
    'NM' : 'North Melbourne',
    'PA' : 'Port Adelaide',
    'RI' : 'Richmond',
    'SK' : 'St Kilda',
    'SY' : 'Sydney',
    'WC' : 'West Coast',
    'WB' : 'Western Bulldogs'
}

bettingDf = pd.read_csv(r'C:\Users\raadr\OneDrive\Desktop\AflAnalysis-\data\external\aflOdds.csv')
bettingDf['HomeTeam'] = bettingDf['HomeTeam'].replace(team_name_mapping)
bettingDf['AwayTeam'] = bettingDf['AwayTeam'].replace(team_name_mapping)

KeyError: 'Home Team'

In [53]:
bettingDf

Unnamed: 0,Date,Kick Off (local),Home Team,Away Team,Venue,Home Score,Away Score,Play Off Game?,Home Goals,Home Behinds,...,Total Score Close,Total Score Over Open,Total Score Over Min,Total Score Over Max,Total Score Over Close,Total Score Under Open,Total Score Under Min,Total Score Under Max,Total Score Under Close,Notes
0,28-Sep-24,14:30,Sydney,Brisbane,MCG,60,120,Y,9,6,...,175.5,1.89,1.88,1.91,1.91,1.89,1.88,1.91,1.91,
1,21-Sep-24,17:15,Geelong,Brisbane,MCG,85,95,Y,12,13,...,168.5,1.91,1.91,1.91,1.91,1.91,1.91,1.91,1.91,
2,20-Sep-24,19:40,Sydney,Port Adelaide,SCG,95,59,Y,14,11,...,169.5,1.89,1.88,1.91,1.91,1.89,1.88,1.91,1.91,
3,14-Sep-24,19:30,GWS Giants,Brisbane,ENGIE Stadium,100,105,Y,15,10,...,167.5,1.91,1.91,1.91,1.95,1.91,1.91,1.91,1.87,
4,13-Sep-24,19:10,Port Adelaide,Hawthorn,Adelaide Oval,75,72,Y,11,9,...,166.5,1.89,1.88,1.91,1.91,1.89,1.88,1.91,1.91,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3132,27-Jun-09,14:10,Collingwood,Fremantle,MCG,169,85,,26,13,...,,,,,,,,,,
3133,26-Jun-09,19:40,Essendon,Carlton,MCG,136,67,,21,10,...,,,,,,,,,,
3134,21-Jun-09,14:40,Fremantle,Geelong,Domain Stadium,75,94,,11,9,...,,,,,,,,,,
3135,20-Jun-09,19:10,Sydney,Collingwood,Accor Stadium,66,89,,9,12,...,,,,,,,,,,
