In [39]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
# Load the raw match data; the cells below clean and encode this frame.
mainDF = pd.read_csv('rawData.csv')

# Print each column name followed by the Python type of its first value,
# which shows the columns that were read as text rather than numbers.
# (A bare `mainDF.head()` used to sit before this loop; IPython only displays
# a cell's last expression, so it showed nothing and has been removed.)
for column in mainDF.columns:
    print(column)
    print(type(mainDF[column][0]))

GameId
<class 'str'>
Year
<class 'numpy.int64'>
Round
<class 'numpy.int64'>
Date
<class 'str'>
MaxTemp
<class 'numpy.float64'>
MinTemp
<class 'numpy.float64'>
Rainfall
<class 'numpy.float64'>
Venue
<class 'str'>
StartTime
<class 'str'>
Attendance
<class 'str'>
HomeTeam
<class 'str'>
HomeTeamScoreQT
<class 'numpy.float64'>
HomeTeamScoreHT
<class 'numpy.float64'>
HomeTeamScore3QT
<class 'numpy.float64'>
HomeTeamScoreFT
<class 'numpy.float64'>
HomeTeamScore
<class 'numpy.int64'>
AwayTeam
<class 'str'>
AwayTeamScoreQT
<class 'numpy.float64'>
AwayTeamScoreHT
<class 'numpy.float64'>
AwayTeamScore3QT
<class 'numpy.float64'>
AwayTeamScoreFT
<class 'numpy.float64'>
AwayTeamScore
<class 'numpy.int64'>
homePosition
<class 'numpy.float64'>
homePoints
<class 'numpy.float64'>
homePercentage
<class 'numpy.float64'>
awayPosition
<class 'numpy.float64'>
awayPoints
<class 'numpy.float64'>
awayPercentage
<class 'numpy.float64'>


# Data Cleaning

In [41]:
# Convert Attendance from comma-grouped text (e.g. "12,345") to a numeric column.
# - astype(str) keeps this cell re-runnable: once the column is numeric the
#   .str accessor would otherwise raise AttributeError on a second run.
# - regex=False treats the comma as a literal character, not a pattern.
# - errors='coerce' turns any value that still cannot be parsed into NaN.
attendance_text = mainDF['Attendance'].astype(str).str.replace(',', '', regex=False)
mainDF['Attendance'] = pd.to_numeric(attendance_text, errors='coerce')

## Data Encoding
Label encoding is used for venues and teams because one-hot encoding would add a large number of sparse, mostly-zero columns (one per team and one per venue). The drawback is that an ML model might misinterpret the integer codes as implying an order among the venues or teams. Label encoding also assigns large numbers to some categories, which can make the values harder for some algorithms to interpret.

In [42]:
# Show the distinct labels in the home-team and venue columns, in that order,
# so the encoding dictionaries in the following cells can be checked against them.
for categorical_column in ('HomeTeam', 'Venue'):
    print(mainDF[categorical_column].unique())

['Greater Western Sydney' 'Richmond' 'Hawthorn' 'Melbourne' 'Gold Coast'
 'Fremantle' 'North Melbourne' 'Western Bulldogs' 'Port Adelaide'
 'Brisbane Lions' 'Essendon' 'Sydney' 'West Coast' 'Adelaide'
 'Collingwood' 'St Kilda' 'Geelong' 'Carlton']
['Stadium Australia' 'M.C.G.' 'Carrara' 'Subiaco' 'Docklands'
 'Football Park' 'Gabba' 'S.C.G.' 'Bellerive Oval' 'Blacktown'
 'Kardinia Park' 'Manuka Oval' 'York Park' 'Marrara Oval'
 'Sydney Showground' "Cazaly's Stadium" 'Wellington' 'Adelaide Oval'
 'Traeger Park' 'Jiangwan Stadium' 'Eureka Stadium' 'Perth Stadium'
 'Riverway Stadium' 'Norwood Oval' 'Summit Sports Park']


In [43]:
# Team names listed alphabetically; each team's 1-based position in this list
# is the integer code it is encoded as.
_team_names = [
    'Adelaide',
    'Brisbane Lions',
    'Carlton',
    'Collingwood',
    'Essendon',
    'Fremantle',
    'Geelong',
    'Gold Coast',
    'Greater Western Sydney',
    'Hawthorn',
    'Melbourne',
    'North Melbourne',
    'Port Adelaide',
    'Richmond',
    'St Kilda',
    'Sydney',
    'West Coast',
    'Western Bulldogs',
]
team_name_mapping = {name: code for code, name in enumerate(_team_names, start=1)}

# Encode both team columns with the same dictionary so a club has the same
# code whether it plays at home or away. Series.map yields NaN for any value
# that is not a key of the dictionary.
for team_column in ('HomeTeam', 'AwayTeam'):
    mainDF[team_column] = mainDF[team_column].map(team_name_mapping)

In [16]:
# Integer code for every venue label, numbered in the order the labels were
# printed by the earlier `mainDF['Venue'].unique()` cell.
stadium_name_mapping = {
    'Stadium Australia': 1,
    'M.C.G.': 2,
    'Carrara': 3,
    'Subiaco': 4,
    'Docklands': 5,
    'Football Park': 6,
    'Gabba': 7,
    'S.C.G.': 8,
    'Bellerive Oval': 9,
    'Blacktown': 10,
    'Kardinia Park': 11,
    'Manuka Oval': 12,
    'York Park': 13,
    'Marrara Oval': 14,
    'Sydney Showground': 15,
    "Cazaly's Stadium": 16,
    'Wellington': 17,
    'Adelaide Oval': 18,
    'Traeger Park': 19,
    'Jiangwan Stadium': 20,
    'Eureka Stadium': 21,
    'Perth Stadium': 22,
    'Riverway Stadium': 23,
    'Norwood Oval': 24,
    'Summit Sports Park': 25
}

# Replace venue names with their codes. The lookup must use the dictionary
# defined above: this line previously referenced `venue_name_mapping`, a name
# that is never defined in the notebook and raises NameError on a fresh kernel.
# Series.map yields NaN for any venue that is not a key of the dictionary.
mainDF['Venue'] = mainDF['Venue'].map(stadium_name_mapping)
mainDF