In [71]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [72]:
# Empty ladder table; one row per team per round is added by the scraping cell.
ladder_columns = "Position Team Played Points Percentage Round Year".split()
df = pd.DataFrame(columns=ladder_columns)

In [73]:
# Scrape the round-by-round ladders from afltables.com for every season 2012-2024.
#
# Output: `df` gets one row per team per ladder with the columns
# Position, Team, Played, Points, Percentage, Round, Year.

# Two-letter team codes used on the afltables ladders.
ALL_TEAM_CODES = {"GW", "SY", "GC", "CA", "PA", "FR", "ES", "ME", "GE", "AD", "SK", "BL", "HW", "RI", "CW", "NM", "WB", "WC"}
REQUEST_TIMEOUT_SECONDS = 30  # fail instead of hanging forever on a stalled connection
CELLS_PER_LADDER_ROW = 4      # Team, Played, Points, Percentage

# Rows are collected in a plain list and turned into a DataFrame once at the end:
# appending with df.loc[len(df)] is quadratic, and re-running the cell would keep
# stacking duplicate rows onto the existing frame.
ladder_rows = []

for year in range(2012, 2025):
    url = f"https://afltables.com/afl/seas/{year}.html"

    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()  # do not parse an error page as if it were a season
    soup = BeautifulSoup(response.content, "html.parser")

    # Every matching <td> is treated as one ladder; the n-th one on the page is
    # stored as Round n (so a season whose first ladder is an "Opening Round" has
    # all of its later rounds shifted up by one).
    ladders = soup.find_all("td", {"width": "15%", "valign": "top"})
    if not ladders:
        raise ValueError(f"No ladder tables found at {url}; has the page layout changed?")

    for ladder_index, ladder in enumerate(ladders):
        round_number = ladder_index + 1  # avoids shadowing the built-in round()
        teams_in_round = set()

        # The first <tr> is skipped (presumably the ladder's header row).
        for position, row in enumerate(ladder.find_all("tr")[1:], start=1):
            cells = [cell.text.strip() for cell in row.find_all("td")]
            if len(cells) != CELLS_PER_LADDER_ROW:
                raise ValueError(
                    f"Unexpected ladder row in {year} round {round_number}: {cells!r}"
                )
            ladder_rows.append([position, *cells, round_number, year])
            teams_in_round.add(cells[0])

        if ladder_index == 0:
            # Teams absent from the first ladder of a season (e.g. clubs that sat out
            # an opening round) get a zeroed row so every team has a Round 1 entry.
            # Fixes over the previous version: the row carries the season being
            # scraped (it was hard-coded to 2024), and Position continues after the
            # last listed team in a deterministic (alphabetical) order instead of
            # being derived from the total number of rows scraped so far.
            first_free_position = len(teams_in_round) + 1
            for offset, team in enumerate(sorted(ALL_TEAM_CODES - teams_in_round)):
                ladder_rows.append(
                    [first_free_position + offset, team, 0, 0, 0, round_number, year]
                )

df = pd.DataFrame(ladder_rows, columns=list(df.columns))

In [74]:
# Display the scraped ladder rows (pandas truncates the rendering of long frames).
df

Unnamed: 0,Position,Team,Played,Points,Percentage,Round,Year
0,1,SY,1,4,270.3,1,2012
1,2,AD,1,4,201.5,1,2012
2,3,WC,1,4,156.3,1,2012
3,4,CA,1,4,154.3,1,2012
4,5,BL,1,4,152.6,1,2012
...,...,...,...,...,...,...,...
5341,14,ME,23,44,98.5,25,2024
5342,15,AD,23,34,99.1,25,2024
5343,16,WC,23,20,68.1,25,2024
5344,17,NM,23,12,63.5,25,2024


In [75]:
# Lookup from the two-letter ladder code to the full club name
# (presumably the spelling used in the HomeTeam/AwayTeam columns of the games data).
team_name_mapping = dict(
    AD="Adelaide",
    BL="Brisbane Lions",
    CA="Carlton",
    CW="Collingwood",
    ES="Essendon",
    FR="Fremantle",
    GE="Geelong",
    GC="Gold Coast",
    GW="Greater Western Sydney",
    HW="Hawthorn",
    ME="Melbourne",
    NM="North Melbourne",
    PA="Port Adelaide",
    RI="Richmond",
    SK="St Kilda",
    SY="Sydney",
    WC="West Coast",
    WB="Western Bulldogs",
)

# Swap the ladder codes for full club names; values with no entry in the
# mapping are left as they are.
df["Team"] = df["Team"].replace(to_replace=team_name_mapping)
df

Unnamed: 0,Position,Team,Played,Points,Percentage,Round,Year
0,1,Sydney,1,4,270.3,1,2012
1,2,Adelaide,1,4,201.5,1,2012
2,3,West Coast,1,4,156.3,1,2012
3,4,Carlton,1,4,154.3,1,2012
4,5,Brisbane Lions,1,4,152.6,1,2012
...,...,...,...,...,...,...,...
5341,14,Melbourne,23,44,98.5,25,2024
5342,15,Adelaide,23,34,99.1,25,2024
5343,16,West Coast,23,20,68.1,25,2024
5344,17,North Melbourne,23,12,63.5,25,2024


In [76]:
# Load the match-level data set.
# The location can be overridden with the AFL_GAMES_CSV environment variable so the
# notebook can run on another machine; when it is unset the original path is used.
games_csv_path = os.environ.get(
    'AFL_GAMES_CSV',
    r'C:\Users\raadr\OneDrive\Desktop\AflAnalysis-\data\games.csv',
)
mainDF = pd.read_csv(games_csv_path)
mainDF.head()

Unnamed: 0,GameId,Year,Round,Date,MaxTemp,MinTemp,Rainfall,Venue,StartTime,Attendance,...,HomeTeamScoreHT,HomeTeamScore3QT,HomeTeamScoreFT,HomeTeamScore,AwayTeam,AwayTeamScoreQT,AwayTeamScoreHT,AwayTeamScore3QT,AwayTeamScoreFT,AwayTeamScore
0,2012R0101,2012,R1,2012-03-24,24.0,12.2,0.0,Stadium Australia,7:20 PM,38203,...,3.3,3.4,5.7,37,Sydney,4.1,8.4,13.8,14.16,100
1,2012R0102,2012,R1,2012-03-29,25.7,9.7,0.0,M.C.G.,7:45 PM,78285,...,5.6,10.7,12.9,81,Carlton,3.2,8.7,11.13,18.17,125
2,2012R0103,2012,R1,2012-03-30,27.4,9.7,0.0,M.C.G.,7:50 PM,78466,...,10.6,14.1,20.17,137,Collingwood,2.7,7.9,12.16,16.19,115
3,2012R0104,2012,R1,2012-03-31,29.1,15.1,0.6,M.C.G.,1:45 PM,33473,...,7.4,8.8,11.12,78,Brisbane Lions,1.4,7.8,13.13,17.17,119
4,2012R0105,2012,R1,2012-03-31,28.2,19.7,0.0,Carrara,3:45 PM,12790,...,5.3,8.6,10.8,68,Adelaide,7.8,11.1,15.16,19.23,137


In [77]:
# Remove the "R" from round labels such as "R1" so they can be converted to integers.
# astype(str) keeps this cell re-runnable: once Round holds integers the .str
# accessor would otherwise raise. regex=False makes the literal replacement
# explicit, since the default for `regex` differs between pandas versions.
mainDF['Round'] = mainDF['Round'].astype(str).str.replace('R', '', regex=False)

def clean_round(value):
    """Convert a round label to an integer, or -1 when it is not numeric.

    Args:
        value: Round label, e.g. "7" or 7. Anything ``int()`` cannot convert
            (non-numeric text, NaN, None, pd.NA, infinities) is accepted too.

    Returns:
        int: The round number, or the sentinel -1 for values that cannot be
        converted.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text and NaN; TypeError: None / pd.NA;
        # OverflowError: +/- infinity. All map to the same sentinel.
        return -1

# Convert every round label element-wise; labels that are not numeric become -1.
round_numbers = mainDF['Round'].map(clean_round)
mainDF['Round'] = round_numbers
mainDF.head()

Unnamed: 0,GameId,Year,Round,Date,MaxTemp,MinTemp,Rainfall,Venue,StartTime,Attendance,...,HomeTeamScoreHT,HomeTeamScore3QT,HomeTeamScoreFT,HomeTeamScore,AwayTeam,AwayTeamScoreQT,AwayTeamScoreHT,AwayTeamScore3QT,AwayTeamScoreFT,AwayTeamScore
0,2012R0101,2012,1,2012-03-24,24.0,12.2,0.0,Stadium Australia,7:20 PM,38203,...,3.3,3.4,5.7,37,Sydney,4.1,8.4,13.8,14.16,100
1,2012R0102,2012,1,2012-03-29,25.7,9.7,0.0,M.C.G.,7:45 PM,78285,...,5.6,10.7,12.9,81,Carlton,3.2,8.7,11.13,18.17,125
2,2012R0103,2012,1,2012-03-30,27.4,9.7,0.0,M.C.G.,7:50 PM,78466,...,10.6,14.1,20.17,137,Collingwood,2.7,7.9,12.16,16.19,115
3,2012R0104,2012,1,2012-03-31,29.1,15.1,0.6,M.C.G.,1:45 PM,33473,...,7.4,8.8,11.12,78,Brisbane Lions,1.4,7.8,13.13,17.17,119
4,2012R0105,2012,1,2012-03-31,28.2,19.7,0.0,Carrara,3:45 PM,12790,...,5.3,8.6,10.8,68,Adelaide,7.8,11.1,15.16,19.23,137


In [78]:
mainDF['Round'] = mainDF['Round'].astype(int)


def _merge_ladder(games, ladder, team_column, prefix):
    """Attach one side's ladder position and percentage to every game.

    Args:
        games: Game-level frame with 'Year', 'Round' and `team_column` columns.
        ladder: Ladder frame with 'Year', 'Round', 'Team', 'Position' and
            'Percentage' columns.
        team_column: Column of `games` holding the team to look up
            ('HomeTeam' or 'AwayTeam').
        prefix: Prefix for the new columns, e.g. 'home' gives 'homePosition'
            and 'homePercentage'.

    Returns:
        A new frame with the two prefixed columns added. Games with no matching
        ladder row (e.g. Round == -1) get NaN in both.

    Raises:
        pandas.errors.MergeError: If `ladder` has more than one row for a
            (Year, Round, Team) key, which would silently duplicate games.
    """
    position_column = f'{prefix}Position'
    percentage_column = f'{prefix}Percentage'
    return (
        games
        # Dropping earlier results keeps the cell re-runnable; otherwise a second
        # run would collide with the existing columns.
        .drop(columns=[position_column, percentage_column], errors='ignore')
        .merge(
            ladder[['Year', 'Round', 'Team', 'Position', 'Percentage']],
            left_on=['Year', 'Round', team_column],
            right_on=['Year', 'Round', 'Team'],
            how='left',
            validate='many_to_one',
        )
        .rename(columns={'Position': position_column, 'Percentage': percentage_column})
        .drop(columns=['Team'])
    )


# NOTE(review): the join uses the ladder stored under the same Round number as the
# game. In the 2012 preview rows the home percentage equals that game's own score
# ratio, so these look like post-game standings - confirm that is intended before
# using them as pre-game features.

# Merge home team data: match on Year, Round and HomeTeam.
mainDF = _merge_ladder(mainDF, df, 'HomeTeam', 'home')

# Merge away team data: match on Year, Round and AwayTeam.
mainDF = _merge_ladder(mainDF, df, 'AwayTeam', 'away')



In [81]:
# Spot-check the merged ladder columns next to the game identifiers.
mainDF.loc[
    :,
    [
        'Year', 'Round', 'HomeTeam', 'AwayTeam',
        'homePosition', 'homePercentage',
        'awayPosition', 'awayPercentage',
        'Attendance',
    ],
].head()

Unnamed: 0,Year,Round,HomeTeam,AwayTeam,homePosition,homePercentage,awayPosition,awayPercentage,Attendance
0,2012,1,Greater Western Sydney,Sydney,18.0,37.0,1.0,270.3,38203
1,2012,1,Richmond,Carlton,15.0,64.8,4.0,154.3,78285
2,2012,1,Hawthorn,Collingwood,6.0,119.1,13.0,83.9,78466
3,2012,1,Melbourne,Brisbane Lions,14.0,65.5,5.0,152.6,33473
4,2012,1,Gold Coast,Adelaide,17.0,49.6,2.0,201.5,12790


In [83]:
# Write the merged game data to rawData.csv in the working directory, without the index column.
mainDF.to_csv(path_or_buf='rawData.csv', index=False)