### WARNING
Due to mismatches in the spelling of player first names, it is recommended not to run this notebook. Instead, download the JSON file from GitHub, in which most of the names have been corrected (approximately 27 players, i.e. instances in the training data, who received votes were left unmatched).

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import random
import json

In [2]:
# Lookup table: club name as it appears in an <img> tag's ``alt`` text on the
# scraped page (see get_teams_dict_list) -> space-free identifier used to
# build the per-match keys.
TEAM_MAP = {
    "Adelaide Crows": "Adelaide",
    "Brisbane Lions": "Brisbane",
    "Carlton": "Carlton",
    "Collingwood": "Collingwood",
    "Essendon": "Essendon",
    "Fremantle": "Fremantle",
    "Geelong Cats": "Geelong",
    "Gold Coast Suns": "GoldCoast",
    "GWS Giants": "GWS",
    "Hawthorn": "Hawthorn",
    "North Melbourne": "NorthMelbourne",
    "Melbourne": "Melbourne",
    "Port Adelaide": "PortAdelaide",
    "Richmond": "Richmond",
    "St Kilda": "StKilda",
    "Sydney Swans": "Sydney",
    "West Coast Eagles": "WestCoast",
    "Western Bulldogs": "WesternBulldogs",
}

In [3]:
def get_teams_dict_list(soup):
    """Return one ``'<team1> <team2>'`` key per match block found in ``soup``.

    A match block is a ``<div>`` whose class list is exactly
    ``['row', 'votes-by-match', 'py-3']``. The two team names are read from the
    ``alt`` text of the next two ``<img>`` tags that follow that div in the
    document, and are translated through ``TEAM_MAP``.

    Args:
        soup: Parsed page (BeautifulSoup object) for one round.

    Returns:
        list[str]: Match keys, in document order.

    Raises:
        KeyError: If an ``alt`` text is not a key of ``TEAM_MAP``.
    """
    match_keys = []

    for div in soup.find_all('div'):
        # ``div["class"]`` raises KeyError for a <div> that has no class
        # attribute; ``.get`` returns None instead, which simply fails the
        # comparison and lets the scan continue.
        if div.get('class') != ['row', 'votes-by-match', 'py-3']:
            continue

        first_img = div.find_next('img')
        second_img = first_img.find_next('img')

        team1 = TEAM_MAP[first_img['alt']]
        team2 = TEAM_MAP[second_img['alt']]

        match_keys.append(f'{team1} {team2}')

    return match_keys

In [4]:
def get_votes_from_game(game_data):
    """Helper to get ``{player: votes}`` for a single game.

    ``game_data`` is converted with ``str()`` and scanned for four-line spans
    that start at ``<strong>`` and end at ``<span``. From each span the player
    name (the text between a tab and `` <span``) and the first 1-2 digit number
    following a ``>`` are extracted.

    Args:
        game_data: Anything whose ``str()`` is the HTML of one game's vote
            block (a BeautifulSoup tag or a plain string).

    Returns:
        dict[str, int]: Lower-cased player name -> votes. If the same name
        appears more than once, the last occurrence wins.

    Raises:
        IndexError: If a span contains no parsable player name or vote count
            (for example a name with characters outside
            ``A-Z a-z space ' -``).
    """
    game_votes_dict = {}

    for entry in re.findall(r'<strong>.*\n.*\n.*\n.*<span', str(game_data)):
        # Capture the name with a group instead of trimming the whole match
        # with ``.strip('<span')``: str.strip treats its argument as a *set of
        # characters*, so it also ate a leading 's', 'p', 'a', 'n' or '<' from
        # the name itself (e.g. 'sam walsh' became 'm walsh').
        player = re.findall(r'\t([A-Za-z \']*[-]?[A-Za-z \']*) <span', entry)[0]
        votes = re.findall(r'>([0-9]{1,2})', entry)[0]

        game_votes_dict[player.strip(' ').lower()] = int(votes)

    return game_votes_dict

In [5]:
def get_game_vote_dict_list(soup):
    """Return one ``{player: votes}`` dict per vote block found in ``soup``.

    A vote block is a ``<div>`` whose class list is exactly
    ``['row', 'mb-3']``; each one is parsed by ``get_votes_from_game``.

    Args:
        soup: Parsed page (BeautifulSoup object) for one round.

    Returns:
        list[dict[str, int]]: Vote dicts, in document order.
    """
    game_vote_dict_list = []

    for div in soup.find_all('div'):
        # ``div["class"]`` raises KeyError for a <div> that has no class
        # attribute; ``.get`` returns None, which never equals the target list.
        if div.get('class') == ['row', 'mb-3']:
            game_vote_dict_list.append(get_votes_from_game(div))

    return game_vote_dict_list

In [6]:
def get_year_all_votes(year, last_round=23, timeout=30):
    """Scrape the votes for rounds ``1..last_round`` of one season.

    For each round the leaderboard page is downloaded and parsed, and the
    match keys from ``get_teams_dict_list`` are paired by position with the
    vote dicts from ``get_game_vote_dict_list``.

    Args:
        year: Season, interpolated into the leaderboard URL.
        last_round: Highest round number to request (inclusive). Defaults to
            23, the value that was previously hard-coded.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        dict[int, dict[str, dict[str, int]]]:
        ``{round_number: {'<team1> <team2>': {player: votes}}}``.

    Raises:
        requests.exceptions.RequestException: On connection failure/timeout.
        IndexError: If a page yields more match keys than vote blocks.

    Note:
        The HTTP status code is not checked; whatever body is returned is
        parsed, so a page without the expected divs yields an empty dict for
        that round.
    """
    year_all_votes = {}

    # ``round_number`` rather than ``round`` so the builtin is not shadowed.
    for round_number in range(1, last_round + 1):
        url = f'https://aflcoaches.com.au/awards/the-aflca-champion-player-of-the-year-award/leaderboard/{year}/{year}01{str(round_number).zfill(2)}'
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.text, 'html.parser')

        teams_dict_list = get_teams_dict_list(soup)
        game_vote_dict_list = get_game_vote_dict_list(soup)

        # Pair by position: the i-th match key belongs to the i-th vote block.
        year_all_votes[round_number] = {
            teams_dict_list[i]: game_vote_dict_list[i]
            for i in range(len(teams_dict_list))
        }

        # Random pause between requests to avoid hammering the server.
        time.sleep(random.uniform(0.5, 5))

    return year_all_votes


In [7]:
# Scrape every season from 2015 through 2022 (the range end is exclusive) and
# collect the results keyed by year. Each season triggers one HTTP request per
# round followed by a random pause, so this cell takes a while to run.
all_votes = dict()
for year in range(2015, 2023):
    all_votes[year] = get_year_all_votes(year)

In [8]:
# Persist the scraped votes as pretty-printed JSON. Note that JSON object keys
# are always strings, so the integer year and round keys are written as
# strings and will come back as strings when the file is loaded.
with open('../data/raw/AFLCA_votes.json', 'w') as f:
    json.dump(all_votes, f, indent=4)