# Assessment 1.1: Web Scraping
## By Daniel Thompson
## Student ID: 110382831
## Email: thody038@mymail.unisa.edu.au

### Web Scraping

The following Python code performs web scraping from Wikipedia to extract information about the 2023 AFL season.

In [354]:
import re
import requests
from bs4 import BeautifulSoup

# Wikipedia article for the 2023 AFL season
url = "https://en.wikipedia.org/wiki/2023_AFL_season"

# Send a GET request to the URL. The timeout (in seconds) stops the cell from
# hanging indefinitely if the server never responds.
response = requests.get(url, timeout=30)

# Print the HTTP status code as a quick connection check (200 means success)
print(response.status_code)

200


### Data Extraction Functions

The following functions are defined to extract specific data from the HTML content using BeautifulSoup:

- `extract_day`: Extracts the day from a text row.
- `extract_date`: Extracts the date from a text row.
- `extract_time`: Extracts the time from a text row.
- `extract_home_team`: Extracts the home team from a table cell element.
- `extract_home_score`: Extracts the home team's score from a table cell element.
- `extract_result`: Extracts the match result from a table cell element.
- `extract_away_team`: Extracts the away team from a table cell element.
- `extract_away_score`: Extracts the away team's score from a table cell element.
- `extract_venue`: Extracts the venue from a table cell element.
- `extract_attendance`: Extracts the attendance from a table cell element.


In [355]:
# Build a parse tree from the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(response.text, 'html.parser')


def _is_round_id(element_id):
    """Return True when an id attribute value is present and starts with "Round_"."""
    return bool(element_id) and element_id.startswith("Round_")


# Collect every <span> whose id marks the start of a round section
round_numbers = soup.find_all('span', id=_is_round_id)

# Function to extract day from text
def extract_day(text_row):
    """Return the start of ``text_row`` up to and including "day", or None.

    The pattern is anchored to the beginning of the string and is greedy, so
    the result runs from the first character through the last "day" found on
    the first line of the text.
    """
    found = re.search(r'^.*day', text_row)
    if found is None:
        return None
    return found.group()

# Function to extract date from text
def extract_date(text_row):
    """Return the first "<1-2 digits> <Capitalised word>" in ``text_row``, or None.

    For example, "16 March" is returned from "Thursday, 16 March (7:20 pm)".
    """
    found = re.search(r'[0-9]{1,2} [A-Z]{1}[a-z]+', text_row)
    if found is None:
        return None
    return found.group()

# Function to extract time from text
def extract_time(text_row):
    """Return the first clock time such as "7:20 pm" in ``text_row``, or None.

    A match needs one or two digits, a colon, two digits, one whitespace
    character and then two letters drawn from a/p/m in either case.
    """
    found = re.search(r'[0-9]{1,2}:[0-9]{2}\s[apmAPM]{2}', text_row)
    if found is None:
        return None
    return found.group()

# Function to extract the home team from a table cell element
def extract_home_team(td_element):
    """Return the home team name from a right-aligned table cell, or None.

    Only cells whose ``align`` attribute equals "right" are considered. The
    name is the first run of one to three alphabetic words in the cell text
    and includes the single whitespace character that follows it.
    """
    if td_element.get('align') != 'right':
        return None
    found = re.search(r'[A-Za-z]+(?:\s[A-Za-z]+){0,2}\s', td_element.text)
    if found is None:
        return None
    return found.group()

# Function to extract the home score from a table cell element    
def extract_home_score(td_element):
    """Return the home score, e.g. "8.10 (58)", from a right-aligned cell, or None.

    Only cells whose ``align`` attribute equals "right" are considered. The
    score is matched as "<digits>.<digits> (<digits>)" with one to three
    digits in each position.
    """
    if td_element.get('align') != 'right':
        return None
    found = re.search(r'[0-9]{1,3}\.[0-9]{1,3}\s\([0-9]{1,3}\)', td_element.text)
    if found is None:
        return None
    return found.group()
    
# Function to extract the match result from a table cell element ("def by","def", "drew with")
def extract_result(td_element):
    """Return the stripped text of a centre-aligned cell, or None.

    A cell qualifies only when its ``style`` attribute is exactly
    "text-align:center;"; any other cell yields None.
    """
    if td_element.get('style') != 'text-align:center;':
        return None
    return td_element.text.strip()
    
# Function to extract the away team from a table cell element
def extract_away_team(td_element):
    """Return the away team name from a table cell, or None.

    A cell is considered only when it has no ``align`` attribute, its
    ``style`` is not "font-size: 85%", its ``colspan`` is neither "6" nor
    "3", and it contains at least one link. The name is the first run of one
    to three alphabetic words in the cell text and includes the single
    whitespace character that follows it.
    """
    if td_element.has_attr('align'):
        return None
    if td_element.get('style') == 'font-size: 85%':
        return None
    if td_element.get('colspan') in ('6', '3'):
        return None
    # Cells without any link are skipped
    if not td_element.find('a'):
        return None
    found = re.search(r'[A-Za-z]+(?:\s[A-Za-z]+){0,2}\s', td_element.text)
    if found is None:
        return None
    return found.group()

# Function to extract the away score from a table cell element    
def extract_away_score(td_element):
    """Return the away score, e.g. "8.10 (58)", from a table cell, or None.

    A cell is considered only when it has no ``align`` attribute, its
    ``style`` is not "font-size: 85%", and it contains at least one link.
    The score is matched as "<digits>.<digits> (<digits>)" with one to three
    digits in each position.
    """
    if td_element.has_attr('align'):
        return None
    if td_element.get('style') == 'font-size: 85%':
        return None
    # Cells without any link are skipped
    if not td_element.find('a'):
        return None
    found = re.search(r'[0-9]{1,3}\.[0-9]{1,3}\s\([0-9]{1,3}\)', td_element.text)
    if found is None:
        return None
    return found.group()

# Function to extract the venue from a table cell element    
def extract_venue(td_element):
    """Return the venue name from a ``font-size: 85%`` table cell, or None.

    A cell is considered only when its ``style`` attribute is exactly
    "font-size: 85%" and it contains at least one link. The venue is the
    first run of one to three alphabetic words in the cell text, so longer
    names are cut off after the third word. The returned string includes the
    single whitespace character that follows the name.

    Returns None when the cell does not qualify, when nothing in the text
    matches, or when the match is exactly "ReportStats\\n".
    """
    if td_element.get('style') != 'font-size: 85%':
        return None
    # Cells without any link are skipped
    if not td_element.find('a'):
        return None

    match = re.search(r'([A-Za-z]+(?:\s[A-Za-z]+){0,2})\s', td_element.text)
    # A cell with no word followed by whitespace used to raise AttributeError
    # here, because the match was dereferenced before being checked.
    if match is None:
        return None

    venue = match.group()
    # Presumably the cell holding the report/stats links, which shares the
    # same style but is not a venue.
    if venue == "ReportStats\n":
        return None
    return venue

# Function to extract the attendance from a table cell element
def extract_attendance(td_element):
    """Return a comma-grouped attendance figure such as "88,084", or None.

    Only cells whose ``style`` attribute is exactly "font-size: 85%" are
    considered. The figure must have one to three digits, a comma and three
    more digits, so numbers written without a comma are not matched.
    """
    if td_element.get('style') != 'font-size: 85%':
        return None
    found = re.search(r'([0-9]{1,3},[0-9]{3})', td_element.text)
    if found is None:
        return None
    return found.group()

### Data Extraction

This section visits each round's table cell by cell and applies every extraction function to each cell. Whenever a function returns data, that value is printed for inspection.

In [356]:
# Extractors that work on a cell's plain text, in the order their results are printed
text_extractors = (extract_day, extract_date, extract_time)

# Extractors that inspect the <td> element itself, in the order their results are printed
cell_extractors = (
    extract_home_team,
    extract_home_score,
    extract_result,
    extract_away_team,
    extract_away_score,
    extract_venue,
    extract_attendance,
)

# Iterate through each round heading (named so the builtin round() is not shadowed)
for round_heading in round_numbers:
    # The heading's text is the round name
    round_name = round_heading.text
    print(round_name)

    # Find the next table after the current round heading
    table = round_heading.find_next('table')
    if table is None:
        # No table follows this heading, so there is nothing to extract
        continue

    # Visit every cell of the table and print each value an extractor returns
    for cell in table.find_all('td'):
        cell_text = cell.text
        for extractor in text_extractors:
            value = extractor(cell_text)
            if value:
                print(value)
        for extractor in cell_extractors:
            value = extractor(cell)
            if value:
                print(value)





Round 1
Thursday
16 March
7:20 pm
Richmond 
8.10 (58)
drew with
Carlton 
8.10 (58)
Melbourne Cricket Ground 
88,084
Friday
17 March
7:40 pm
Geelong 
16.7 (103)
def. by
Collingwood 
19.11 (125)
Melbourne Cricket Ground 
86,595
Saturday
18 March
1:45 pm
North Melbourne 
12.15 (87)
def.
West Coast 
12.10 (82)
Marvel Stadium 
21,274
Saturday
18 March
4:05 pm
Port Adelaide 
18.18 (126)
def.
Brisbane Lions 
11.6 (72)
Adelaide Oval 
34,255
Saturday
18 March
7:25 pm
Melbourne 
17.13 (115)
def.
Western Bulldogs 
9.11 (65)
Melbourne Cricket Ground 
48,103
Saturday
18 March
7:00 pm
Gold Coast 
9.7 (61)
def. by
Sydney 
16.14 (110)
Heritage Bank Stadium 
13,648
Sunday
19 March
1:10 pm
Greater Western Sydney 
15.16 (106)
def.
Adelaide 
12.18 (90)
Giants Stadium 
8,169
Sunday
19 March
3:20 pm
Hawthorn 
9.11 (65)
def. by
Essendon 
19.10 (124)
Melbourne Cricket Ground 
68,691
Sunday
19 March
4:40 pm
St Kilda 
10.7 (67)
def.
Fremantle 
7.10 (52)
Marvel Stadium 
23,429
Round 2
Thursday
23 March
7:20 pm
C

### CSV Writing

This section writes the extracted data into a CSV file named "extracted_data.csv". It iterates through each round's data, extracts relevant details, and writes them into the CSV file.


In [357]:
import csv

# Define the filename for the CSV file
csv_filename = "extracted_data.csv"

# One entry per CSV column after "Round": the extractor to call and whether it
# takes the cell's text (True) or the <td> element itself (False)
field_extractors = [
    (extract_day, True),
    (extract_date, True),
    (extract_time, True),
    (extract_home_team, False),
    (extract_home_score, False),
    (extract_result, False),
    (extract_away_team, False),
    (extract_away_score, False),
    (extract_venue, False),
    (extract_attendance, False),
]

# Values collected so far for the match currently being assembled
pending = [None] * len(field_extractors)

# Open the CSV file in write mode. newline='' is required by the csv module so
# that the writer controls line endings itself (otherwise extra blank rows can
# appear on platforms that translate "\n").
with open(csv_filename, 'w', newline='') as file:
    # Create a CSV writer object
    csv_writer = csv.writer(file)

    # Write header rows for men's data
    csv_writer.writerow(["Men's"])
    csv_writer.writerow(["Round", "Day", "Date", "Time", "Home Team", "Home Score", "Result", "Away Team", "Away Score", "Venue", "Attendance"])

    # Iterate through each round heading (named so the builtin round() is not shadowed)
    for round_heading in round_numbers:
        round_name = round_heading.text
        table = round_heading.find_next('table')
        if table is None:
            # No table follows this heading, so there is nothing to extract
            continue

        for cell in table.find_all('td'):
            # Fill in only the fields that are still missing for the current match
            for index, (extractor, takes_text) in enumerate(field_extractors):
                if pending[index] is None:
                    pending[index] = extractor(cell.text if takes_text else cell)

            # Once every field has a value, write the match and start a new one.
            # NOTE(review): partially filled values are carried into the next
            # round's table rather than discarded - confirm this is intended.
            if all(value is not None for value in pending):
                csv_writer.writerow([round_name] + pending)
                pending = [None] * len(field_extractors)



    

### Web Scraping for AFL Women's Season

The following Python code performs web scraping from Wikipedia to extract information about the 2023 AFL Women's season.

In [358]:
# Wikipedia article for the 2023 AFL Women's season
url = "https://en.wikipedia.org/wiki/2023_AFL_Women%27s_season"

# Send a GET request to the URL. The timeout (in seconds) stops the cell from
# hanging indefinitely if the server never responds.
response = requests.get(url, timeout=30)

# Print the HTTP status code as a quick connection check (200 means success)
print(response.status_code)

200


### BeautifulSoup Parsing for AFL Women's Season

The following code block utilizes BeautifulSoup to parse the HTML content retrieved from the Wikipedia page for the 2023 AFL Women's season. It finds all the elements with a `span` tag whose `id` attribute starts with "Round_" and stores them in the variable `round_numbers`.

In [359]:
# Build a parse tree from the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(response.text, 'html.parser')


def _is_round_id(element_id):
    """Return True when an id attribute value is present and starts with "Round_"."""
    return bool(element_id) and element_id.startswith("Round_")


# Collect every <span> whose id marks the start of a round section
round_numbers = soup.find_all('span', id=_is_round_id)

### Data Extraction for AFL Women's Season

As with the men's season, this section visits each round's table cell by cell and applies every extraction function to each cell. Whenever a function returns data, that value is printed for inspection.


In [360]:
# Extractors that work on a cell's plain text, in the order their results are printed
text_extractors = (extract_day, extract_date, extract_time)

# Extractors that inspect the <td> element itself, in the order their results are printed
cell_extractors = (
    extract_home_team,
    extract_home_score,
    extract_result,
    extract_away_team,
    extract_away_score,
    extract_venue,
    extract_attendance,
)

# Iterate through each round heading (named so the builtin round() is not shadowed)
for round_heading in round_numbers:
    # The heading's text is the round name
    round_name = round_heading.text
    print(round_name)

    # Find the next table after the current round heading
    table = round_heading.find_next('table')
    if table is None:
        # No table follows this heading, so there is nothing to extract
        continue

    # Visit every cell of the table and print each value an extractor returns
    for cell in table.find_all('td'):
        cell_text = cell.text
        for extractor in text_extractors:
            value = extractor(cell_text)
            if value:
                print(value)
        for extractor in cell_extractors:
            value = extractor(cell)
            if value:
                print(value)



Round 1
Friday
1 September
7:20 pm
Melbourne 
10.13 (73)
def.
Collingwood 
4.7 (31)
Ikon Park 
8,412
Saturday
2 September
1:05 pm
Carlton 
5.4 (34)
def.
Gold Coast 
4.8 (32)
Ikon Park 
3,244
Saturday
2 September
2:35 pm
Adelaide 
8.10 (58)
def.
Port Adelaide 
4.4 (28)
Norwood Oval 
8,722
Saturday
2 September
5:05 pm
Geelong 
10.5 (65)
def.
Western Bulldogs 
2.5 (17)
GMHBA Stadium 
4,404
Saturday
2 September
7:15 pm
Hawthorn 
4.6 (30)
def. by
Essendon 
7.7 (49)
Kinetic Stadium 
3,722
Sunday
3 September
1:05 pm
North Melbourne 
8.8 (56)
def.
St Kilda 
2.4 (16)
Blundstone Arena 
2,454
Sunday
3 September
3:05 pm
Brisbane 
5.4 (34)
def. by
Richmond 
6.4 (40)
Brighton Homes Arena 
3,153
Sunday
3 September
3:05 pm
Sydney 
7.9 (51)
def.
Greater Western Sydney 
7.4 (46)
North Sydney Oval 
5,474
Sunday
3 September
3:05 pm
Fremantle 
4.3 (27)
def.
West Coast 
2.7 (19)
Fremantle Community Bank 
3,790
Round 2
Friday
8 September
5:05 pm
Richmond 
2.0 (12)
def. by
Adelaide 
6.5 (41)
Ikon Park 
1,083


### CSV Writing for AFL Women's Season

This section appends the extracted data into the same CSV file for the AFL Women's season. The Men's and Women's seasons are flagged and given their own sets of headers. It iterates through each round's data, extracts relevant details, and writes them into the CSV file.

In [361]:
# Define the filename for the CSV file
csv_filename = "extracted_data.csv"

# One entry per CSV column after "Round": the extractor to call and whether it
# takes the cell's text (True) or the <td> element itself (False)
field_extractors = [
    (extract_day, True),
    (extract_date, True),
    (extract_time, True),
    (extract_home_team, False),
    (extract_home_score, False),
    (extract_result, False),
    (extract_away_team, False),
    (extract_away_score, False),
    (extract_venue, False),
    (extract_attendance, False),
]

# Values collected so far for the match currently being assembled
pending = [None] * len(field_extractors)

# Open the CSV file in append mode so the existing contents are kept.
# newline='' is required by the csv module so that the writer controls line
# endings itself (otherwise extra blank rows can appear on platforms that
# translate "\n").
with open(csv_filename, 'a', newline='') as file:
    # Create a CSV writer object
    csv_writer = csv.writer(file)

    # Write header rows for women's data
    csv_writer.writerow(["Women's"])
    csv_writer.writerow(["Round", "Day", "Date", "Time", "Home Team", "Home Score", "Result", "Away Team", "Away Score", "Venue", "Attendance"])

    # Iterate through each round heading (named so the builtin round() is not shadowed)
    for round_heading in round_numbers:
        round_name = round_heading.text

        # Find the table associated with the current round
        table = round_heading.find_next('table')
        if table is None:
            # No table follows this heading, so there is nothing to extract
            continue

        # Visit every cell of the table
        for cell in table.find_all('td'):
            # Fill in only the fields that are still missing for the current match
            for index, (extractor, takes_text) in enumerate(field_extractors):
                if pending[index] is None:
                    pending[index] = extractor(cell.text if takes_text else cell)

            # Once every field has a value, write the match and start a new one.
            # NOTE(review): partially filled values are carried into the next
            # round's table rather than discarded - confirm this is intended.
            if all(value is not None for value in pending):
                csv_writer.writerow([round_name] + pending)
                pending = [None] * len(field_extractors)