# Assessment 1.1: Web Scraping
## By Daniel Thompson
## Student ID: 110382831
## Email: thody038@mymail.unisa.edu.au

### Web Scraping

The following Python code performs web scraping from Wikipedia to extract information about the 2023 AFL season.

In [7]:
import re
import requests
from bs4 import BeautifulSoup
import csv


In [8]:
# Start from an empty list so `urls` exists even before the file is read
urls = []
# inputs.txt is read line by line; every non-blank line is kept, trimmed of
# surrounding whitespace, in file order
text_filename = "inputs.txt"
with open(text_filename, 'r') as file:
    urls = [entry for entry in map(str.strip, file) if entry]




In [9]:
# Function to collect the round headings of a parsed page
def extract_rounds(soup):
    """Return every <span> element whose id attribute starts with "Round_".

    soup: object exposing find_all() (the caller in this file passes a
        BeautifulSoup document).
    Returns whatever soup.find_all() returns for that filter.
    """

    def has_round_id(span_id):
        # A missing/empty id is falsy, so startswith() is only reached for real strings
        return span_id and span_id.startswith("Round_")

    return soup.find_all('span', id=has_round_id)

# Function to extract the weekday name from the start of a cell's text
def extract_day(text_row):
    """Return the weekday name that text_row begins with, or None.

    Only the seven real weekday names are accepted, and only at the very
    start of the text. The previous pattern (^.*day) was greedy and matched
    any first-line text containing "day", so free text such as "Today" or
    "Match moved to Sunday" was returned as if it were a day.

    text_row: text of one table cell.
    Returns the weekday (e.g. "Thursday") or None when the text does not
    start with one.
    """
    day_pattern = r'^(?:Mon|Tues|Wednes|Thurs|Fri|Satur|Sun)day\b'
    match = re.search(day_pattern, text_row)
    return match.group() if match else None

# Function to pull a "<day-of-month> <Month>" date out of a cell's text
def extract_date(text_row):
    """Return the first "<1-2 digits> <Capitalised word>" found in text_row.

    text_row: text of one table cell.
    Returns the matched text (e.g. "16 March") or None when nothing matches.
    """
    found = re.search(r'[0-9]{1,2} [A-Z]{1}[a-z]+', text_row)
    if found is None:
        return None
    return found.group()

# Function to pull a 12-hour clock time out of a cell's text
def extract_time(text_row):
    """Return the first "H:MM xx" / "HH:MM xx" time found in text_row.

    The two trailing characters may be any mix of a/p/m in either case and
    must be separated from the minutes by exactly one whitespace character.

    text_row: text of one table cell.
    Returns the matched text (e.g. "7:20 pm") or None when nothing matches.
    """
    found = re.search(r'[0-9]{1,2}:[0-9]{2}\s[apmAPM]{2}', text_row)
    if found is None:
        return None
    return found.group()

# Function to extract the home team from a table cell element
def extract_home_team(td_element):
    """Return the home team name held in a right-aligned cell, or None.

    A name is one to three alphabetic words that are followed by a
    whitespace character. The whitespace is required for a match but is not
    part of the returned value (it used to be returned too, giving values
    such as "Richmond " with a trailing space).

    td_element: table cell exposing get() and text.
    Returns the team name, or None when the cell is not align="right" or
    holds no name.
    """
    # Only cells with align="right" are treated as home-team cells
    if td_element.get('align') != 'right':
        return None

    team_pattern = r'([A-Za-z]+(?:\s[A-Za-z]+){0,2})\s'
    match = re.search(team_pattern, td_element.text)
    return match.group(1) if match else None

# Function to extract the home score from a table cell element
def extract_home_score(td_element):
    """Return the "goals.behinds (total)" score held in a right-aligned cell.

    td_element: table cell exposing get() and text.
    Returns the matched score text (e.g. "8.10 (58)"), or None when the cell
    is not align="right" or holds no score.
    """
    # Only cells with align="right" are treated as home-team cells
    if td_element.get('align') != 'right':
        return None

    found = re.search(r'[0-9]{1,3}\.[0-9]{1,3}\s\([0-9]{1,3}\)', td_element.text)
    if found is None:
        return None
    return found.group()
    
# Function to extract the match result from a table cell element
def extract_result(td_element):
    """Return the stripped text of a centre-aligned cell, or None.

    A cell qualifies only when its style attribute is exactly
    'text-align:center;'. The original note lists "def by", "def" and
    "drew with" as the result phrases expected there.

    td_element: table cell exposing get() and text.
    """
    is_result_cell = td_element.get('style') == 'text-align:center;'
    if not is_result_cell:
        return None
    return td_element.text.strip()
    
# Function to extract the away team from a table cell element
def extract_away_team(td_element):
    """Return the away team name held in a plain linked cell, or None.

    A cell is considered only when it has no align attribute, its style is
    not 'font-size: 85%', its colspan is neither '6' nor '3', and it
    contains an <a> element. A name is one to three alphabetic words that
    are followed by a whitespace character. The whitespace is required for
    a match but is not part of the returned value (it used to be returned
    too, giving values such as "Carlton " with a trailing space).

    td_element: table cell exposing has_attr(), get(), find() and text.
    Returns the team name, or None (always explicitly) when the cell does
    not qualify or holds no name.
    """
    # Cells with an align attribute are excluded (extract_home_team reads align="right" cells)
    if td_element.has_attr('align'):
        return None
    # 'font-size: 85%' cells are the ones read by extract_venue / extract_attendance
    if td_element.get('style') == 'font-size: 85%':
        return None
    # Cells with colspan '6' or '3' are skipped
    if td_element.get('colspan') in ('6', '3'):
        return None
    # A cell without a link is skipped
    if not td_element.find('a'):
        return None

    team_pattern = r'([A-Za-z]+(?:\s[A-Za-z]+){0,2})\s'
    match = re.search(team_pattern, td_element.text)
    return match.group(1) if match else None

# Function to extract the away score from a table cell element
def extract_away_score(td_element):
    """Return the "goals.behinds (total)" score held in a plain linked cell.

    A cell is considered only when it has no align attribute, its style is
    not 'font-size: 85%', and it contains an <a> element.

    td_element: table cell exposing has_attr(), get(), find() and text.
    Returns the matched score text (e.g. "8.10 (58)"), or None when the cell
    does not qualify or holds no score.
    """
    has_alignment = td_element.has_attr('align')
    if has_alignment or td_element.get('style') == 'font-size: 85%':
        return None

    # A cell without a link is skipped
    if not td_element.find('a'):
        return None

    found = re.search(r'[0-9]{1,3}\.[0-9]{1,3}\s\([0-9]{1,3}\)', td_element.text)
    if found is None:
        return None
    return found.group()

# Function to extract the venue from a table cell element
def extract_venue(td_element):
    """Return the venue name held in a 'font-size: 85%' linked cell, or None.

    A venue is the first run of one to three alphabetic words that is
    followed by a whitespace character. The whitespace is required for a
    match but is not part of the returned value. A cell whose first such
    run is "ReportStats" is not a venue and yields None.

    Previously a cell with no match raised AttributeError, because the
    match object was used before being checked for None.

    td_element: table cell exposing get(), find() and text.
    Returns the venue name, or None (always explicitly) when the cell does
    not qualify or holds no venue.
    """
    if td_element.get('style') != 'font-size: 85%':
        return None
    # A cell without a link is skipped
    if not td_element.find('a'):
        return None

    venue_pattern = r'([A-Za-z]+(?:\s[A-Za-z]+){0,2})\s'
    match = re.search(venue_pattern, td_element.text)
    if match is None:
        return None

    # group(1) excludes the trailing whitespace, so the "ReportStats" test no
    # longer depends on which whitespace character follows it
    venue = match.group(1)
    if venue == "ReportStats":
        return None
    return venue

# Function to extract the attendance from a table cell element
def extract_attendance(td_element):
    """Return the first comma-grouped number in a 'font-size: 85%' cell.

    The number must look like "<1-3 digits>,<3 digits>", so figures written
    without a thousands separator are not matched.

    td_element: table cell exposing get() and text.
    Returns the matched text (e.g. "88,084"), or None when the cell does not
    qualify or holds no such number.
    """
    if td_element.get('style') != 'font-size: 85%':
        return None

    found = re.search(r'([0-9]{1,3},[0-9]{3})', td_element.text)
    if found is None:
        return None
    return found.group()

In [10]:
   
def data_extraction(round_numbers):
    """Print every field found under each round heading, as a visual check.

    For each element of round_numbers this prints its text, looks up the
    next <table> after it, and runs every extract_* function over each <td>
    of that table. Truthy results are printed followed by a comma, except
    the attendance, which is printed followed by a newline.

    round_numbers: iterable of elements exposing text and find_next()
        (the caller passes the result of extract_rounds()).
    Returns None. Errors are reported with print() and never raised.
    """
    # (extractor, takes_text) pairs in print order; the first three read the
    # cell's text, the others inspect the cell element itself
    comma_separated = (
        (extract_day, True),
        (extract_date, True),
        (extract_time, True),
        (extract_home_team, False),
        (extract_home_score, False),
        (extract_result, False),
        (extract_away_team, False),
        (extract_away_score, False),
        (extract_venue, False),
    )

    for round_heading in round_numbers:
        try:
            # The heading's text is the round name
            print(round_heading.text)

            # The first table after the heading is scanned cell by cell
            match_table = round_heading.find_next('table')

            for cell in match_table.find_all('td'):
                try:
                    for extractor, takes_text in comma_separated:
                        value = extractor(cell.text if takes_text else cell)
                        if value:
                            print(value, end=',')

                    # Attendance is printed with a newline rather than a comma
                    crowd = extract_attendance(cell)
                    if crowd:
                        print(crowd)
                except Exception as e:
                    print(f"Error processing row: {e}")
        except Exception as e:
            print(f"Error processing round: {e}")


In [11]:
def write_to_csv(round_numbers, count):
    """Write one CSV row per fully-extracted match to extracted_data.csv.

    For each element of round_numbers, the <td> cells of the next <table>
    are scanned in order. Each field keeps the first non-None value found
    for it. Once all ten fields are filled, a row is written and the fields
    are cleared for the next match.

    The fields are also cleared at the start of every round. Previously they
    were initialised once per call, so an incomplete match at the end of one
    round leaked its values into the first row of the following round.

    round_numbers: iterable of elements exposing text and find_next()
        (the caller passes the result of extract_rounds()).
    count: number of earlier calls; 0 overwrites the file, anything else
        appends to it.
    Returns None. Errors are reported with print() and never raised.

    NOTE: the header row is written on every call, so a file built from
    several calls contains one header row per call.
    """
    # Define the filename for the CSV file
    csv_filename = "extracted_data.csv"
    header = ["Round", "Day", "Date", "Time", "Home Team", "Home Score", "Result", "Away Team", "Away Score", "Venue", "Attendance"]

    # (extractor, takes_text) pairs in column order; the first three read the
    # cell's text, the others inspect the cell element itself
    extractors = (
        (extract_day, True),
        (extract_date, True),
        (extract_time, True),
        (extract_home_team, False),
        (extract_home_score, False),
        (extract_result, False),
        (extract_away_team, False),
        (extract_away_score, False),
        (extract_venue, False),
        (extract_attendance, False),
    )

    # Overwrite on the first call, append on every later call
    mode = 'w' if count == 0 else 'a'

    try:
        with open(csv_filename, mode, newline='') as file:
            csv_writer = csv.writer(file)
            csv_writer.writerow(header)

            # Iterate through each round in the list of round numbers
            for round_heading in round_numbers:
                # A match never spans two rounds: drop any half-collected fields
                pending = [None] * len(extractors)
                try:
                    round_name = round_heading.text
                    table = round_heading.find_next('table')

                    for cell in table.find_all('td'):
                        try:
                            # Fill only the fields that are still missing
                            for index, (extractor, takes_text) in enumerate(extractors):
                                if pending[index] is None:
                                    pending[index] = extractor(cell.text if takes_text else cell)

                            # A complete set of fields is one match: write it and start over
                            if all(value is not None for value in pending):
                                csv_writer.writerow([round_name, *pending])
                                pending = [None] * len(extractors)
                        except Exception as e:
                            print(f"Error processing row: {e}")
                except Exception as e:
                    print(f"Error processing round: {e}")
    except Exception as e:
        print(f"Error writing to CSV file: {e}")


In [12]:

# Number of pages written so far: write_to_csv() overwrites the CSV when this
# is 0 and appends to it otherwise
count = 0

# Iterate through the list of URLs
for url in urls:

    try:
        # Send a GET request to the URL. Without a timeout, requests waits
        # indefinitely on a server that stops responding.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Error during request: {e}")
        continue

    # Skip pages that did not load. Parsing an error page would find no
    # rounds, yet still write a header (and, on the first URL, wipe the CSV).
    if response.status_code != 200:
        print(f"Failed to connect. Status code: {response.status_code}")
        continue
    print("Connection successful")

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect the round headings that the following functions iterate over
    round_numbers = extract_rounds(soup)

    # Print the data as a confidence check that it is being extracted correctly
    data_extraction(round_numbers)
    # If count == 0, writes to CSV, else appends to CSV
    write_to_csv(round_numbers, count)
    count += 1



Connection successful
Round 1
Thursday,16 March,7:20 pm,Richmond ,8.10 (58),drew with,Carlton ,8.10 (58),Melbourne Cricket Ground ,88,084
Friday,17 March,7:40 pm,Geelong ,16.7 (103),def. by,Collingwood ,19.11 (125),Melbourne Cricket Ground ,86,595
Saturday,18 March,1:45 pm,North Melbourne ,12.15 (87),def.,West Coast ,12.10 (82),Marvel Stadium ,21,274
Saturday,18 March,4:05 pm,Port Adelaide ,18.18 (126),def.,Brisbane Lions ,11.6 (72),Adelaide Oval ,34,255
Saturday,18 March,7:25 pm,Melbourne ,17.13 (115),def.,Western Bulldogs ,9.11 (65),Melbourne Cricket Ground ,48,103
Saturday,18 March,7:00 pm,Gold Coast ,9.7 (61),def. by,Sydney ,16.14 (110),Heritage Bank Stadium ,13,648
Sunday,19 March,1:10 pm,Greater Western Sydney ,15.16 (106),def.,Adelaide ,12.18 (90),Giants Stadium ,8,169
Sunday,19 March,3:20 pm,Hawthorn ,9.11 (65),def. by,Essendon ,19.10 (124),Melbourne Cricket Ground ,68,691
Sunday,19 March,4:40 pm,St Kilda ,10.7 (67),def.,Fremantle ,7.10 (52),Marvel Stadium ,23,429
Round 2
Thurs