In [1]:
# Import essential libraries

import glob

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# World Athletics 2023 senior men's outdoor top lists for Nigeria (region=ngr),
# one URL per event; each literal is split into its path and its query string.

url_200_m = (
    'https://worldathletics.org/records/toplists/sprints/200-metres/outdoor/men/senior/2023'
    '?regionType=countries&region=ngr&timing=electronic&windReading=regular&page=1&bestResultsOnly=false'
)
url_400_m = (
    'https://worldathletics.org/records/toplists/sprints/400-metres/outdoor/men/senior/2023'
    '?regionType=countries&region=ngr&timing=electronic&page=1&bestResultsOnly=false'
)
url_110_mh = (
    'https://worldathletics.org/records/toplists/hurdles/110-metres-hurdles/outdoor/men/senior/2023'
    '?regionType=countries&region=ngr&timing=electronic&windReading=regular&page=1&bestResultsOnly=false'
)
url_400_mh = (
    'https://worldathletics.org/records/toplists/hurdles/400-metres-hurdles/outdoor/men/senior/2023'
    '?regionType=countries&region=ngr&timing=electronic&page=1&bestResultsOnly=false'
)

In [3]:
# Break the URL on '/' to see which positions hold the event category and name
url_400_mh.split(sep='/')

['https:',
 '',
 'worldathletics.org',
 'records',
 'toplists',
 'hurdles',
 '400-metres-hurdles',
 'outdoor',
 'men',
 'senior',
 '2023?regionType=countries&region=ngr&timing=electronic&page=1&bestResultsOnly=false']

In [4]:
def scrape_web(url, timeout=30):
    """
    Download a page and save its raw HTML in the current working directory.

    The output file name is built from the pieces of ``url`` at indices 5
    and 6 after splitting on '/', joined with '-' and suffixed with '.html'
    (e.g. '.../toplists/sprints/200-metres/...' -> 'sprints-200-metres.html').

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        None

    Raises:
        IndexError: if ``url`` has fewer than seven '/'-separated pieces.
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
    """

    # Work out the file name first so a malformed URL fails before any download
    url_parts = url.split('/')
    file_name = f'{url_parts[5]}-{url_parts[6]}.html'

    # Without a timeout requests can wait forever on an unresponsive server
    response = requests.get(url, timeout=timeout)

    # Stop here on 4xx/5xx so an error page is never saved as if it were data
    response.raise_for_status()

    # Write the bytes exactly as received
    with open(file_name, mode='wb') as file:
        file.write(response.content)

In [5]:
# Download every event page, writing one HTML file per URL
for event_url in (url_200_m, url_400_m, url_110_mh, url_400_mh):
    scrape_web(event_url)

In [6]:
# List the .html files in the current directory; glob returns matches in
# arbitrary order, so sort them to keep the listing reproducible
for html_file in sorted(glob.glob('*.html')):
    print(html_file)

hurdles-110-metres-hurdles.html
hurdles-400-metres-hurdles.html
sprints-200-metres.html
sprints-400-metres.html


In [7]:
# Turn one saved HTML file into a dataframe

def form_df(data):

    """
    Read the 'records-table' table from a saved HTML file into a DataFrame.

    Column names come from the stripped text of every <th> inside the table.
    Each <tr> after the first contributes one row made of the stripped text
    of its <td> cells; rows without any <td> are skipped.

    Args:
        data: path of the HTML file to parse.

    Returns:
        df: DataFrame holding the table contents as strings.

    Raises:
        ValueError: if the file has no <table class="records-table">, or if
            a row's cell count does not match the number of column headers.
    """

    # Binary mode: let BeautifulSoup work out the encoding from the document
    # instead of decoding with the platform's default text encoding
    with open(data, mode='rb') as file:
        soup = BeautifulSoup(file, 'lxml')  # Make a soup

    table = soup.find('table', class_="records-table")  # Grab the desired table
    if table is None:
        raise ValueError(f"No table with class 'records-table' found in {data!r}")

    # Column headers as a list
    columns = [header.text.strip() for header in table.find_all('th')]

    # Collect every data row first and build the frame once; growing a
    # DataFrame one row at a time re-allocates on each insert
    rows = []
    for table_row in table.find_all('tr')[1:]:  # first <tr> is the header row
        cells = [cell.text.strip() for cell in table_row.find_all('td')]
        if cells:  # ignore rows that carry no <td> cells
            rows.append(cells)

    return pd.DataFrame(rows, columns=columns)

In [8]:
# Parse each saved HTML file into its own dataframe
men_200_df, men_110_mh_df, men_400_m_df, men_400_mh_df = (
    form_df(html_name)
    for html_name in (
        'sprints-200-metres.html',
        'hurdles-110-metres-hurdles.html',
        'sprints-400-metres.html',
        'hurdles-400-metres-hurdles.html',
    )
)

In [9]:
# Preview the first five rows of the 200 m dataframe
men_200_df.head(n=5)

Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Nat,Pos,Unnamed: 8,Venue,Date,Results Score
0,1,19.76,0.6,Udodi Chudi ONWUZURIKE,29 JAN 2003,NGR,1sf2,,"Mike A. Myers Stadium, Austin, TX (USA)",07 JUN 2023,1258
1,2,19.84,0.9,Udodi Chudi ONWUZURIKE,29 JAN 2003,NGR,1,,"Mike A. Myers Stadium, Austin, TX (USA)",09 JUN 2023,1245
2,3,19.91,0.9,Udodi Chudi ONWUZURIKE,29 JAN 2003,NGR,1,,"Hilmer Lodge Stadium, Walnut, CA (USA)",14 MAY 2023,1234
3,4,20.07,0.4,Udodi Chudi ONWUZURIKE,29 JAN 2003,NGR,1h1,,"Hornet Stadium - Sac St., Sacramento, CA (USA)",26 MAY 2023,1209
4,5,20.14,1.8,Udodi Chudi ONWUZURIKE,29 JAN 2003,NGR,1f3,,"Hilmer Lodge Stadium, Walnut, CA (USA)",15 APR 2023,1198


In [10]:
# Preview the first five rows of the 110 m hurdles dataframe
men_110_mh_df.head(n=5)

Unnamed: 0,Rank,Mark,WIND,Competitor,DOB,Nat,Pos,Unnamed: 8,Venue,Date,Results Score
0,1,13.5,0.7,Stephen ELOJI,,NGR,1,,"Soccer Stadium - North Texas University, Dento...",14 MAY 2023,1158
1,2,13.61,1.7,Bashiru ABDILLAHI,26 OCT 1997,NGR,5h2,,"Hornet Stadium - Sac St., Sacramento, CA (USA)",26 MAY 2023,1138
2,3,13.66,0.1,Prosper Oghenemine EKPORERE,03 NOV 2002,NGR,1,,"Samuel Ogbemudia Stadium, Benin City (NGR)",06 JUL 2023,1128
3,4,13.68,1.1,Bashiru ABDILLAHI,26 OCT 1997,NGR,5sf2,,"Mike A. Myers Stadium, Austin, TX (USA)",07 JUN 2023,1125
4,5,13.71,0.7,Bashiru ABDILLAHI,26 OCT 1997,NGR,2,,"Soccer Stadium - North Texas University, Dento...",14 MAY 2023,1119


In [11]:
# Preview the first five rows of the 400 m dataframe
men_400_m_df.head(n=5)

Unnamed: 0,Rank,Mark,Competitor,DOB,Nat,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,44.24,Emmanuel BAMIDELE,06 JUL 1999,NGR,1,,"Mike A. Myers Stadium, Austin, TX (USA)",09 JUN 2023,1233
1,2,44.67,Emmanuel BAMIDELE,06 JUL 1999,NGR,2sf3,,"Mike A. Myers Stadium, Austin, TX (USA)",07 JUN 2023,1203
2,3,44.71,Emmanuel BAMIDELE,06 JUL 1999,NGR,4,,"LSU Bernie Moore Stadium, Baton Rouge, LA (USA)",13 MAY 2023,1200
3,4,44.79,Emmanuel BAMIDELE,06 JUL 1999,NGR,1f2,,"Stadion Wankdorf, Bern (SUI)",04 AUG 2023,1194
4,5,44.8,Emmanuel BAMIDELE,06 JUL 1999,NGR,1h1,,"LSU Bernie Moore Stadium, Baton Rouge, LA (USA)",12 MAY 2023,1194


In [12]:
# Preview the first five rows of the 400 m hurdles dataframe
men_400_mh_df.head(n=5)

Unnamed: 0,Rank,Mark,Competitor,DOB,Nat,Pos,Unnamed: 7,Venue,Date,Results Score
0,1,48.47,Ezekiel NATHANIEL,20 JUN 2003,NGR,4h4,,"Nemzeti Atlétikai Központ, Budapest (HUN)",20 AUG 2023,1207
1,2,48.52,Ezekiel NATHANIEL,20 JUN 2003,NGR,2,,"John Jacobs Track Complex, Norman, OK (USA)",14 MAY 2023,1205
2,3,48.54,Ezekiel NATHANIEL,20 JUN 2003,NGR,3,,"Mike A. Myers Stadium, Austin, TX (USA)",09 JUN 2023,1204
3,4,48.55,Ezekiel NATHANIEL,20 JUN 2003,NGR,2,,"Wolfe Track & Field Complex, Memphis, TN (USA)",04 AUG 2023,1203
4,5,48.74,Ezekiel NATHANIEL,20 JUN 2003,NGR,1h1,,"John Jacobs Track Complex, Norman, OK (USA)",13 MAY 2023,1193


In [13]:
# Write each dataframe to its own CSV file, leaving out the index column
for csv_name, event_df in {
    'men_200.csv': men_200_df,
    'men_110_mh.csv': men_110_mh_df,
    'men_400_m.csv': men_400_m_df,
    'men_400_mh.csv': men_400_mh_df,
}.items():
    event_df.to_csv(csv_name, index=False)