### Libraries

In [1]:
import pandas as pd

In [2]:
import requests
from bs4 import BeautifulSoup

### Connect with URL

In [3]:
url= "https://en.wikipedia.org/wiki/2024_MotoGP_World_Championship"
headers= {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

response= requests.get(url, headers= headers)
print(f"Status Code: {response.status_code}")
print(f"Page size: {len(response.text)} characters")

if response.status_code== 200:
    print("SUCCESS! Wikipedia responded!")
else:
    print(f"Error: {response.status_code}")


Status Code: 200
Page size: 580675 characters
SUCCESS! Wikipedia responded!


### Fetch Tables

In [4]:
soup= BeautifulSoup(response.text, 'html.parser')

# Find all tables with class 'wikitable'
tables= soup.find_all('table', {'class': 'wikitable'})
print(f"Found {len(tables)} tables on the page")


for i, table in enumerate(tables[:3]):
    print(f"\n--- Table {i+1} ---")
    # Get first row to see column headers
    first_row= table.find('tr')
    headers= [th.get_text(strip=True) for th in first_row.find_all('th')]
    print(f"Headers: {headers[:6]}")

Found 9 tables on the page

--- Table 1 ---
Headers: ['Team', 'Constructor', 'Motorcycle', 'No.', 'Rider', 'Rounds']

--- Table 2 ---
Headers: ['Key']

--- Table 3 ---
Headers: ['Round', 'Date', 'Grand Prix', 'Circuit']


In [5]:
race_table= tables[2]

first_row= race_table.find('tr')
all_headers= [th.get_text(strip= True) for th in first_row.find_all('th')]
print(f"All headers: {all_headers}")
print()

race_data= []
rows= race_table.find_all('tr')[1:]

for row in rows:
    cols= row.find_all(['td', 'th'])
    row_data= [col.get_text(strip= True) for col in cols]
    
    if len(row_data) >= 4:
        race_data.append(row_data)

print(f"Extracted {len(race_data)} races!")
print("\nFirst 3 races:")
for i, race in enumerate(race_data[:3], 1):
    print(f"{i}. {race[:5]}") 

All headers: ['Round', 'Date', 'Grand Prix', 'Circuit']

Extracted 23 races!

First 3 races:
1. ['1', '10 March', 'Qatar Airways Grand Prix of Qatar', 'Lusail International Circuit,Lusail']
2. ['2', '24 March', 'Grande Prémio Tissot de Portugal', 'Algarve International Circuit,Portimão']
3. ['3', '14 April', 'Red Bull Grand Prix of the Americas', 'Circuit of the Americas,Austin']


In [6]:
df= pd.DataFrame(race_data)
df

Unnamed: 0,0,1,2,3
0,1,10 March,Qatar Airways Grand Prix of Qatar,"Lusail International Circuit,Lusail"
1,2,24 March,Grande Prémio Tissot de Portugal,"Algarve International Circuit,Portimão"
2,3,14 April,Red Bull Grand Prix of the Americas,"Circuit of the Americas,Austin"
3,4,28 April,"Gran Premio Estrella Galicia 0,0 de España","Circuito de Jerez – Ángel Nieto,Jerez de la Fr..."
4,5,12 May,Michelin Grand Prix de France,"Bugatti Circuit,Le Mans"
5,6,26 May,Gran Premi Monster Energy de Catalunya,"Circuit de Barcelona-Catalunya,Montmeló"
6,7,2 June,Gran Premio d'Italia Brembo,"Autodromo Internazionale del Mugello,Scarperia..."
7,8,30 June,Motul TT Assen,"TT Circuit Assen,Assen"
8,9,7 July,Liqui Moly Motorrad Grand Prix Deutschland,"Sachsenring,Hohenstein-Ernstthal"
9,10,4 August,Monster Energy British Grand Prix,"Silverstone Circuit,Silverstone"


### Find and Fetch the table with Winners

In [7]:
for i, table in enumerate(tables[:7]): 
    first_row= table.find('tr')
    headers= [th.get_text(strip= True) for th in first_row.find_all('th')]
    
    if any(word in ' '.join(headers).lower() for word in ['winner', 'pole', '1st', 'podium']):
        print(f"Table {i+1} - HAS WINNER INFO!")
        print(f"Headers: {headers[:8]}")
        print()

Table 4 - HAS WINNER INFO!
Headers: ['Round', 'Grand Prix', 'Pole position', 'Fastest lap', 'Winning rider', 'Winning team', 'Winning constructor', 'Report']



In [8]:
winners_table= tables[3]

first_row= winners_table.find('tr')
winner_headers= [th.get_text(strip=True) for th in first_row.find_all('th')]
print(f"All headers: {winner_headers}\n")

winner_data = []
rows= winners_table.find_all('tr')[1:]

for row in rows:
    cols= row.find_all(['td', 'th'])
    row_data= [col.get_text(strip=True) for col in cols]
    
    if len(row_data) >= 5:
        winner_data.append(row_data)

print(f"Extracted {len(winner_data)} race results!")

df_winners= pd.DataFrame(winner_data, columns= winner_headers)

print(f"\nDataFrame shape: {df_winners.shape}")
print("\nFirst 5 races with winners:")
print(df_winners[['Round', 'Grand Prix', 'Winning rider']].head())

All headers: ['Round', 'Grand Prix', 'Pole position', 'Fastest lap', 'Winning rider', 'Winning team', 'Winning constructor', 'Report']

Extracted 20 race results!

DataFrame shape: (20, 8)

First 5 races with winners:
  Round                             Grand Prix      Winning rider
0     1            Qatar motorcycle Grand Prix  Francesco Bagnaia
1     2       Portuguese motorcycle Grand Prix       Jorge Martín
2     3  Motorcycle Grand Prix of the Americas   Maverick Viñales
3     4          Spanish motorcycle Grand Prix  Francesco Bagnaia
4     5           French motorcycle Grand Prix       Jorge Martín


### Merge the 2 Tables

In [10]:
df.columns= ['Round', 'Date', 'Grand Prix', 'Circuit']

df_complete= df.merge(
                      df_winners[['Round', 'Winning rider', 'Pole position', 'Fastest lap']], 
                        on='Round',
                      how='left'
                     )

print(f"Merged DataFrame: {df_complete.shape}")
print("\nFirst 5 complete race records:")
print(df_complete.head())

output_file= '../004_data/motogp_2024_results.csv'
df_complete.to_csv(output_file, index=False)
print(f"\nData saved to: {output_file}")
print(f"Scraping complete! {len(df_complete)} races saved!")

Merged DataFrame: (23, 7)

First 5 complete race records:
  Round      Date                                  Grand Prix  \
0     1  10 March           Qatar Airways Grand Prix of Qatar   
1     2  24 March            Grande Prémio Tissot de Portugal   
2     3  14 April         Red Bull Grand Prix of the Americas   
3     4  28 April  Gran Premio Estrella Galicia 0,0 de España   
4     5    12 May               Michelin Grand Prix de France   

                                             Circuit      Winning rider  \
0                Lusail International Circuit,Lusail  Francesco Bagnaia   
1             Algarve International Circuit,Portimão       Jorge Martín   
2                     Circuit of the Americas,Austin   Maverick Viñales   
3  Circuito de Jerez – Ángel Nieto,Jerez de la Fr...  Francesco Bagnaia   
4                            Bugatti Circuit,Le Mans       Jorge Martín   

      Pole position        Fastest lap  
0      Jorge Martín       Pedro Acosta  
1   Enea Bastianin