In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# URL of the Wikipedia page that lists the Tokyo Marathon winners
url = 'https://en.wikipedia.org/wiki/List_of_winners_of_the_Tokyo_Marathon'

In [3]:
# Send a GET request to the URL and store the response.
# The timeout keeps the cell from hanging forever if the server never answers
# (requests applies no timeout by default), and raise_for_status() turns an
# HTTP error such as 403/404 into an exception here instead of letting an
# error page be parsed downstream as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [4]:
# Build a BeautifulSoup tree from the response body with Python's built-in HTML parser
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [5]:
# Find the first table carrying the 'wikitable' class (the winners table)
table = soup.find('table', {'class': 'wikitable'})
# find() returns None when nothing matches; stop here with a clear message
# rather than letting the next cell try to parse the string 'None'.
if table is None:
    raise ValueError(f"No table with class 'wikitable' found at {url}")

In [6]:
# Extract data from the table and convert it to a pandas DataFrame.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated since pandas 2.1; a file-like object is accepted by
# old and new versions alike. read_html returns a list of frames, one per
# <table> found, so [0] selects the single table we passed in.
df = pd.read_html(StringIO(str(table)))[0]

In [7]:
# Inspect the first five rows to confirm the table parsed into the expected columns
df.head()

Unnamed: 0,Year,Men's winner,Country,Time,Women's winner,Country.1,Time.1,Rf.
0,2007,Daniel Njenga,Kenya,2:09:45,Hitomi Niiya,Japan,2:31:01,[1]
1,2008,Viktor Röthlin,Switzerland,2:07:23,Claudia Dreher,Germany,2:35:35,[2][3]
2,2009,Salim Kipsang,Kenya,2:10:27,Mizuho Nasukawa,Japan,2:25:38,[4]
3,2010,Masakazu Fujiwara,Japan,2:12:19,Alevtina Biktimirova,Russia,2:34:39,[5]
4,2011,Hailu Mekonnen,Ethiopia,2:07:35,Noriko Higuchi[a],Japan,2:28:49,[8]


In [8]:
# Drop the 'Rf.' column (Wikipedia footnote references such as '[1]').
# Rebinding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: the in-place version
# raised KeyError on a second execution because the column was already gone.
df = df.drop(columns='Rf.', errors='ignore')

In [9]:
# Preview the frame again to confirm the 'Rf.' column is gone
df.head()

Unnamed: 0,Year,Men's winner,Country,Time,Women's winner,Country.1,Time.1
0,2007,Daniel Njenga,Kenya,2:09:45,Hitomi Niiya,Japan,2:31:01
1,2008,Viktor Röthlin,Switzerland,2:07:23,Claudia Dreher,Germany,2:35:35
2,2009,Salim Kipsang,Kenya,2:10:27,Mizuho Nasukawa,Japan,2:25:38
3,2010,Masakazu Fujiwara,Japan,2:12:19,Alevtina Biktimirova,Russia,2:34:39
4,2011,Hailu Mekonnen,Ethiopia,2:07:35,Noriko Higuchi[a],Japan,2:28:49


In [10]:
# Give the women's Country/Time columns (parsed as 'Country.1' / 'Time.1')
# explicit W_ names so they are not confused with the men's columns
df.rename({'Country.1': 'W_Country', 'Time.1': 'W_Time'}, axis='columns', inplace=True)

In [11]:
# Display the frame to check the renamed W_Country / W_Time columns
df

Unnamed: 0,Year,Men's winner,Country,Time,Women's winner,W_Country,W_Time
0,2007,Daniel Njenga,Kenya,2:09:45,Hitomi Niiya,Japan,2:31:01
1,2008,Viktor Röthlin,Switzerland,2:07:23,Claudia Dreher,Germany,2:35:35
2,2009,Salim Kipsang,Kenya,2:10:27,Mizuho Nasukawa,Japan,2:25:38
3,2010,Masakazu Fujiwara,Japan,2:12:19,Alevtina Biktimirova,Russia,2:34:39
4,2011,Hailu Mekonnen,Ethiopia,2:07:35,Noriko Higuchi[a],Japan,2:28:49
5,2012,Michael Kipyego,Kenya,2:07:37,Atsede Habtamu,Ethiopia,2:25:28
6,2013,Dennis Kimetto,Kenya,2:06:50,Aberu Kebede,Ethiopia,2:25:34
7,2014,Dickson Chumba,Kenya,2:05:42,Tirfi Tsegaye,Ethiopia,2:22:23
8,2015,Endeshaw Negesse,Ethiopia,2:06:00,Birhane Dibaba,Ethiopia,2:23:15
9,2016,Feyisa Lilesa,Ethiopia,2:06:56,Helah Kiprop,Kenya,2:21:27


In [12]:
# Remove footnote labels such as the '[b]' in '2021[b]' from the Year column.
# The brackets are escaped and regex=True is spelled out on purpose: an
# unescaped '[b]' is a regex character class that matches a bare letter 'b'
# (leaving '2021[]'), and the default of `regex` changed from True to False
# in pandas 2.0, so relying on the default gives version-dependent results.
# The pattern strips any bracketed marker, not only '[b]'.
df['Year'] = df['Year'].str.replace(r'\[[^\]]*\]', '', regex=True)

In [13]:
# Display the frame again to verify the Year values after the label removal
df

Unnamed: 0,Year,Men's winner,Country,Time,Women's winner,W_Country,W_Time
0,2007,Daniel Njenga,Kenya,2:09:45,Hitomi Niiya,Japan,2:31:01
1,2008,Viktor Röthlin,Switzerland,2:07:23,Claudia Dreher,Germany,2:35:35
2,2009,Salim Kipsang,Kenya,2:10:27,Mizuho Nasukawa,Japan,2:25:38
3,2010,Masakazu Fujiwara,Japan,2:12:19,Alevtina Biktimirova,Russia,2:34:39
4,2011,Hailu Mekonnen,Ethiopia,2:07:35,Noriko Higuchi[a],Japan,2:28:49
5,2012,Michael Kipyego,Kenya,2:07:37,Atsede Habtamu,Ethiopia,2:25:28
6,2013,Dennis Kimetto,Kenya,2:06:50,Aberu Kebede,Ethiopia,2:25:34
7,2014,Dickson Chumba,Kenya,2:05:42,Tirfi Tsegaye,Ethiopia,2:22:23
8,2015,Endeshaw Negesse,Ethiopia,2:06:00,Birhane Dibaba,Ethiopia,2:23:15
9,2016,Feyisa Lilesa,Ethiopia,2:06:56,Helah Kiprop,Kenya,2:21:27


In [14]:
# Check the column data types (in the saved output every column, including
# Year and both time columns, is object dtype)
df.dtypes

Year              object
Men's winner      object
Country           object
Time              object
Women's winner    object
W_Country         object
W_Time            object
dtype: object

In [15]:
# Save the cleaned table as a CSV file, leaving out the DataFrame's row index
df.to_csv(path_or_buf='tokyo_marathon_winners.csv', index=False)