Web Scraping Data from a Real Website + Pandas 

In [202]:
from bs4 import BeautifulSoup
import requests

In [204]:
url = 'https://www.hubertiming.com/results/2017GPTR10K'

page = requests.get(url)

soup = BeautifulSoup(page.text, 'html')

In [205]:
table = soup.find_all('table')[1]

In [206]:
soup.find_all('th')

[<th>Place</th>,
 <th>Bib</th>,
 <th>Name</th>,
 <th>Gender</th>,
 <th>City</th>,
 <th>State</th>,
 <th>Time</th>,
 <th>Gun Time</th>,
 <th>Team</th>]

In [207]:
titles = soup.find_all('th')

In [208]:
table_titles = [title.text for title in titles]
table_titles

['Place', 'Bib', 'Name', 'Gender', 'City', 'State', 'Time', 'Gun Time', 'Team']

In [209]:
import pandas as pd

In [211]:
df = pd.DataFrame(columns = table_titles)
df

Unnamed: 0,Place,Bib,Name,Gender,City,State,Time,Gun Time,Team


In [218]:
column_data = table.find_all('tr')

In [220]:
for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_row_data = [data.text.strip() for data in row_data]

    length = len(df)
    df.loc[length] = individual_row_data

In [222]:
df

Unnamed: 0,Place,Bib,Name,Gender,City,State,Time,Gun Time,Team
0,1,814,JARED WILSON,M,TIGARD,OR,36:21,36:24,
1,2,573,NATHAN A SUSTERSIC,M,PORTLAND,OR,36:42,36:45,INTEL TEAM F
2,3,687,FRANCISCO MAYA,M,PORTLAND,OR,37:44,37:48,
3,4,623,PAUL MORROW,M,BEAVERTON,OR,38:34,38:37,
4,5,569,DEREK G OSBORNE,M,HILLSBORO,OR,39:21,39:24,INTEL TEAM F
...,...,...,...,...,...,...,...,...,...
572,573,273,RACHEL L VANEY,F,OTHER,OR,1:38:17,1:38:34,
573,574,467,ROHIT B DSOUZA,M,PORTLAND,OR,1:38:31,1:40:32,INTEL TEAM I
574,575,471,CENITA D'SOUZA,F,PORTLAND,OR,1:38:32,1:40:34,
575,576,338,PRANAVI APPANA,F,HILLSBORO,OR,1:40:47,1:42:01,


In [224]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 577 entries, 0 to 576
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Place     577 non-null    object
 1   Bib       577 non-null    object
 2   Name      577 non-null    object
 3   Gender    577 non-null    object
 4   City      577 non-null    object
 5   State     577 non-null    object
 6   Time      577 non-null    object
 7   Gun Time  577 non-null    object
 8   Team      577 non-null    object
dtypes: object(9)
memory usage: 45.1+ KB


In [226]:
df['Place'] = df['Place'].astype('int64')
df['Bib'] = df['Bib'].astype('int64')
df['Time'].astype(str)
df['Gun Time'].astype(str)

0        36:24
1        36:45
2        37:48
3        38:37
4        39:24
        ...   
572    1:38:34
573    1:40:32
574    1:40:34
575    1:42:01
576    1:42:10
Name: Gun Time, Length: 577, dtype: object

In [228]:
# Filter out rows that are not in the MM:SS format
df = df[df['Time'].str.match(r'^\d{1,2}:\d{2}$')]
df['Time'] = pd.to_timedelta(df['Time'].str.zfill(5).apply(lambda x: f"00:{x}"))
df = df[df['Gun Time'].str.match(r'^\d{1,2}:\d{2}$')]
df['Gun Time'] = pd.to_timedelta(df['Gun Time'].str.zfill(5).apply(lambda x: f"00:{x}"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Time'] = pd.to_timedelta(df['Time'].str.zfill(5).apply(lambda x: f"00:{x}"))


In [230]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 289 entries, 0 to 299
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype          
---  ------    --------------  -----          
 0   Place     289 non-null    int64          
 1   Bib       289 non-null    int64          
 2   Name      289 non-null    object         
 3   Gender    289 non-null    object         
 4   City      289 non-null    object         
 5   State     289 non-null    object         
 6   Time      289 non-null    timedelta64[ns]
 7   Gun Time  289 non-null    timedelta64[ns]
 8   Team      289 non-null    object         
dtypes: int64(2), object(5), timedelta64[ns](2)
memory usage: 22.6+ KB


In [232]:
df.head()

Unnamed: 0,Place,Bib,Name,Gender,City,State,Time,Gun Time,Team
0,1,814,JARED WILSON,M,TIGARD,OR,0 days 00:36:21,0 days 00:36:24,
1,2,573,NATHAN A SUSTERSIC,M,PORTLAND,OR,0 days 00:36:42,0 days 00:36:45,INTEL TEAM F
2,3,687,FRANCISCO MAYA,M,PORTLAND,OR,0 days 00:37:44,0 days 00:37:48,
3,4,623,PAUL MORROW,M,BEAVERTON,OR,0 days 00:38:34,0 days 00:38:37,
4,5,569,DEREK G OSBORNE,M,HILLSBORO,OR,0 days 00:39:21,0 days 00:39:24,INTEL TEAM F
