In [271]:
## Web scraping to CSV file

In [272]:
import requests
import sqlalchemy
import getpass
import pandas as pd
from bs4 import BeautifulSoup

In [273]:
# - Define the target URL and the CSV output path for later cells
# - Create the empty DataFrame that the scraping loop appends its rows to
# NOTE(review): the third column is spelled "Maximum speed" here, while the
#   scraping loop writes the key "Maximum Speed"; the mismatch leaves this
#   column empty and adds a second one, which a later cell has to drop.
# NOTE(review): csv_path is an absolute path on one machine - adjust it before
#   running the notebook elsewhere.

url = 'https://en.wikipedia.org/wiki/Fastest_animals'
csv_path = r'C:\Users\vaino\Desktop\Python_jupyter\Fastest_Animals.csv'
df = pd.DataFrame(columns=["Rank", "Animal", "Maximum speed", "Class", "Notes"])

In [274]:
# - Create variables to hold the credentials for the database connection
# - The password is prompted for at run time so it is never stored in the notebook

table_name = 'fastest_animals'
db_user = 'postgres'
db_password = getpass.getpass("Enter Password")
db_host = 'localhost'
db_port = '5432'
db_name = 'postgres'

Enter Password ········


In [275]:
##html_page = requests.get(url).text
#data = BeautifulSoup(html_page, 'html.parser')
#tables = data.find_all('tbody')
#rows = tables[0].find_all('tr')

In [276]:
# - Load the essentials from the webpage
# - Fail early with a clear error if the download or the table lookup goes wrong

response = requests.get(url, timeout=30)  # never hang forever on a stalled connection
response.raise_for_status()  # stop on HTTP errors instead of parsing an error page
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    raise ValueError(f"No table with class 'wikitable' found at {url}")
rows = table.find_all('tr')[1:]  # drop the first <tr> (presumably the header row)

In [277]:
# - Extract the data row by row
# - Collect plain dicts first and build the DataFrame once; calling pd.concat
#   inside the loop would copy the whole frame on every iteration

records = []
for row in rows:
    cells = row.find_all('td')
    if not cells:
        # Stop at the first row that has no <td> cells
        break
    records.append({"Rank": cells[0].get_text(strip=True),
                    "Animal": cells[1].get_text(strip=True),
                    "Maximum Speed": cells[2].get_text(strip=True),
                    "Class": cells[3].get_text(strip=True),
                    "Notes": cells[4].get_text(strip=True)})

# Append everything in a single step; nothing to do when no rows were scraped
if records:
    df = pd.concat([df, pd.DataFrame(records)], ignore_index=True)

In [278]:
# - Check if the extraction is successful
# - "Maximum speed" is expected to be empty here: the scraped values are
#   stored under the differently spelled "Maximum Speed" column

df

Unnamed: 0,Rank,Animal,Maximum speed,Class,Notes,Maximum Speed
0,1,Peregrine falcon,,Flight-diving,The peregrine falcon is the fastest aerial ani...,389 km/h (242 mph)108 m/s (354 ft/s)[1][7]
1,2,Golden eagle,,Flight-diving,"Assuming the maximum size at 1.02 m, its relat...",240–320 km/h (150–200 mph)67–89 m/s (220–293 f...
2,3,White-throated needletail swift,,Flight,,169 km/h (105 mph)[9][10][11]
3,4,Eurasian hobby,,Flight,Can sometimes outfly theswift.,160 km/h (100 mph)[12]
4,5,Mexican free-tailed bat,,Flight,It has been claimed to have the fastest horizo...,160 km/h (100 mph)[13]
5,6,Frigatebird,,Flight,The frigatebird's high speed is helped by its ...,153 km/h (95 mph)
6,7,Rock dove(pigeon),,Flight,Pigeons have been clocked flying 92.5 mph (148...,148.9 km/h (92.5 mph)[14]
7,8,Spur-winged goose,,Flight,,142 km/h (88 mph)[15]
8,9,Gyrfalcon,,Flight,,128 km/h (80 mph)[citation needed]
9,10,Grey-headed albatross,,Flight,,127 km/h (79 mph)[16][17][note 1]


In [279]:
# - Drop the empty "Maximum speed" column left over from the initial frame
#   (the scraped values live in "Maximum Speed")
# - errors='ignore' lets this cell be re-run after the column is already gone

df = df.drop(columns='Maximum speed', errors='ignore')

In [280]:
# - Check that the "Maximum speed" column is gone
df

Unnamed: 0,Rank,Animal,Class,Notes,Maximum Speed
0,1,Peregrine falcon,Flight-diving,The peregrine falcon is the fastest aerial ani...,389 km/h (242 mph)108 m/s (354 ft/s)[1][7]
1,2,Golden eagle,Flight-diving,"Assuming the maximum size at 1.02 m, its relat...",240–320 km/h (150–200 mph)67–89 m/s (220–293 f...
2,3,White-throated needletail swift,Flight,,169 km/h (105 mph)[9][10][11]
3,4,Eurasian hobby,Flight,Can sometimes outfly theswift.,160 km/h (100 mph)[12]
4,5,Mexican free-tailed bat,Flight,It has been claimed to have the fastest horizo...,160 km/h (100 mph)[13]
5,6,Frigatebird,Flight,The frigatebird's high speed is helped by its ...,153 km/h (95 mph)
6,7,Rock dove(pigeon),Flight,Pigeons have been clocked flying 92.5 mph (148...,148.9 km/h (92.5 mph)[14]
7,8,Spur-winged goose,Flight,,142 km/h (88 mph)[15]
8,9,Gyrfalcon,Flight,,128 km/h (80 mph)[citation needed]
9,10,Grey-headed albatross,Flight,,127 km/h (79 mph)[16][17][note 1]


In [281]:
# - We will only be interested in the Maximum Speed in km/h
# - Keep the first two whitespace-separated tokens of each text value as a list
# - Values that are not strings (e.g. already split on a re-run, or missing)
#   are left untouched instead of being turned into NaN

df["Maximum Speed"] = df["Maximum Speed"].apply(
    lambda speed: speed.split()[:2] if isinstance(speed, str) else speed
)

In [282]:
# - Format the Maximum Speed values as "<value> <unit>" text
# - Citation markers such as "[12]" that are glued to a token are cut off

def _format_speed(value):
    """Return the first two speed tokens joined by a single space.

    Accepts a list/tuple of tokens or a plain string (which is split on
    whitespace first), so re-running the cell does not corrupt the column.
    Any other value, e.g. a missing value, is returned unchanged.
    """
    if isinstance(value, str):
        tokens = value.split()
    elif isinstance(value, (list, tuple)):
        tokens = value
    else:
        return value
    # Everything from the first '[' onwards in a token is treated as a
    # footnote reference and removed
    cleaned = [token.split('[', 1)[0] for token in tokens[:2]]
    return ' '.join(part for part in cleaned if part)


df['Maximum Speed'] = df['Maximum Speed'].apply(_format_speed)

In [283]:
# - Check the formatted Maximum Speed values
df

Unnamed: 0,Rank,Animal,Class,Notes,Maximum Speed
0,1,Peregrine falcon,Flight-diving,The peregrine falcon is the fastest aerial ani...,389 km/h
1,2,Golden eagle,Flight-diving,"Assuming the maximum size at 1.02 m, its relat...",240–320 km/h
2,3,White-throated needletail swift,Flight,,169 km/h
3,4,Eurasian hobby,Flight,Can sometimes outfly theswift.,160 km/h
4,5,Mexican free-tailed bat,Flight,It has been claimed to have the fastest horizo...,160 km/h
5,6,Frigatebird,Flight,The frigatebird's high speed is helped by its ...,153 km/h
6,7,Rock dove(pigeon),Flight,Pigeons have been clocked flying 92.5 mph (148...,148.9 km/h
7,8,Spur-winged goose,Flight,,142 km/h
8,9,Gyrfalcon,Flight,,128 km/h
9,10,Grey-headed albatross,Flight,,127 km/h


In [284]:
# - Create engine for database connection
# - Build the URL with URL.create so special characters in the password
#   (e.g. '@', '/', ':') are escaped instead of breaking the connection string
# - Dispose the engine even when the upload fails

credentials = sqlalchemy.engine.URL.create(
    'postgresql',
    username=db_user,
    password=db_password,
    host=db_host,
    port=int(db_port),
    database=db_name,
)
engine = sqlalchemy.create_engine(credentials)

try:
    # Replace any existing table; the row index is not written as a column
    df.to_sql(table_name, engine, if_exists='replace', index=False)
finally:
    engine.dispose()

In [285]:
# - Write dataframe into CSV file
# - index=False keeps the row index out of the file, matching the database export

df.to_csv(csv_path, index=False)