In [173]:
import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [174]:
# Download the phones/touch listing of the webscraper.io demo shop and parse it.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    'https://webscraper.io/test-sites/e-commerce/allinone/phones/touch',
    timeout=30,
)
# Stop on 4xx/5xx responses instead of silently parsing an error page.
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, 'lxml')

In [175]:
# Collect every text node on the page that is exactly 'Iphone'
# (one list entry per occurrence).
iphone_mentions = soup.find_all(string='Iphone')
iphone_mentions

['Iphone', 'Iphone', 'Iphone']

In [176]:
# Find every text node that contains a dollar sign.
# '$' is a regex metacharacter (end-of-string anchor), so it is escaped.
soup.find_all(string=re.compile(r'\$'))

['$24.99',
 '$57.99',
 '$93.99',
 '$109.99',
 '$118.99',
 '$499.99',
 '$899.99',
 '$899.99',
 '$899.99']

In [177]:
# Price and review tags on this page use a 'pull-...' class, so show the
# first five <p> tags whose class contains the word 'pull'.
pull_class = re.compile('pull')
soup.find_all('p', class_=pull_class, limit=5)

[<p class="pull-right">11 reviews</p>,
 <p class="pull-right">11 reviews</p>,
 <p class="pull-right">3 reviews</p>,
 <p class="pull-right">4 reviews</p>,
 <p class="pull-right">6 reviews</p>]

In [178]:
# Map every product name to its listed price.
# Each search runs once here; running them inside the loop would re-scan the
# whole document twice per product.
title_tags = soup.find_all(class_='title')
price_tags = soup.find_all(class_='pull-right price')
product_number = len(title_tags)

# Pairing is positional, so every title needs a price at the same index.
assert len(price_tags) >= product_number, "fewer price tags than product titles"

# Products sharing a name collapse into one key (the last price wins).
product_price = {
    title.string: price.string
    for title, price in zip(title_tags, price_tags)
}

product_price

{'Nokia 123': '$24.99',
 'LG Optimus': '$57.99',
 'Samsung Galaxy': '$93.99',
 'Nokia X': '$109.99',
 'Sony Xperia': '$118.99',
 'Ubuntu Edge': '$499.99',
 'Iphone': '$899.99'}

# MarketWatch

First, pull the current price and the previous closing price for TWTR

In [179]:
# Download the MarketWatch quote page for TWTR and parse it.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    'https://www.marketwatch.com/investing/stock/twtr?mod=search_symbol',
    timeout=30,
)
# Stop on 4xx/5xx responses instead of silently parsing an error page.
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, 'lxml')

In [180]:
# Read the value reported as the current price and the previous close.
# The names deliberately avoid `open`, which would shadow Python's built-in
# open() for the rest of the kernel session.
quote_channel = "/zigman2/quotes/203180645/composite,/zigman2/quotes/203180645/lastsale"

# The second <bg-quote> element on this channel holds the price text.
current_price = soup.find_all('bg-quote', {'channel': quote_channel})[1].string
# The first 'table__cell u-semi' cell holds the previous close (already prefixed with '$').
previous_close = soup.find_all(class_='table__cell u-semi')[0].string
print(f"TWTR has current price ${current_price} and its previous close price was {previous_close}")

TWTR has current price $37.45 and its previous close price was $37.22 


In [181]:
# TWTR's daily and yearly price ranges.
range_headers = soup.find_all(class_='range__header')

# Iterating a tag walks its direct children; keep the .string of each one.
# Index 1 is the daily block and index 2 the yearly block on this page.
daily_range = [child.string for child in range_headers[1]]
yearly_range = [child.string for child in range_headers[2]]

# The low value sits at position 1 and the high value at position -2 of each list.
print(f"TWTR's daily price range is between {daily_range[1]} and {daily_range[-2]} while these values for a yearly scope are {yearly_range[1]} and {yearly_range[-2]}  ")

TWTR's daily price range is between 37.19 and 38.87 while these values for a yearly scope are 31.30 and 73.34  


Lastly find the analyst ratings for TWTR

In [182]:
# The highlighted analyst option carries both the 'analyst__option' and
# 'active' classes; its text is the recommendation.
active_option = soup.find(class_='analyst__option active')
rating = active_option.string
print(f'Analysts recommend to {rating.lower()} TWTR')

Analysts recommend to hold TWTR


# Scrape a table - Worldometers <br>
Create a complete historical population table

In [183]:
# Download the Worldometers world-population page and parse it.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    'https://www.worldometers.info/world-population/',
    timeout=30,
)
# Stop on 4xx/5xx responses instead of silently parsing an error page.
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, 'lxml')

In [184]:
# Flatten the first population table into one list of cell values, in
# document order (row after row, left to right).
# NOTE: the name `all` shadows the built-in all(); it is kept because the
# cell that builds the DataFrame reads the list under this name.
population_table = soup.find_all(class_='table table-striped table-bordered table-hover table-condensed table-list')[0]

# Search the table's descendants directly instead of looping over its direct
# children: a child may be a bare text node, which has no find_all().
all = [cell.string for cell in population_table.find_all('td')]


In [185]:
# Assemble the population DataFrame from the flat list of cell values.
N_COLUMNS = 9  # cells per table row; the flat list repeats the columns row after row

# Look the header cells up once instead of re-searching the page per column.
header_cells = soup.find_all('th')

# Use only complete rows, so the length follows the scraped data rather than
# a hard-coded cell count.
n_cells = len(all) - len(all) % N_COLUMNS

df = pd.DataFrame()
for k in range(N_COLUMNS):
    col_name = header_cells[k].text
    # Column k is every N_COLUMNS-th cell, starting at offset k.
    df[col_name] = all[k:n_cells:N_COLUMNS]

In [186]:
# Display the assembled population table.
df

Unnamed: 0,Year (July 1),Population,Yearly % Change,Yearly Change,Median Age,Fertility Rate,Density (P/Km²),Urban Pop %,Urban Population
0,2020,7794798739,1.05 %,81330639,30.9,2.47,52,56.2 %,4378993944
1,2019,7713468100,1.08 %,82377060,29.8,2.51,52,55.7 %,4299438618
2,2018,7631091040,1.10 %,83232115,29.8,2.51,51,55.3 %,4219817318
3,2017,7547858925,1.12 %,83836876,29.8,2.51,51,54.9 %,4140188594
4,2016,7464022049,1.14 %,84224910,29.8,2.51,50,54.4 %,4060652683
5,2015,7379797139,1.19 %,84594707,30.0,2.52,50,54.0 %,3981497663
6,2010,6956823603,1.24 %,82983315,28.0,2.58,47,51.7 %,3594868146
7,2005,6541907027,1.26 %,79682641,27.0,2.65,44,49.2 %,3215905863
8,2000,6143493823,1.35 %,79856169,26.0,2.78,41,46.7 %,2868307513
9,1995,5744212979,1.52 %,83396384,25.0,3.01,39,44.8 %,2575505235


Alternatively, walk through the table row by row:

In [187]:
# Alternative extraction: read the same table one <tr> at a time.
table = soup.find_all('table', class_='table table-striped table-bordered table-hover table-condensed table-list')[0]

# Accumulate every row; reassigning `data` alone would leave only the final
# row available after the loop.
population_rows = []
for item in table.find_all('tr')[1:]:  # skip the first <tr> (presumably the header row)
    row = item.find_all('td')
    data = [td.text for td in row]
    population_rows.append(data)

Save the dataset as a csv file

In [188]:
# Write the population table to disk as CSV (the index is written as the first column).
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
csv_path = r'C:\Users\Lenovo\Desktop\datasets\world population.csv'
df.to_csv(csv_path)

# Sport Data
Scrape NFL stats throughout the league

In [189]:
# Download the NFL player-stats page and parse it.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    'https://www.nfl.com/stats/player-stats/',
    timeout=30,
)
# Stop on 4xx/5xx responses instead of silently parsing an error page.
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, 'lxml')

In [190]:
# Grab the first element carrying the 'd3-o-table--horizontal-scroll' class;
# the following cells read the stats table's <th> and <tr> tags from it.
table = soup.find(class_='d3-o-table--horizontal-scroll')

In [191]:
# Column names: the .string of every <th> inside the stats table, in order.
header = [th.string for th in table.find_all('th')]

In [192]:
# Start an empty frame whose columns are the scraped header names.
# Note: this reuses (and replaces) the `df` name used for the population table above.
df = pd.DataFrame(columns = header)

In [193]:
# Fill the frame with one record per table row.
# Rows are collected first and the frame is built once: growing it with
# df.loc[len(df)] is slow and appends duplicate rows if this cell is re-run.
records = [
    [el.text for el in item.find_all('td')]
    for item in table.find_all('tr')[1:]  # skip the first <tr> (presumably the header row)
]
df = pd.DataFrame(records, columns=df.columns)

In [194]:
# Display the scraped player statistics.
df

Unnamed: 0,Player,Pass Yds,Yds/Att,Att,Cmp,Cmp %,TD,INT,Rate,1st,1st%,20+,40+,Lng,Sck,SckY
0,Tom Brady,5316,7.4,719,485,0.675,43,12,102.1,269,0.374,75,10,62,22,144
1,Justin Herbert,5014,7.5,672,443,0.659,38,15,97.7,256,0.381,53,15,72,31,214
2,Matthew Stafford,4886,8.1,601,404,0.672,41,17,102.9,233,0.388,65,18,79,30,243
3,Patrick Mahomes,4839,7.4,658,436,0.663,37,13,98.5,260,0.395,58,11,75,28,146
4,Derek Carr,4804,7.7,626,428,0.684,23,14,94.0,217,0.347,67,10,61,40,241
5,Joe Burrow,4611,8.9,520,366,0.704,34,14,108.3,202,0.388,60,15,82,51,370
6,Dak Prescott,4449,7.5,596,410,0.688,37,10,104.2,227,0.381,55,7,51,30,144
7,Josh Allen,4407,6.8,646,409,0.633,36,15,92.2,234,0.362,51,8,61,26,164
8,Kirk Cousins,4221,7.5,561,372,0.663,33,7,103.1,192,0.342,60,10,64,28,197
9,Aaron Rodgers,4115,7.7,531,366,0.689,37,4,111.9,213,0.401,55,10,75,30,188
