# Web Scraping using Beautiful Soup

## Install Beautiful Soup

In [1]:
!pip install requests beautifulsoup4 pandas



## Title Extraction

In [21]:
import requests 
from bs4 import BeautifulSoup 
 
response = requests.get("https://www.geeksforgeeks.org/") 
soup = BeautifulSoup(response.content, 'html.parser') 
 
print(soup.title.string)

GeeksforGeeks | A computer science portal for geeks


## Getting HTML File After saving it

In [5]:
'''with open("test.html") as fp: 
    soup = BeautifulSoup(fp, "html.parser") 
print(soup.title.string)'''

'with open("test.html") as fp: \n    soup = BeautifulSoup(fp, "html.parser") \nprint(soup.title.string)'

## Counting Interactions from Youtube

In [11]:
'''interactionCount = soup.find('meta', itemprop="interactionCount") 
print(interactionCount['content'])  
 
datePublished = soup.find('meta', itemprop="datePublished") 
print(datePublished['content']) '''

'interactionCount = soup.find(\'meta\', itemprop="interactionCount") \nprint(interactionCount[\'content\'])  \n \ndatePublished = soup.find(\'meta\', itemprop="datePublished") \nprint(datePublished[\'content\']) '

In [8]:
internalLinks = [ 
a.get('href') for a in soup.find_all('a') 
if a.get('href') and a.get('href').startswith('/')] 
print(internalLinks) 

[]


## Extracting Social Sites from the main website

In [10]:
links = [a.get('href') for a in soup.find_all('a')] 
to_extract = ["facebook.com", "twitter.com", "mailto:"] 
social_links = [] 
for link in links: 
    for social in to_extract: 
        if link and social in link: 
            social_links.append(link) 
print(social_links) 

['mailto:feedback@geeksforgeeks.org', 'https://www.facebook.com/geeksforgeeks.org/', 'https://twitter.com/geeksforgeeks']


## Searching Emails using Regex function

In [13]:
import re
emails = re.findall( 
  r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", 
str(soup)) 
print(emails)

['feedback@geeksforgeeks.org', 'feedback@geeksforgeeks.org']


## Parse Tables Automatically from the link <a href="https://en.wikipedia.org/wiki/List_of_best-selling_albums">Best Selling Albums Wiki</a>

In [22]:
response =requests.get("https://en.wikipedia.org/wiki/List_of_best-selling_albums")
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find("table", class_="sortable") 
output = [] 
for row in table.findAll("tr"): 
    new_row = [] 
    for cell in row.findAll(["td", "th"]): 
        for sup in cell.findAll('sup'): 
            sup.extract() 
        for collapsible in cell.findAll( 
                class_="mw-collapsible-content"): 
            collapsible.extract() 
        new_row.append(cell.get_text().strip()) 
    output.append(new_row) 
print(output) 

[['Artist', 'Album', 'Released', 'Genre', 'Total certified copies(from available markets)*', 'Claimed sales*', 'Ref(s)'], ['Michael Jackson', 'Thriller', '1982', 'Pop, post-disco, funk, rock', '47.3', '70', ''], ['AC/DC', 'Back in Black', '1980', 'Hard rock', '29.6', '50', ''], ['Meat Loaf', 'Bat Out of Hell', '1977', 'Hard rock, glam rock, progressive rock', '21.7', '50', ''], ['Pink Floyd', 'The Dark Side of the Moon', '1973', 'Progressive rock', '24.4', '45', ''], ['Whitney Houston / Various artists', 'The Bodyguard', '1992', 'R&B, soul, pop, soundtrack', '32.4', '45', ''], ['Eagles', 'Their Greatest Hits (1971–1975)', '1976', 'Country rock, soft rock, folk rock', '41.2', '42', ''], ['Bee Gees / Various artists', 'Saturday Night Fever', '1977', 'Disco', '21.6', '40', ''], ['Fleetwood Mac', 'Rumours', '1977', 'Soft rock', '27.9', '40', ''], ['Shania Twain', 'Come On Over', '1997', 'Country, pop', '29.6', '40', '']]


## Parsing Can also be done using pandas

In [23]:
import pandas as pd 
 
table_df = pd.read_html(str(table))[0] 
table_df = table_df.drop('Ref(s)', 1) 
print(table_df.columns) # ['Artist', 'Album', 'Released' ... 
print(table_df.dtypes) # ... Released int64 ... 
print(table_df['Claimed sales*'].sum()) # 422 
print(table_df.loc[3]) 
# Artist

Index(['Artist', 'Album', 'Released', 'Genre',
       'Total certified copies(from available markets)*', 'Claimed sales*'],
      dtype='object')
Artist                                              object
Album                                               object
Released                                             int64
Genre                                               object
Total certified copies(from available markets)*    float64
Claimed sales*                                       int64
dtype: object
422
Artist                                                            Pink Floyd
Album                                              The Dark Side of the Moon
Released                                                                1973
Genre                                                       Progressive rock
Total certified copies(from available markets)*                         24.4
Claimed sales*                                                            45
Name: 3, dtype: obje

## Extract from Metadata instead HTML from <a href="https://www.netflix.com/in/title/80189685">Netflix The Witcher</a>

In [24]:
response =requests.get("https://www.netflix.com/in/title/80189685")
soup = BeautifulSoup(response.content, 'html.parser')
actors = soup.find(class_="item-starring").find( 
class_="title-data-info-item-list") 
print(actors.text.split(',')) 

['Henry Cavill', 'Anya Chalotra', 'Freya Allan']


## Finding Actors

In [25]:
import json 
 
ldJson = soup.find("script", type="application/ld+json") 
parsedJson = json.loads(ldJson.contents[0]) 
print([actor['name'] for actor in parsedJson['actors']]) 

['Henry Cavill', 'Anya Chalotra', 'Freya Allan', 'Jodhi May', 'MyAnna Buring', 'Joey Batey', 'Eamon Farren', 'Mimî M Khayisa', 'Björn Hlynur Haraldsson', 'Adam Levy', 'Lars Mikkelsen', 'Royce Pierreson', 'Wilson Mbomio', 'Anna Shaffer']


## Find Followers of Insta profile

In [27]:
response =requests.get("https://www.instagram.com/sav_bowl_of_change/") ## It is showng error since insta profile is blocked
soup = BeautifulSoup(response.content, 'html.parser')
metaDescription = soup.find("meta", {'name': 'description'}) 
print(metaDescription['content']) 

TypeError: 'NoneType' object is not subscriptable

## Searching Ecommerce Brand

In [29]:
response =requests.get("https://www.spigen.com/collections/tesla/products/tesla-model-3-ta100-sticker?variant=39270568230959")
soup = BeautifulSoup(response.content, 'html.parser')
brand = soup.find('meta', itemprop="brand") 
print(brand['content'])

Tesla


## Average Rating and Review of a certain product from Ecommerce site

In [30]:
import json 
response =requests.get("https://nomz.com/collections/energy-bites/products/gift-box?variant=31459597090948")
soup = BeautifulSoup(response.content, 'html.parser')
ldJson = soup.find("script", type="application/ld+json") 
parsedJson = json.loads(ldJson.contents[0]) 
print(parsedJson["aggregateRating"]["ratingValue"]) # 4.9 
print(parsedJson["aggregateRating"]["reviewCount"]) # 57 
print(parsedJson["weight"]) # 0.492kg -> extra, not visible in UI 

4.9
57
0.492kg


##  Details of the product can be found

In [34]:
response =requests.get("https://maruccisports.com/wood-bats/")
soup = BeautifulSoup(response.content, 'html.parser')
products = [] 
cards = soup.find_all(class_="card") 
for card in cards: 
    products.append({ 
        'id': card.get('data-entity-id'), 
        'name': card.get('data-name'), 
        'category': card.get('data-product-category'), 
        'price': card.get('data-product-price') 
    }) 
print(products) 

[{'id': '2316', 'name': "Freddie Freeman 'FREEMAN5' Pro Model", 'category': 'New Arrivals, Wood Bats, Wood Bats/Pro Model', 'price': '169.99'}, {'id': '2258', 'name': 'Gleyber Torres GLEY25 Pro Model', 'category': 'New Arrivals, Wood Bats, Wood Bats/Pro Model', 'price': '169.99'}, {'id': '2256', 'name': 'Trea Turner TVT Pro Model', 'category': 'New Arrivals, Wood Bats, Wood Bats/Pro Model', 'price': '169.99'}, {'id': '1945', 'name': '6 Bat USA Professional Cut Bundle', 'category': 'Wood Bats, Wood Bats/Professional Cuts', 'price': '579.99'}, {'id': '1804', 'name': 'M-71 Pro Model', 'category': 'Wood Bats, Wood Bats/Pro Model', 'price': '159.99'}, {'id': '914', 'name': 'AP5 Custom Pro Model', 'category': 'Wood Bats, Wood Bats/Custom Pro', 'price': '179.99'}, {'id': '2312', 'name': "Freddie Freeman 'FREEMAN5' Custom Pro Model", 'category': 'Wood Bats, Wood Bats/Custom Pro', 'price': '189.99'}, {'id': '2311', 'name': 'Gleyber Torres GLEY25 Custom Pro Model', 'category': 'Wood Bats, Wood B