# Web Scraping Tutorial using Python
- We fetch the page from the internet
- Parse the page's HTML using BeautifulSoup
- Extract the content, e.g. title, price, rating, etc.
- We can save the extracted information in a pandas DataFrame or as a CSV file

In [4]:
# Install the packages if not already installed
# I have already installed these, so the lines below are commented out
#!pip install bs4
#!pip install requests
#!pip install pandas

In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [7]:
# We need the whole HTML document/page of a website, e.g. flipkart.com here
# First we need the URL and then the request headers
# The headers need our browser's user agent information
# To find your browser's user agent
# go to whatismybrowser.com  --> Detect my settings --> Parse user agent
# Copy and paste that information into the HEADERS object as shown below


In [8]:
# Flipkart search-results page for the query "tablets", one query parameter per line
URL = (
    "https://www.flipkart.com/search"
    "?q=tablets"
    "&otracker=search"
    "&otracker1=search"
    "&marketplace=FLIPKART"
    "&as-show=on"
    "&as=off"
)

In [10]:
# Headers sent with every request: a browser-like User-Agent and the
# preferred response language. Replace the User-Agent with your own.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54"
    ),
    "Accept-Language": "en-US, en;q=0.5",
}

In [11]:
# HTTP GET request for the page at URL, sent with our browser-like headers.
# A timeout is passed because requests waits indefinitely by default, which
# would leave this cell hanging if the server never answers.
webpage = requests.get(URL, headers=HEADERS, timeout=30)

In [12]:
# If we get response as 200 then all is good
webpage

<Response [200]>

In [13]:
# HTML document retrieved
webpage.content

b'<!doctype html><html lang="en"><head><link href="https://rukminim1.flixcart.com" rel="preconnect"/><link rel="stylesheet" href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app_modules.chunk.905c37.css"/><link rel="stylesheet" href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app.chunk.c46047.css"/><meta http-equiv="Content-type" content="text/html; charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta property="fb:page_id" content="102988293558"/><meta property="fb:admins" content="658873552,624500995,100000233612389"/><meta name="robots" content="noodp"/><link rel="shortcut icon" href="https://static-assets-web.flixcart.com/www/promos/new/20150528-140547-favicon-retina.ico"/><link type="application/opensearchdescription+xml" rel="search" href="/osdd.xml?v=2"/><meta property="og:type" content="website"/><meta name="og_site_name" property="og:site_name" content="Flipkart.com"/><link rel="apple-touch-icon" sizes="57x57" 

In [14]:
# but this is in bytes format
type(webpage.content)

bytes

In [15]:
# webpage.content is raw bytes; pass it to BeautifulSoup together with the
# "html.parser" parser to turn it into a searchable HTML document
soup = BeautifulSoup(markup=webpage.content, features="html.parser")

In [16]:
# soup object containing all the data
soup

<!DOCTYPE html>
<html lang="en"><head><link href="https://rukminim1.flixcart.com" rel="preconnect"/><link href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app_modules.chunk.905c37.css" rel="stylesheet"/><link href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app.chunk.c46047.css" rel="stylesheet"/><meta content="text/html; charset=utf-8" http-equiv="Content-type"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="102988293558" property="fb:page_id"/><meta content="658873552,624500995,100000233612389" property="fb:admins"/><meta content="noodp" name="robots"/><link href="https://static-assets-web.flixcart.com/www/promos/new/20150528-140547-favicon-retina.ico" rel="shortcut icon"/><link href="/osdd.xml?v=2" rel="search" type="application/opensearchdescription+xml"/><meta content="website" property="og:type"/><meta content="Flipkart.com" name="og_site_name" property="og:site_name"/><link href="/apple-touch-icon-57x57.png" rel

In [17]:
# The results page is parsed; next we collect the product links found on it.
# Any element can be extracted from the page. Links live in <a> tags, and a
# tag can be narrowed down by its attributes (id, class, ...).
# On flipkart: Inspect a product --> find its <a class="..."> tag in the HTML
# and paste the class value into the class_ filter below.
# find_all gives back the matching <a> tags as a list of Tag objects.
links = soup.find_all("a", class_="_1fQZEK")

In [18]:
# We now have all the anchor <a> tags with that class from that particular page (flipkart.com --> tablets)
links

[<a class="_1fQZEK" href="/apple-ipad-9th-gen-64-gb-rom-10-2-inch-wi-fi-only-space-grey/p/itmd7d2c4840fa04?pid=TABG6VNCHTRZGN9N&amp;lid=LSTTABG6VNCHTRZGN9NIK0OLT&amp;marketplace=FLIPKART&amp;q=tablets&amp;store=tyy%2Fhry&amp;spotlightTagId=BestsellerId_tyy%2Fhry&amp;srno=s_1_1&amp;otracker=search&amp;otracker1=search&amp;fm=organic&amp;iid=26208fe8-f950-4cd2-bb9b-0bb14efb8766.TABG6VNCHTRZGN9N.SEARCH&amp;ppt=None&amp;ppn=None&amp;ssid=n130a4wtyo0000001672494546960&amp;qH=c6411ee33ebd1dbd" rel="noopener noreferrer" target="_blank"><div></div><div class="MIXNux"><div class="_2QcLo-"><div><div class="CXW8mj" style="height:200px;width:200px"><img alt="APPLE iPad (9th Gen) 64 GB ROM 10.2 inch with Wi-Fi Only (Space Grey)" class="_396cs4" loading="eager" src="https://rukminim1.flixcart.com/image/312/312/ktop5e80/tablet/x/9/o/mk2k3hn-a-apple-original-imag6yy7xjjugz4w.jpeg?q=70"/></div></div></div><div class="_3wLduG"><div class="_3PzNI-"><span class="f3A4_V"><label class="_2iDkf8"><input class

In [19]:
# For now we'll get the information from just 1 link
links[0]


<a class="_1fQZEK" href="/apple-ipad-9th-gen-64-gb-rom-10-2-inch-wi-fi-only-space-grey/p/itmd7d2c4840fa04?pid=TABG6VNCHTRZGN9N&amp;lid=LSTTABG6VNCHTRZGN9NIK0OLT&amp;marketplace=FLIPKART&amp;q=tablets&amp;store=tyy%2Fhry&amp;spotlightTagId=BestsellerId_tyy%2Fhry&amp;srno=s_1_1&amp;otracker=search&amp;otracker1=search&amp;fm=organic&amp;iid=26208fe8-f950-4cd2-bb9b-0bb14efb8766.TABG6VNCHTRZGN9N.SEARCH&amp;ppt=None&amp;ppn=None&amp;ssid=n130a4wtyo0000001672494546960&amp;qH=c6411ee33ebd1dbd" rel="noopener noreferrer" target="_blank"><div></div><div class="MIXNux"><div class="_2QcLo-"><div><div class="CXW8mj" style="height:200px;width:200px"><img alt="APPLE iPad (9th Gen) 64 GB ROM 10.2 inch with Wi-Fi Only (Space Grey)" class="_396cs4" loading="eager" src="https://rukminim1.flixcart.com/image/312/312/ktop5e80/tablet/x/9/o/mk2k3hn-a-apple-original-imag6yy7xjjugz4w.jpeg?q=70"/></div></div></div><div class="_3wLduG"><div class="_3PzNI-"><span class="f3A4_V"><label class="_2iDkf8"><input class=

In [20]:
# to get the actual link 
links[0].get('href')

'/apple-ipad-9th-gen-64-gb-rom-10-2-inch-wi-fi-only-space-grey/p/itmd7d2c4840fa04?pid=TABG6VNCHTRZGN9N&lid=LSTTABG6VNCHTRZGN9NIK0OLT&marketplace=FLIPKART&q=tablets&store=tyy%2Fhry&spotlightTagId=BestsellerId_tyy%2Fhry&srno=s_1_1&otracker=search&otracker1=search&fm=organic&iid=26208fe8-f950-4cd2-bb9b-0bb14efb8766.TABG6VNCHTRZGN9N.SEARCH&ppt=None&ppn=None&ssid=n130a4wtyo0000001672494546960&qH=c6411ee33ebd1dbd'

In [21]:
link=links[0].get('href')

In [22]:
# to get the complete link to the page on the internet
product_list="https://flipkart.com" + link

In [23]:
# You can copy and paste the output link in the browser
# It will lead us to the correct page
product_list

'https://flipkart.com/apple-ipad-9th-gen-64-gb-rom-10-2-inch-wi-fi-only-space-grey/p/itmd7d2c4840fa04?pid=TABG6VNCHTRZGN9N&lid=LSTTABG6VNCHTRZGN9NIK0OLT&marketplace=FLIPKART&q=tablets&store=tyy%2Fhry&spotlightTagId=BestsellerId_tyy%2Fhry&srno=s_1_1&otracker=search&otracker1=search&fm=organic&iid=26208fe8-f950-4cd2-bb9b-0bb14efb8766.TABG6VNCHTRZGN9N.SEARCH&ppt=None&ppn=None&ssid=n130a4wtyo0000001672494546960&qH=c6411ee33ebd1dbd'

In [24]:
# We have the complete link to a product page, so we repeat the process:
# request that page to get its HTML document, then parse it as we did above.
# A timeout is passed because requests waits indefinitely by default, which
# would leave this cell hanging if the server never answers.
new_webpage = requests.get(product_list, headers=HEADERS, timeout=30)

In [25]:
# If we got response as 200 then page is working fine
new_webpage

<Response [200]>

In [26]:
# Parse the product page's raw bytes with the "html.parser" parser
new_soup = BeautifulSoup(markup=new_webpage.content, features="html.parser")

In [27]:
new_soup

<!DOCTYPE html>
<html lang="en"><head><link href="https://rukminim1.flixcart.com" rel="preconnect"/><link href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app_modules.chunk.905c37.css" rel="stylesheet"/><link href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app.chunk.c46047.css" rel="stylesheet"/><meta content="text/html; charset=utf-8" http-equiv="Content-type"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="102988293558" property="fb:page_id"/><meta content="658873552,624500995,100000233612389" property="fb:admins"/><meta content="noodp" name="robots"/><link href="https://static-assets-web.flixcart.com/www/promos/new/20150528-140547-favicon-retina.ico" rel="shortcut icon"/><link href="/osdd.xml?v=2" rel="search" type="application/opensearchdescription+xml"/><meta content="website" property="og:type"/><meta content="Flipkart.com" name="og_site_name" property="og:site_name"/><link href="/apple-touch-icon-57x57.png" rel

In [32]:
# now got the page (e.g https://www.flipkart.com/apple-ipad-9th-gen-64-gb-rom-10-2-inch-wi-fi-only-space-grey/p/itmd7d2c4840fa04?pid=TABG6VNCHTRZGN9N&lid=LSTTABG6VNCHTRZGN9NIK0OLT&marketplace=FLIPKART&q=tablets&store=tyy%2Fhry&spotlightTagId=BestsellerId_tyy%2Fhry&srno=s_1_1&otracker=search&otracker1=search&fm=Search&iid=fcbf3ba6-2a37-4f22-b1b2-ff0475022e61.TABG6VNCHTRZGN9N.SEARCH&ppt=sp&ppn=sp&ssid=kfe4xm8irk0000001672493710954&qH=c6411ee33ebd1dbd) and inspect the element and copy, paste the name, attribute of an element
# inspect the element that holds the text we want and note its tag name and attributes
# here the text is in a span element with the class shown below
# so we search for a span element with that class attribute
# find() returns only the first matching span element
new_soup.find("span", attrs={"class":"B_NuCI"})


<span class="B_NuCI">APPLE iPad (9th Gen) 64 GB ROM 10.2 inch with Wi-Fi Only (Space Grey)</span>

In [33]:
# to get only the text
new_soup.find("span", attrs={"class":"B_NuCI"}).text


'APPLE iPad (9th Gen) 64 GB ROM 10.2 inch with Wi-Fi Only (Space Grey)'

In [34]:
# If there are white spaces before and after the string we can remove those as well
new_soup.find("span", attrs={"class":"B_NuCI"}).text.strip()


'APPLE iPad (9th Gen) 64 GB ROM 10.2 inch with Wi-Fi Only (Space Grey)'

In [39]:
# Suppose we want to extract the price 
# now got the page (e.g https://www.flipkart.com/apple-ipad-9th-gen-64-gb-rom-10-2-inch-wi-fi-only-space-grey/p/itmd7d2c4840fa04?pid=TABG6VNCHTRZGN9N&lid=LSTTABG6VNCHTRZGN9NIK0OLT&marketplace=FLIPKART&q=tablets&store=tyy%2Fhry&spotlightTagId=BestsellerId_tyy%2Fhry&srno=s_1_1&otracker=search&otracker1=search&fm=Search&iid=fcbf3ba6-2a37-4f22-b1b2-ff0475022e61.TABG6VNCHTRZGN9N.SEARCH&ppt=sp&ppn=sp&ssid=kfe4xm8irk0000001672493710954&qH=c6411ee33ebd1dbd) and inspect the element and copy, paste the name, attribute of an element
# inspect the element that holds the price and note its tag name and attributes
# here the price is in a div element with the class shown below
# find() returns only the first matching div element
new_soup.find("div", attrs={"class":"_30jeq3 _16Jk6d"}).text


'₹29,990'

In [40]:
# To get the ratings
new_soup.find("div", attrs={"class":"_2d4LTz"}).text


'4.6'