# Web Scraping

1. `requests` - Downloads the HTML code of the given URL
2. `BeautifulSoup` - Parses the HTML and scrapes the data out of it

**Steps**

1. Identify URL
2. Inspect HTML code
3. Find the HTML tag for the element that you want to extract.
4. Write some code to scrape this data

In [1]:
# Installing BeautifulSoup

! pip install bs4

Defaulting to user installation because normal site-packages is not writeable
Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py): started
  Building wheel for bs4 (setup.py): finished with status 'done'
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1257 sha256=757ae4b310851e7a3432d478dc5e6c950fba772376f593ddd7902586f65256b2
  Stored in directory: c:\users\hp\appdata\local\pip\cache\wheels\73\2b\cb\099980278a0c9a3e57ff1a89875ec07bfa0b6fcbebb9a8cad3
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [53]:
# Loading required libraries

import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup

In [54]:
# Search-results page to scrape: the Flipkart listing for the query "laptops".
# The literal is split across lines for readability; the pieces concatenate to one URL.

URL = (
    'https://www.flipkart.com/search?q=laptops'
    '&otracker=search&otracker1=search'
    '&marketplace=FLIPKART&as-show=on&as=off'
)

In [55]:
# Load the web page into memory using the requests library.
# The timeout (seconds) stops the cell from hanging forever if the server never answers;
# requests applies no timeout unless one is passed explicitly.

page = requests.get(URL, timeout=30)

In [56]:
# Check the HTTP status code of the response via `.status_code` (200 means the request succeeded)
page.status_code

200

In [57]:
# Extract the HTML source of the downloaded page as a string via `.text`
htmlcode=page.text

Let's identify the features listed below; based on them we will scrape the relevant data from the Flipkart website.

URL = '?'

Price = '?'

Rating = '?'

Title = '?'

Feature = '?'

URL = `https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off`

Price = **div** `_30jeq3 _1_WHN1`  
Features = **ul** `_1xgFaf`  
Rating = **div** `_3LWZlK`  
Prod Title = **div** `_4rR01T`

In [58]:
# Parse the HTML code using the bs4 library.
# The parser is named explicitly: without it BeautifulSoup picks whichever parser happens
# to be installed (and warns), so the same notebook could parse differently on another machine.

soup = BeautifulSoup(htmlcode, 'html.parser')

In [59]:
# prettify() returns the parsed HTML as an indented, human-readable string

print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <link href="https://rukminim1.flixcart.com" rel="preconnect"/>
  <link href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app_modules.chunk.905c37.css" rel="stylesheet"/>
  <link href="//static-assets-web.flixcart.com/fk-p-linchpin-web/fk-cp-zion/css/app.chunk.c46047.css" rel="stylesheet"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <meta content="102988293558" property="fb:page_id"/>
  <meta content="658873552,624500995,100000233612389" property="fb:admins"/>
  <meta content="noodp" name="robots"/>
  <link href="https:///www/promos/new/20150528-140547-favicon-retina.ico" rel="shortcut icon"/>
  <link href="/osdd.xml?v=2" rel="search" type="application/opensearchdescription+xml"/>
  <meta content="website" property="og:type"/>
  <meta content="Flipkart.com" name="og_site_name" property="og:site_name"/>
  <link href="/apple-touch-icon-57x

URL = `https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off`

Price -> div class = `_30jeq3 _1_WHN1`

Rating -> div class = `_3LWZlK`

Title -> div class = `_4rR01T`

Feature List -> ul class = `_1xgFaf`

### find()

In [60]:
# Price
# syntax : .find('tag', attrs={'attribute': 'value'})
# find() returns only the first element that matches.

# Bound to `price` (not `title`): the `_30jeq3 _1_WHN1` div holds the price text.
price = soup.find("div", attrs={"class": "_30jeq3 _1_WHN1"})
price.text

'₹38,900'

In [61]:
# Product title of the first match (the variable is called `brand`, but the
# `_4rR01T` div holds the full product title, not just the brand)
brand=soup.find("div",attrs={"class":"_4rR01T"})
brand.text

'Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/512 GB SSD/Windows 11 Home) 82H801L7IN | 82H802FJIN | 82H802...'

In [62]:
# Rating: find() returns only the first div carrying the `_3LWZlK` class
rating=soup.find("div",attrs={"class":"_3LWZlK"})
rating.text


'4.2'

In [63]:
# Feature list of the first match; `.text` concatenates the text of all nested
# elements with no separator, which is why the printed items run together
features=soup.find("ul",attrs={"class":"_1xgFaf"})
print(features.text)


Intel Core i3 Processor (11th Gen)8 GB DDR4 RAM64 bit Windows 11 Operating System512 GB SSD39.62 cm (15.6 inch) DisplayOffice Home and Student 20212 Year Onsite�Warranty


### find_all()

In [65]:
# Find all prices | .find_all('tag', attrs={'attribute': 'value'})
# find_all() returns every matching element, so we loop to print each price.
all_price = soup.find_all("div", attrs={"class": "_30jeq3 _1_WHN1"})
for price_tag in all_price:
    print(price_tag.get_text())


₹38,900
₹65,390
₹33,990
₹46,450
₹25,990
₹29,990
₹56,990
₹86,990
₹40,650
₹37,248
₹47,450
₹36,000
₹35,999
₹34,990
₹41,990
₹37,900
₹54,890
₹63,990
₹52,990
₹57,490
₹1,18,990
₹38,490
₹89,990
₹55,500


In [None]:
# Find All Ratings



In [66]:
# find() hands back a single bs4 Tag: show the tag itself, its type, and its text.
price = soup.find('div', attrs={'class': '_30jeq3 _1_WHN1'})

print(price)
print(type(price))
print(price.get_text())

<div class="_30jeq3 _1_WHN1">₹38,900</div>
<class 'bs4.element.Tag'>
₹38,900


In [67]:
# find_all() hands back a list-like collection of Tags: show the collection,
# its type, the type of one element, and then the text of every element.
prices = soup.find_all('div', attrs={'class': '_30jeq3 _1_WHN1'})

print(prices)
print(type(prices))
print(type(prices[1]))

for price_tag in prices:
    print(price_tag.get_text())

[<div class="_30jeq3 _1_WHN1">₹38,900</div>, <div class="_30jeq3 _1_WHN1">₹65,390</div>, <div class="_30jeq3 _1_WHN1">₹33,990</div>, <div class="_30jeq3 _1_WHN1">₹46,450</div>, <div class="_30jeq3 _1_WHN1">₹25,990</div>, <div class="_30jeq3 _1_WHN1">₹29,990</div>, <div class="_30jeq3 _1_WHN1">₹56,990</div>, <div class="_30jeq3 _1_WHN1">₹86,990</div>, <div class="_30jeq3 _1_WHN1">₹40,650</div>, <div class="_30jeq3 _1_WHN1">₹37,248</div>, <div class="_30jeq3 _1_WHN1">₹47,450</div>, <div class="_30jeq3 _1_WHN1">₹36,000</div>, <div class="_30jeq3 _1_WHN1">₹35,999</div>, <div class="_30jeq3 _1_WHN1">₹34,990</div>, <div class="_30jeq3 _1_WHN1">₹41,990</div>, <div class="_30jeq3 _1_WHN1">₹37,900</div>, <div class="_30jeq3 _1_WHN1">₹54,890</div>, <div class="_30jeq3 _1_WHN1">₹63,990</div>, <div class="_30jeq3 _1_WHN1">₹52,990</div>, <div class="_30jeq3 _1_WHN1">₹57,490</div>, <div class="_30jeq3 _1_WHN1">₹1,18,990</div>, <div class="_30jeq3 _1_WHN1">₹38,490</div>, <div class="_30jeq3 _1_WHN1">

In [68]:
# Collect every rating element on the page and print its text, one per line.
ratings = soup.find_all('div', attrs={'class': '_3LWZlK'})

for rating_tag in ratings:
    print(rating_tag.get_text())

4.2
4.3
4.3
4.2
4
3
4.2
4.3
4.2
4.3
4.2
4.3
4.2
3.8
4.4
4.4
4.4
4.8
4.2
4.2
4.4
4.4
4
4
4.2
5
5
4.3
5
4
4.4
5
2
4.4
5
5


In [69]:
# find() returns a single Tag (the first match) rather than a collection.
# Note: this rebinds `ratings` from the find_all() result to that single Tag.
ratings = soup.find('div', attrs={'class' : '_3LWZlK'})

print(ratings.text)

4.2


### Let's look into all the URLs

https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=2

https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=5

https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=8

https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=3

https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=10

https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=9

In [None]:
# Build the URL of every results page.
# The pages differ only in the trailing `page=<n>` query parameter, e.g.
#   https://www.flipkart.com/search?q=laptops&otracker=search
#   &otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=9

BASE_URL = (
    'https://www.flipkart.com/search?q=laptops&otracker=search'
    '&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'
)
LAST_PAGE = 41  # last results page to visit (pages are numbered from 1)

# Keep the URLs in a list so they can be inspected or reused, then print them.
page_urls = ['{}&page={}'.format(BASE_URL, i) for i in range(1, LAST_PAGE + 1)]
for page_url in page_urls:
    print(page_url)

### Code for Web Scraping (Incorrect way)

Price -> div class = `_30jeq3 _1_WHN1`

Rating -> div class = `_3LWZlK`

Title -> div class = `_4rR01T`

Feature List -> ul class = `_1xgFaf`

In [73]:
%%time

title = []
rating = []
price = []
feature = []

for i in range(1, 42):
    URL = 'https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={}'. format(i)
    
    page = requests.get(URL)
    htmlCode = page.text
    
    soup = BeautifulSoup(htmlCode)
    
    # title
    titles = soup.find_all('div', attrs={'class' : '_4rR01T'})
    for item in titles:
        title.append(item.text)
        
    # ratings
    ratings = soup.find_all('div', attrs={'class' : '_3LWZlK'})
    for item in ratings:
        rating.append(item.text)
        
    # prices
    prices = soup.find_all('div', attrs={'class' : '_30jeq3 _1_WHN1'})
    for item in prices:
        price.append(item.text)
        
    # features
    features = soup.find_all('ul', attrs={'class' : '_1xgFaf'})
    for item in features:
        feature.append(item.text)

Wall time: 1min 11s


In [74]:
# Compare how many entries each list collected; unequal counts mean the lists are misaligned.
for collected in (title, rating, price, feature):
    print(len(collected))

984
1295
984
984


### Code for Web Scraping (Correct way)

In [77]:
# Scraping the web pages listing-by-listing.
# Every `_2kHMtA` div is treated as one listing; each field is looked up *inside* that div
# and a missing field is recorded as NaN, so the four lists always stay the same length.


def _text_or_nan(card, tag, css_class):
    """Return the text of the first `tag` with class `css_class` inside `card`, or NaN if absent."""
    node = card.find(tag, attrs={'class': css_class})
    # np.nan (lowercase) is used because the `np.NaN` alias was removed in NumPy 2.0.
    return np.nan if node is None else node.text


title = []
rating = []
price = []
features = []

for i in range(1, 42):
    URL = 'https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={}'. format(i)

    # requests applies no timeout by default; set one so a stalled page cannot hang the loop.
    page = requests.get(URL, timeout=30)
    htmlCode = page.text

    # Name the parser explicitly so parsing does not depend on which parsers are installed.
    soup = BeautifulSoup(htmlCode, 'html.parser')

    for x in soup.find_all('div', attrs={'class' : '_2kHMtA'}):
        title.append(_text_or_nan(x, 'div', '_4rR01T'))
        price.append(_text_or_nan(x, 'div', '_30jeq3 _1_WHN1'))
        rating.append(_text_or_nan(x, 'div', '_3LWZlK'))
        features.append(_text_or_nan(x, 'ul', '_1xgFaf'))

In [78]:
# All four lists should now report the same number of entries.
for collected in (title, price, rating, features):
    print(len(collected))

984
984
984
984


# Create a DataFrame and save it to a CSV file

In [79]:
# Assemble the scraped lists into one table, one column per field.
df = pd.DataFrame(dict(Product=title, Rating=rating, MRP=price, Feature=features))

In [80]:
# Preview the first five rows
df.head()

Unnamed: 0,Product,Rating,MRP,Feature
0,DELL Ryzen 5 Hexa Core AMD R5-6600H - (16 GB/5...,3.0,"₹86,990",Processor: AMDR5-6600H (3.30 GHz Up to 4.50 Gh...
1,Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/256 ...,4.3,"₹36,000",Intel Core i3 Processor (11th Gen)8 GB DDR4 RA...
2,ASUS VivoBook 15 (2022) Core i3 10th Gen - (8 ...,4.3,"₹33,990",Intel Core i3 Processor (10th Gen)8 GB DDR4 RA...
3,HP Ryzen 5 Hexa Core 5500U - (8 GB/512 GB SSD/...,4.3,"₹46,450",AMD Ryzen 5 Hexa Core Processor8 GB DDR4 RAM64...
4,HP Celeron Dual Core - (8 GB/256 GB SSD/Window...,,"₹25,990",Intel Celeron Dual Core Processor8 GB DDR4 RAM...


In [81]:
# (rows, columns) of the scraped table
df.shape

(984, 4)

In [82]:
# Preview the last five rows
df.tail()

Unnamed: 0,Product,Rating,MRP,Feature
979,DELL G15 Core i7 11th Gen - (16 GB/512 GB SSD/...,4.3,"₹98,990","NVIDIA GEFORCE RTX 3050 Ti15.6 inches Full HD,..."
980,Lenovo Ideapad Gaming 3 Core i5 11th Gen - (8 ...,4.4,"₹66,490",Intel Core i5 Processor (11th Gen)8 GB DDR4 RA...
981,Lenovo IdeaPad Gaming 3 Core i5 11th Gen - (8 ...,4.4,"₹64,990",Intel Core i5 Processor (11th Gen)8 GB DDR4 RA...
982,Avita Liber Core i5 10th Gen - (8 GB/256 GB SS...,4.2,"₹38,476",Intel Core i5 Processor (10th Gen)8 GB DDR4 RA...
983,ASUS TUF Gaming F17 (2022) Core i7 12th Gen - ...,4.3,"₹1,18,000",Intel Core i7 Processor (12th Gen)16 GB DDR5 R...


In [84]:
# .info() summarises column dtypes and non-null counts (shows how many ratings are missing)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Product  984 non-null    object
 1   Rating   671 non-null    object
 2   MRP      984 non-null    object
 3   Feature  984 non-null    object
dtypes: object(4)
memory usage: 30.9+ KB


In [85]:

# Save the scraped table to CSV, leaving out the DataFrame index
df.to_csv('Laptop_Details.csv', index = False)

In [86]:
# Second export: no index=False here, so the DataFrame index is written as the first CSV column.
# NOTE(review): the file name spells "Deatils" (presumably "Details"); confirm nothing reads
# this exact name before renaming it.
df.to_csv('laptop_Deatils_wi.csv')

In [None]:
import numpy as np
import pandas 