In [1]:
# Web scraping using beautiful soup \\

# for web scraping we need a tool to web scrape, we are using BeautifulSoup, and we would need a parser!
# A parser is a tool that reads messy data (like HTML code) and helps you pick out the useful 
# parts — like titles, prices, or links — in an organized way using the specific language grammatical tree.

In [2]:
# Using Beautiful Soup for Data Collection - 
                    
# 1. What is BeautifulSoup?

# BeautifulSoup is a Python library used to parse HTML and XML documents.
# It creates a parse tree from page content, making it easy to extract data.
# It is often used with requests to scrape websites.

In [3]:
# 2. Installing BeautifulSoup -
# Install both beautifulsoup4 and a parser like lxml:

# pip install beautifulsoup4 lxml

In [73]:
# Inspect and understand the html code, and carefully find the tag where the information lies.

# Now go to the website and check what information that we need to scrape, we can check the 
# elements html code from the inspect page and then we have to figure out the html tags, basically 
# where is our information coded (under which html tag) its written

In [74]:
# --- Some suefull functions of Beautiful Soup -----

# Searching and Filtering Elements:

# soup.find(): 
# Retrieves the first element that matches specified criteria, (e.g., tag name, attributes, text content).

# soup.find_all(): 
# Retrieves a list of all elements that match the specified criteria.


In [75]:
# Since we found that the name and prices of books are under tag <h3> we would need to get info from it.

In [4]:
# Creating Beautiful Soup object - 
from bs4 import BeautifulSoup     # import 
import requests                       # we need to import requests too, to put get request.

with open("htmls/page1.html", "r", encoding="utf-8") as f:
    content = f.read()

soup = BeautifulSoup(content, "html.parser")

In [5]:
# we got all the infro from a perticular tag - <h3>, # these are the name of the books of page1 

In [20]:
articles = soup.select(".product_pod") 

booksPrices = []
for article in articles :
    titles = article.find("h3").find("a")["title"]
    prices = article.find("p", class_="price_color").text
    prices = prices.split("£")[1]
    rating_tag = article.select_one("p.star-rating")
    Ratings = rating_tag["class"][1]
    booksPrices.append([titles, prices, ratings])
booksPrices    

[['A Light in the Attic', '51.77', 'Two'],
 ['Tipping the Velvet', '53.74', 'Two'],
 ['Soumission', '50.10', 'Two'],
 ['Sharp Objects', '47.82', 'Two'],
 ['Sapiens: A Brief History of Humankind', '54.23', 'Two'],
 ['The Requiem Red', '22.65', 'Two'],
 ['The Dirty Little Secrets of Getting Your Dream Job', '33.34', 'Two'],
 ['The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  '17.93',
  'Two'],
 ['The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  '22.60',
  'Two'],
 ['The Black Maria', '52.15', 'Two'],
 ['Starving Hearts (Triangular Trade Trilogy, #1)', '13.99', 'Two'],
 ["Shakespeare's Sonnets", '20.66', 'Two'],
 ['Set Me Free', '17.46', 'Two'],
 ["Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", '52.29', 'Two'],
 ['Rip it Up and Start Again', '35.02', 'Two'],
 ['Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
  '57.25',
  'Two'],
 ['Olio', '23.88', 'T

In [21]:
import pandas as pd

In [22]:
df = pd.DataFrame(booksPrices, columns = ["Books", "Prices", "Ratings" ])

In [23]:
df

Unnamed: 0,Books,Prices,Ratings
0,A Light in the Attic,51.77,Two
1,Tipping the Velvet,53.74,Two
2,Soumission,50.1,Two
3,Sharp Objects,47.82,Two
4,Sapiens: A Brief History of Humankind,54.23,Two
5,The Requiem Red,22.65,Two
6,The Dirty Little Secrets of Getting Your Dream...,33.34,Two
7,The Coming Woman: A Novel Based on the Life of...,17.93,Two
8,The Boys in the Boat: Nine Americans and Their...,22.6,Two
9,The Black Maria,52.15,Two


In [24]:
df = df.to_csv("pageOne.csv", index= False)