In [None]:
!pip install requests beautifulsoup4 pandas




## *Scrape Book Data from First Page*

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


# Target URL
url = "https://books.toscrape.com/"

# Send HTTP request
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Extract book containers
books = soup.select("article.product_pod")

# Lists to hold data
titles, prices, ratings, availability, genres = [], [], [], [], []

# Loop through each book
for book in books:
    title = book.h3.a['title']
    price = book.select_one(".price_color").text.strip()
    rating = book.p['class'][1]
    stock = book.select_one(".instock.availability").text.strip()

    titles.append(title)
    prices.append(price)
    ratings.append(rating)
    availability.append(stock)
    genres.append("Unknown")  # We’ll fix this in a later step

# Create DataFrame
df = pd.DataFrame({
    "Title": titles,
    "Price": prices,
    "Rating": ratings,
    "Availability": availability,
    "Genre": genres
})

df.head()


Unnamed: 0,Title,Price,Rating,Availability,Genre
0,A Light in the Attic,Â£51.77,Three,In stock,Unknown
1,Tipping the Velvet,Â£53.74,One,In stock,Unknown
2,Soumission,Â£50.10,One,In stock,Unknown
3,Sharp Objects,Â£47.82,Four,In stock,Unknown
4,Sapiens: A Brief History of Humankind,Â£54.23,Five,In stock,Unknown


## Scrap all pages

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

# Base URL
base_url = "https://books.toscrape.com/catalogue/page-{}.html"

# Initialize lists
titles, prices, ratings, availability, genres = [], [], [], [], []

# Loop through 1 to 50 pages
for page_num in range(1, 51):
    url = base_url.format(page_num)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Page {page_num} not found, skipping.")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    books = soup.select("article.product_pod")

    for book in books:
        title = book.h3.a['title']
        price = book.select_one(".price_color").text.strip()
        rating = book.p['class'][1]
        stock = book.select_one(".instock.availability").text.strip()

        # Visit the book's individual page to get genre
        book_url = urljoin(url, book.h3.a['href'])
        book_resp = requests.get(book_url)
        book_soup = BeautifulSoup(book_resp.text, 'html.parser')
        breadcrumb = book_soup.select('ul.breadcrumb li a')

        if len(breadcrumb) >= 3:
            genre = breadcrumb[2].text.strip()
        else:
            genre = "Unknown"

        # Append data
        titles.append(title)
        prices.append(price)
        ratings.append(rating)
        availability.append(stock)
        genres.append(genre)

    print(f"Page {page_num} scraped.")

# Create DataFrame
df_all = pd.DataFrame({
    "Title": titles,
    "Price": prices,
    "Rating": ratings,
    "Availability": availability,
    "Genre": genres
})

df_all.head()


Page 1 scraped.
Page 2 scraped.
Page 3 scraped.
Page 4 scraped.
Page 5 scraped.
Page 6 scraped.
Page 7 scraped.
Page 8 scraped.
Page 9 scraped.
Page 10 scraped.
Page 11 scraped.
Page 12 scraped.
Page 13 scraped.
Page 14 scraped.
Page 15 scraped.
Page 16 scraped.
Page 17 scraped.
Page 18 scraped.
Page 19 scraped.
Page 20 scraped.
Page 21 scraped.
Page 22 scraped.
Page 23 scraped.
Page 24 scraped.
Page 25 scraped.
Page 26 scraped.
Page 27 scraped.
Page 28 scraped.
Page 29 scraped.
Page 30 scraped.
Page 31 scraped.
Page 32 scraped.
Page 33 scraped.
Page 34 scraped.
Page 35 scraped.
Page 36 scraped.
Page 37 scraped.
Page 38 scraped.
Page 39 scraped.
Page 40 scraped.
Page 41 scraped.
Page 42 scraped.
Page 43 scraped.
Page 44 scraped.
Page 45 scraped.
Page 46 scraped.
Page 47 scraped.
Page 48 scraped.
Page 49 scraped.
Page 50 scraped.


Unnamed: 0,Title,Price,Rating,Availability,Genre
0,A Light in the Attic,Â£51.77,Three,In stock,Poetry
1,Tipping the Velvet,Â£53.74,One,In stock,Historical Fiction
2,Soumission,Â£50.10,One,In stock,Fiction
3,Sharp Objects,Â£47.82,Four,In stock,Mystery
4,Sapiens: A Brief History of Humankind,Â£54.23,Five,In stock,History


In [None]:
df_all.to_csv("books_data.csv", index=False)
print("Data saved to books_data.csv")


Data saved to books_data.csv


## *Part2 - Pyspark*

In [None]:
!apt-get install openjdk-11-jdk -y
!pip install pyspark


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jre
  x11-utils
Suggested packages:
  libxt-doc openjdk-11-demo openjdk-11-source visualvm mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jdk
  openjdk-11-jre x11-utils
0 upgraded, 10 newly installed, 0 to remove and 35 not upgraded.
Need to get 6,920 kB of archives.
After this operation, 16.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-core all 2.37-2build1 [1,041 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-extra all 2.37-2build1 [2,041 kB]
Get:3 http://archive.ubuntu.com/ubuntu jam

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create Spark session
spark = SparkSession.builder \
    .appName("Books Data Analysis") \
    .getOrCreate()


In [None]:
from google.colab import files
uploaded = files.upload()


Saving books_data.csv to books_data (1).csv


In [None]:
#spark dataframe
df_spark = spark.read.csv("books_data.csv", header=True, inferSchema=True)
df_spark.show(10)  # Show first 10 rows




+--------------------+-------+------+------------+------------------+
|               Title|  Price|Rating|Availability|             Genre|
+--------------------+-------+------+------------+------------------+
|A Light in the Attic|Â£51.77| Three|    In stock|            Poetry|
|  Tipping the Velvet|Â£53.74|   One|    In stock|Historical Fiction|
|          Soumission|Â£50.10|   One|    In stock|           Fiction|
|       Sharp Objects|Â£47.82|  Four|    In stock|           Mystery|
|Sapiens: A Brief ...|Â£54.23|  Five|    In stock|           History|
|     The Requiem Red|Â£22.65|   One|    In stock|       Young Adult|
|The Dirty Little ...|Â£33.34|  Four|    In stock|          Business|
|The Coming Woman:...|Â£17.93| Three|    In stock|           Default|
|The Boys in the B...|Â£22.60|  Four|    In stock|           Default|
|     The Black Maria|Â£52.15|   One|    In stock|            Poetry|
+--------------------+-------+------+------------+------------------+
only showing top 10 

In [None]:

df_spark.printSchema()


root
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Availability: string (nullable = true)
 |-- Genre: string (nullable = true)



In [None]:
df_spark.show(10, truncate=False)


+----------------------------------------------------------------------------------------------+-------+------+------------+------------------+
|Title                                                                                         |Price  |Rating|Availability|Genre             |
+----------------------------------------------------------------------------------------------+-------+------+------------+------------------+
|A Light in the Attic                                                                          |Â£51.77|Three |In stock    |Poetry            |
|Tipping the Velvet                                                                            |Â£53.74|One   |In stock    |Historical Fiction|
|Soumission                                                                                    |Â£50.10|One   |In stock    |Fiction           |
|Sharp Objects                                                                                 |Â£47.82|Four  |In stock    |Mystery     

In [None]:
df_spark.describe().show()


+-------+--------------------+-------+------+------------+-----------+
|summary|               Title|  Price|Rating|Availability|      Genre|
+-------+--------------------+-------+------+------------+-----------+
|  count|                1000|   1000|  1000|        1000|       1000|
|   mean|                NULL|   NULL|  NULL|        NULL|       NULL|
| stddev|                NULL|   NULL|  NULL|        NULL|       NULL|
|    min|"""Most Blessed o...|Â£10.00|  Five|    In stock|   Academic|
|    max|               salt.|Â£59.99|   Two|    In stock|Young Adult|
+-------+--------------------+-------+------+------------+-----------+



In [None]:
#Filter Data

df_spark = df_spark.withColumn("Price", col("Price").substr(2, 10).cast("float"))  # Remove '£' and convert
df_spark.filter(col("Price") > 20).show(10, truncate=False)


+-----+-----+------+------------+-----+
|Title|Price|Rating|Availability|Genre|
+-----+-----+------+------------+-----+
+-----+-----+------+------------+-----+



In [None]:
#convert text ratings like "Three", "Five" to numbers:

from pyspark.sql.functions import when

rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5
}

# Map text to numbers
df_spark = df_spark.withColumn("Rating_Num",
    when(col("Rating") == "One", 1)
    .when(col("Rating") == "Two", 2)
    .when(col("Rating") == "Three", 3)
    .when(col("Rating") == "Four", 4)
    .when(col("Rating") == "Five", 5)
)

df_spark.filter(col("Rating_Num") >= 4).show(25, truncate=False)


+---------------------------------------------------------------------------------------------------------+-----+------+------------+--------------+----------+
|Title                                                                                                    |Price|Rating|Availability|Genre         |Rating_Num|
+---------------------------------------------------------------------------------------------------------+-----+------+------------+--------------+----------+
|Sharp Objects                                                                                            |NULL |Four  |In stock    |Mystery       |4         |
|Sapiens: A Brief History of Humankind                                                                    |NULL |Five  |In stock    |History       |5         |
|The Dirty Little Secrets of Getting Your Dream Job                                                       |NULL |Four  |In stock    |Business      |4         |
|The Boys in the Boat: Nine Americans an