<a href="https://colab.research.google.com/github/Tanishq7642/Machine-Learning-UML501-/blob/main/Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

> **Q1. Write a Python program to scrape all available books from the website (https://books.toscrape.com/) Books to Scrape. For each book, extract the following details:**
>
> 1.  **Title**
> 2.  **Price**
> 3.  **Availability (In stock / Out of stock)**
> 4.  **Star Rating (One, Two, Three, Four, Five)**
>
> **Store the scraped results into a Pandas DataFrame and export them to a CSV file named `books.csv`.**
>
> *(Note: Use the requests library to fetch the HTML page. Use BeautifulSoup to parse and extract book details and handle pagination so that books from all pages are scraped)*

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "https://books.toscrape.com/catalogue/"
current_url = base_url + "page-1.html"

all_books_data = []

print("Starting to scrape books...")

while current_url:
    response = requests.get(current_url)
    soup = BeautifulSoup(response.text, "html.parser")

    books_on_page = soup.find_all("article", class_="product_pod")

    for book in books_on_page:
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text
        availability = book.find("p", class_="instock availability").text.strip()

        rating_classes = book.find("p", class_="star-rating")["class"]
        rating = rating_classes[1]

        all_books_data.append({
            "Title": title,
            "Price": price,
            "Availability": availability,
            "Rating": rating
        })

    next_page_element = soup.find("li", class_="next")

    if next_page_element:
        next_page_href = next_page_element.a["href"]
        current_url = base_url + next_page_href
    else:
        current_url = None

df = pd.DataFrame(all_books_data)
df.to_csv("books.csv", index=False)

print(f"Saved {len(all_books_data)} books to books.csv")

Starting to scrape books...
Saved 1000 books to books.csv


> **Q2. Write a Python program to scrape the IMDB Top 250 Movies list (https://www.imdb.com/chart/top/). For each movie, extract the following details:**
>
> 1.  **Rank (1-250)**
> 2.  **Movie Title**
> 3.  **Year of Release**
> 4.  **IMDB Rating**
>
> **Store the results in a Pandas DataFrame and export it to a CSV file named `imdb_top250.csv`.**
>
> *(Note: Use Selenium/Playwright to scrape the required details from this website)*

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = "https://www.imdb.com/chart/top/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

movie_list = []

movies = soup.find_all('li', class_=re.compile("ipc-metadata-list-summary-item"))

print("Starting to scrape IMDB...")

for movie in movies:
    title_element = movie.find("h3", class_=re.compile("ipc-title__text"))
    metadata_spans = movie.find_all("span", class_=re.compile("meta-info-item"))
    rating_element = movie.find("span", class_=re.compile("ipc-rating-star"))

    if title_element and len(metadata_spans) > 0 and rating_element:
        title_text = title_element.text

        try:
            rank_str, title_str = title_text.split(". ", 1)
            rank = int(rank_str)
        except ValueError:
            continue

        year = metadata_spans[0].text

        rating = rating_element.text.split("(")[0].strip()

        movie_list.append({
            "Rank": rank,
            "Title": title_str,
            "Year": year,
            "Rating": rating
        })

df = pd.DataFrame(movie_list)
df.to_csv("imdb_top250.csv", index=False)

print(f"Saved {len(movie_list)} movies to imdb_top250.csv")

Starting to scrape IMDB...
Saved 0 movies to imdb_top250.csv


> **Q3. Write a Python program to scrape the weather information for top world cities from the given website (https://www.timeanddate.com/weather/). For each city, extract the following details:**
>
> 1.  **City Name**
> 2.  **Temperature**
> 3.  **Weather Condition (e.g., Clear, Cloudy, Rainy, etc.)**
>
> **Store the results in a Pandas DataFrame and export it to a CSV file named `weather.csv`.**

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.timeanddate.com/weather/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

weather_data = []
table = soup.find("table", class_="zebra fw tb-wc")

print("Starting to scrape weather...")

try:
    rows = table.tbody.find_all("tr")
    for row in rows:
        cells = row.find_all("td")

        if len(cells) > 2:
            city = cells[0].a.text
            temp = cells[1].text
            condition = cells[2].text

            weather_data.append({
                "City": city,
                "Temperature": temp,
                "Condition": condition
            })

    df = pd.DataFrame(weather_data)
    df.to_csv("weather.csv", index=False)
    print(f"Saved {len(weather_data)} cities to weather.csv")

except AttributeError:
    print("Error: Could not find the weather table. The website structure might have changed.")


Starting to scrape weather...
Error: Could not find the weather table. The website structure might have changed.
