<a href="https://colab.research.google.com/github/VanshGupta18/machine-learning/blob/main/ml_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**ASSIGNMENT-4**

---



Write a Python program to scrape all available books from Books to Scrape
(https://books.toscrape.com/), a live site built for practicing scraping (safe, legal, no
anti-bot measures). For each book, extract the following details:
1. Title
2. Price
3. Availability (In stock / Out of stock)
4. Star Rating (One, Two, Three, Four, Five)

Store the scraped results into a Pandas DataFrame and export them to a CSV file named
books.csv.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Root of the paginated catalogue; listing pages live at {BASE_URL}page-<n>.html.
# HTTPS matches the address given in the assignment text.
BASE_URL = "https://books.toscrape.com/catalogue/"
# Number of listing pages requested by the scraping loop.
TOTAL_PAGES = 50

def get_data_from_page(page_num, timeout=10):
    """Scrape one catalogue listing page and return its books as row dicts.

    Args:
        page_num: Page number; the page is fetched from
            ``{BASE_URL}page-{page_num}.html``.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        A list of dicts with the keys "Title", "Price", "Availability" and
        "Star Rating". A field that cannot be located is the string 'NaN'.
        The list is empty when the page does not exist (HTTP 404) or holds
        no ``article.product_pod`` elements.

    Raises:
        requests.HTTPError: For HTTP error responses other than 404.
    """
    current_page_url = f"{BASE_URL}page-{page_num}.html"

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(current_page_url, timeout=timeout)
    if response.status_code == 404:
        # Asking for a page past the end of the catalogue is not an error.
        return []
    response.raise_for_status()

    # Raw bytes are passed so the parser works out the encoding itself.
    soup = BeautifulSoup(response.content, "html.parser")

    page_data = []
    for book in soup.find_all("article", class_="product_pod"):
        # Guard each step: a missing <h3> must not abort the whole page.
        title_element = book.h3.a if book.h3 is not None else None
        title = title_element.get("title", 'NaN') if title_element is not None else 'NaN'

        price_element = book.find("p", class_="price_color")
        price = price_element.get_text(strip=True) if price_element is not None else 'NaN'

        # Match on the single "availability" class so the element is found
        # whatever other class accompanies it (the exact string
        # "instock availability" would only ever match in-stock entries).
        availability_element = book.find("p", class_="availability")
        availability = (
            availability_element.get_text(strip=True)
            if availability_element is not None
            else 'NaN'
        )

        # The rating word is read from the second CSS class of the element.
        rating_element = book.find("p", class_="star-rating")
        if rating_element is not None and len(rating_element["class"]) > 1:
            star_rating = rating_element["class"][1]
        else:
            star_rating = 'NaN'

        page_data.append({
            "Title": title,
            "Price": price,
            "Availability": availability,
            "Star Rating": star_rating
        })

    return page_data

In [4]:
# Gather the rows of every catalogue page into one flat list.
all_data = []
for page in range(1, TOTAL_PAGES + 1):
    all_data.extend(get_data_from_page(page))

df = pd.DataFrame(all_data)

# The assignment asks for a file named books.csv. index=False keeps the
# pandas row index from being written as an extra unnamed column.
if df.empty:
    print("No books were scraped; books.csv was not written.")
else:
    df.to_csv("books.csv", index=False)

Write a Python program to scrape the IMDB Top 250 Movies list
(https://www.imdb.com/chart/top/). For each movie, extract the following details:
1. Rank (1–250)
2. Movie Title
3. Year of Release
4. IMDB Rating

Store the results in a Pandas DataFrame and export it to a CSV file named imdb_top250.csv.

In [13]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Headless Chrome setup. NOTE(review): the sandbox / dev-shm flags are
# presumably there for container environments such as Colab; confirm.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

driver = webdriver.Chrome(options=options)
movies_data = []
try:
    driver.get("https://www.imdb.com/chart/top/")

    # Wait up to 20 s for the chart list to appear before reading it.
    wait = WebDriverWait(driver, 20)
    list_container = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list"))
    )
    movies = list_container.find_elements(By.TAG_NAME, "li")

    for movie_item in movies:
        # The heading text is split at the first ". " into rank and title.
        title_text = movie_item.find_element(By.CSS_SELECTOR, "h3.ipc-title__text").text
        rank_str, title = title_text.split(". ", 1)

        # The first metadata item is used as the release year.
        metadata_items = movie_item.find_elements(By.CSS_SELECTOR, "span.cli-title-metadata-item")
        year_str = metadata_items[0].text

        # Only the first line of the rating element's text is the score.
        rating_str = movie_item.find_element(By.CSS_SELECTOR, "span.ipc-rating-star").text.split("\n")[0]

        movies_data.append({
            "Rank": int(rank_str),
            "Movie Title": title,
            "Year of Release": int(year_str),
            "IMDB Rating": float(rating_str)
        })
finally:
    # Always shut the browser down, even when parsing fails part-way,
    # so no headless Chrome process is left behind.
    driver.quit()

# Explicit columns keep the sort (and the CSV header) valid even when no
# rows were scraped.
df = pd.DataFrame(
    movies_data,
    columns=["Rank", "Movie Title", "Year of Release", "IMDB Rating"],
)
df = df.sort_values(by="Rank").reset_index(drop=True)
df.to_csv("imdb_top250.csv", index=False, encoding='utf-8')

Write a Python program to scrape the weather information for top world cities from the
given website (https://www.timeanddate.com/weather/). For each city, extract the following
details:
1. City Name
2. Temperature
3. Weather Condition (e.g., Clear, Cloudy, Rainy, etc.)

Store the results in a Pandas DataFrame and export it to a CSV file named weather.csv.

In [19]:

def extract_temp_as_float(temp):
    """Convert a temperature label such as "23 °C" into a float.

    Args:
        temp: Text holding a number, optionally followed by "°C" or "°F".
            Surrounding whitespace and non-breaking spaces are ignored.

    Returns:
        The numeric part as a float; the unit suffix is discarded.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    value = temp.replace("°C", "").replace("°F", "")
    # "\u00a0" and "\xa0" denote the same character, so one replace suffices.
    value = value.replace("\u00a0", "").strip()
    # float() does not accept the typographic minus sign (U+2212); map it to
    # the ASCII hyphen-minus so such negative readings still parse.
    value = value.replace("\u2212", "-")
    return float(value)

url = "https://www.timeanddate.com/weather/"
# Bound the wait and fail loudly on an HTTP error instead of silently
# producing an empty CSV from an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

tds = soup.find_all('td')
results = []
current_city = None
weather_condition = ""

for i, td in enumerate(tds):

    if td.find('a'):
        # A cell containing a link is treated as the start of a city entry.
        current_city = td.get_text(strip=True)

        # Look in the next two cells for an image whose alt text is taken
        # as the weather condition.
        weather_condition = ""
        for offset in range(1, 3):
            if i + offset < len(tds):
                img = tds[i + offset].find('img')
                if img and img.get("alt"):
                    weather_condition = img["alt"]
                    break

    elif 'rbi' in td.get('class', []) and current_city:
        # A cell with the "rbi" class is read as the temperature and closes
        # the entry for the current city.
        temp_str = td.get_text(strip=True)
        try:
            temp_float = extract_temp_as_float(temp_str)
        except ValueError:
            # Unparseable reading: keep the city, leave the value missing.
            temp_float = None
        results.append({
            "City Name": current_city,
            "Temperature": temp_float,
            "Weather Condition": weather_condition
        })
        current_city = None
        weather_condition = ""

# Explicit columns keep the CSV header present even when nothing matched.
df = pd.DataFrame(results, columns=["City Name", "Temperature", "Weather Condition"])
df.to_csv('weather.csv', index=False)
