# Assignment 4 
**Name:** Bhanavi
**Roll No:** 102313054

#### Q1. Write a Python program to scrape all available books from the website Books to Scrape (https://books.toscrape.com/), a live site built for practicing scraping (safe, legal, no anti-bot measures). For each book, extract the following details:
##### 1. Title 
##### 2. Price 
##### 3. Availability (In stock / Out of stock) 
##### 4. Star Rating (One, Two, Three, Four, Five) 
##### Store the scraped results into a Pandas DataFrame and export them to a CSV file named books.csv.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin

# Root URL of the books site; the crawl below starts from this page and
# resolves each relative "next" link against the page it was found on.
BASE = "https://books.toscrape.com/"

def parse_book_block(article):
    """Extract one book's details from a product card element.

    Args:
        article: BeautifulSoup tag for a single book card (the caller passes
            elements selected with ``article.product_pod``).

    Returns:
        dict with the keys ``title``, ``price``, ``availability`` and
        ``star_rating``. ``availability`` and ``star_rating`` are ``None``
        when the card does not contain the corresponding element.
    """
    title = article.h3.a['title'].strip()
    price = article.find("p", class_="price_color").text.strip()

    # Match on the single "availability" class. Passing the full string
    # "instock availability" only matches that exact class attribute value,
    # so a card with any other class combination (e.g. an out-of-stock
    # variant) would return None and crash on ``.text``.
    availability_tag = article.find("p", class_="availability")
    availability = availability_tag.text.strip() if availability_tag else None

    # The rating is encoded as a second class next to "star-rating"
    # (e.g. "Three"); fall back to None instead of raising IndexError.
    rating_tag = article.find("p", class_="star-rating")
    rating_classes = rating_tag.get("class", []) if rating_tag else []
    star_rating = next((c for c in rating_classes if c != "star-rating"), None)

    return {"title": title, "price": price, "availability": availability, "star_rating": star_rating}

# Crawl the catalogue page by page, following the "next" pagination link
# until there is none, then export every collected book to books.csv.
books = []
page_url = BASE  # the crawl starts at the site root

# A Session reuses the underlying connection across the sequential requests.
with requests.Session() as session:
    while True:
        print(f"Fetching: {page_url}")
        # A timeout keeps a stalled connection from hanging the crawl forever.
        resp = session.get(page_url, timeout=10)
        resp.raise_for_status()
        # Parse the raw bytes so BeautifulSoup detects the encoding from the
        # document itself. ``resp.text`` falls back to ISO-8859-1 when the
        # response headers carry no charset, which garbles the "£" in prices.
        soup = BeautifulSoup(resp.content, "lxml")

        for art in soup.select("article.product_pod"):
            books.append(parse_book_block(art))

        next_link = soup.select_one("li.next > a")
        if not next_link:
            break

        # The href is relative to the current page, so resolve it against
        # page_url rather than BASE.
        page_url = urljoin(page_url, next_link['href'])
        time.sleep(0.2)  # small pause between requests to be polite

print(f"Total books scraped: {len(books)}")

df = pd.DataFrame(books)
df.to_csv("books.csv", index=False)
print("Saved books.csv")

Fetching: https://books.toscrape.com/
Fetching: https://books.toscrape.com/catalogue/page-2.html
Fetching: https://books.toscrape.com/catalogue/page-3.html
Fetching: https://books.toscrape.com/catalogue/page-4.html
Fetching: https://books.toscrape.com/catalogue/page-5.html
Fetching: https://books.toscrape.com/catalogue/page-6.html
Fetching: https://books.toscrape.com/catalogue/page-7.html
Fetching: https://books.toscrape.com/catalogue/page-8.html
Fetching: https://books.toscrape.com/catalogue/page-9.html
Fetching: https://books.toscrape.com/catalogue/page-10.html
Fetching: https://books.toscrape.com/catalogue/page-11.html
Fetching: https://books.toscrape.com/catalogue/page-12.html
Fetching: https://books.toscrape.com/catalogue/page-13.html
Fetching: https://books.toscrape.com/catalogue/page-14.html
Fetching: https://books.toscrape.com/catalogue/page-15.html
Fetching: https://books.toscrape.com/catalogue/page-16.html
Fetching: https://books.toscrape.com/catalogue/page-17.html
Fetching: 

#### Q2. Write a Python program to scrape the IMDB Top 250 Movies list (https://www.imdb.com/chart/top/). For each movie, extract the following details:
##### 1. Rank (1–250) 
##### 2. Movie Title 
##### 3. Year of Release 
##### 4. IMDB Rating 
##### Store the results in a Pandas DataFrame and export it to a CSV file named imdb_top250.csv.

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time

import re

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Candidate CSS selectors, tried in order. The table-based selectors used
# originally matched nothing on the live page (the recorded run saved 0 rows),
# so list-based selectors are tried first and the table ones kept as fallback.
# NOTE(review): IMDb's markup is undocumented and changes over time; confirm
# these selectors against the live DOM before relying on the output.
ROW_SELECTORS = (
    "li.ipc-metadata-list-summary-item",
    "table.chart.full-width tbody tr",
)
TITLE_SELECTORS = ("h3.ipc-title__text", "td.titleColumn a")
YEAR_SELECTORS = ("span.cli-title-metadata-item", "td.titleColumn span.secondaryInfo")
RATING_SELECTORS = (
    "span.ipc-rating-star--rating",
    "span.ipc-rating-star",
    "td.imdbRating strong",
)
PAGE_LOAD_TIMEOUT = 15  # seconds to wait for the chart rows to appear


def _first_text(element, selectors):
    """Return the stripped text of the first selector that matches, else None."""
    for selector in selectors:
        # find_elements returns an empty list instead of raising when
        # nothing matches, so a missing field does not abort the scrape.
        matches = element.find_elements(By.CSS_SELECTOR, selector)
        if matches:
            text = matches[0].text.strip()
            if text:
                return text
    return None


options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.imdb.com/chart/top/"
movies = []
try:
    driver.get(url)

    # Wait until at least one chart row is present instead of sleeping for a
    # fixed time, which is too short on a slow load and wasteful on a fast one.
    try:
        rows = WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, ", ".join(ROW_SELECTORS))
            )
        )
    except TimeoutException:
        rows = []

    for idx, row in enumerate(rows, start=1):
        title = _first_text(row, TITLE_SELECTORS)
        if title is None:
            continue  # not a movie row (or an unknown layout)

        # Presumably the list layout prefixes the title with its rank
        # ("1. Title"); drop that prefix when present.
        title = re.sub(r"^\d+\.\s+", "", title)

        year_text = _first_text(row, YEAR_SELECTORS) or ""
        year = year_text.strip("()")  # the table layout renders "(1994)"

        rating_text = _first_text(row, RATING_SELECTORS) or ""
        # Keep only the leading number in case a vote count follows it.
        rating = rating_text.split()[0] if rating_text else ""

        movies.append({"rank": idx, "title": title, "year": year, "imdb_rating": rating})
finally:
    # Always close the browser, even if the page load or a lookup fails.
    driver.quit()

if not movies:
    print("Warning: no movie rows matched; the page layout may have changed.")

df = pd.DataFrame(movies)
df.to_csv("imdb_top250.csv", index=False)
print("Saved imdb_top250.csv (rows: {})".format(len(df)))


Saved imdb_top250.csv (rows: 0)


#### Q3. Write a Python program to scrape the weather information for top world cities from the given website (https://www.timeanddate.com/weather/). For each city, extract the following details:
##### 1. City Name 
##### 2. Temperature 
##### 3. Weather Condition (e.g., Clear, Cloudy, Rainy, etc.) 
##### Store the results in a Pandas DataFrame and export it to a CSV file named weather.csv.

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import time

BASE = "https://www.timeanddate.com"
INDEX = "https://www.timeanddate.com/weather/"
N = 200  # upper bound on how many city pages are fetched
REQUEST_TIMEOUT = 8  # seconds, applied to every HTTP request
HEADING_PREFIX = "Weather in "


def _is_city_href(href):
    """Return True for hrefs shaped like ``/weather/<country>/<city>``."""
    if not href.startswith("/weather/"):
        return False
    parts = href.split('/')
    # parts[0] is "" (leading slash) and parts[1] is "weather"; require
    # non-empty country and city segments after them.
    return len(parts) >= 4 and bool(parts[2]) and bool(parts[3])


def _clean(text):
    """Collapse runs of whitespace (non-breaking spaces included) to one space."""
    return " ".join(text.split())


def _parse_city_page(page, link):
    """Build one output row from a parsed city page.

    Args:
        page: BeautifulSoup document of the city's weather page.
        link: URL the page was fetched from; used as the city-name fallback
            and stored in the row.

    Returns:
        dict with the keys ``city``, ``temperature``, ``condition``, ``url``.
        ``temperature`` and ``condition`` are None when not found.
    """
    h1 = page.find("h1")
    if h1:
        city_name = _clean(h1.text)
        # Presumably the heading reads "Weather in <City>, ..."; keep only
        # the place name when that prefix is present.
        if city_name.startswith(HEADING_PREFIX):
            city_name = city_name[len(HEADING_PREFIX):]
    else:
        city_name = link.split('/')[-1].replace('-', ' ').title()

    qlook = page.find(id="qlook")
    if qlook:
        temp_div = qlook.find(class_="h2")
        temp = _clean(temp_div.text) if temp_div else None

        cond_p = qlook.find("p")
        condition = _clean(cond_p.text) if cond_p else None
    else:
        # Fallback when the quick-look box is missing.
        # NOTE(review): find() returns the first div/span in document order
        # whose text contains "°", which may be an enclosing container.
        deg = page.find(lambda tag: tag.name in ["div", "span"] and "°" in tag.text)
        temp = _clean(deg.text) if deg else None

        meta_desc = page.find("meta", {"name": "description"})
        if meta_desc and "Weather" in meta_desc.get("content", ""):
            condition = _clean(meta_desc["content"])
        else:
            condition = None

    return {"city": city_name, "temperature": temp, "condition": condition, "url": link}


weather_rows = []
# A Session reuses the underlying connection across the sequential requests.
with requests.Session() as session:
    resp = session.get(INDEX, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    city_links = [
        urljoin(BASE, a['href'])
        for a in soup.find_all("a", href=True)
        if _is_city_href(a['href'])
    ]
    # dict.fromkeys drops duplicates while keeping first-seen order.
    city_links_unique = list(dict.fromkeys(city_links))

    print(f"Found {len(city_links_unique)} candidate city links on index page")

    city_links_unique = city_links_unique[:N]

    for idx, link in enumerate(city_links_unique, start=1):
        print(f"[{idx}/{len(city_links_unique)}] Fetching {link}")
        try:
            r = session.get(link, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
        except requests.RequestException as e:
            # Best effort: a city that cannot be fetched is skipped, but
            # programming errors are no longer hidden by a blanket except.
            print("  -> failed:", e)
        else:
            weather_rows.append(_parse_city_page(BeautifulSoup(r.text, "lxml"), link))
        time.sleep(0.2)  # small pause between requests to be polite

df_weather = pd.DataFrame(weather_rows)
df_weather.to_csv("weather.csv", index=False)
print("Saved weather.csv (rows: {})".format(len(df_weather)))


Found 142 candidate city links on index page
[1/142] Fetching https://www.timeanddate.com/weather/india/patiala
[2/142] Fetching https://www.timeanddate.com/weather/usa/new-york
[3/142] Fetching https://www.timeanddate.com/weather/uk/london
[4/142] Fetching https://www.timeanddate.com/weather/japan/tokyo
[5/142] Fetching https://www.timeanddate.com/weather/ghana/accra
[6/142] Fetching https://www.timeanddate.com/weather/malaysia/kuala-lumpur
[7/142] Fetching https://www.timeanddate.com/weather/ethiopia/addis-ababa
[8/142] Fetching https://www.timeanddate.com/weather/kuwait/kuwait-city
[9/142] Fetching https://www.timeanddate.com/weather/australia/adelaide
[10/142] Fetching https://www.timeanddate.com/weather/ukraine/kyiv
[11/142] Fetching https://www.timeanddate.com/weather/algeria/algiers
[12/142] Fetching https://www.timeanddate.com/weather/bolivia/la-paz
[13/142] Fetching https://www.timeanddate.com/weather/kazakstan/almaty
[14/142] Fetching https://www.timeanddate.com/weather/niger