In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL
BASE_URL = "https://books.toscrape.com/catalogue/page-{}.html"

# Data storage
books_data = []

# Loop through all 50 pages
for page in range(1, 51):
    url = BASE_URL.format(page)
    response = requests.get(url)

    # If page does not exist, stop scraping
    if response.status_code != 200:
        break

    soup = BeautifulSoup(response.text, "html.parser")

    # Find all book containers
    books = soup.find_all("article", class_="product_pod")

    for book in books:
        # Title
        title = book.h3.a["title"]

        # Price
        price = book.find("p", class_="price_color").text.strip()

        # Availability
        availability = book.find("p", class_="instock availability").text.strip()

        # Star Rating (class has "star-rating Three", etc.)
        star_class = book.find("p")["class"]
        star_rating = star_class[1] if len(star_class) > 1 else "No Rating"

        books_data.append([title, price, availability, star_rating])

# Convert to DataFrame
df = pd.DataFrame(books_data, columns=["Title", "Price", "Availability", "Star Rating"])

# Save to CSV
df.to_csv("books.csv", index=False)

print("✅ Scraping complete! Data saved to books.csv")


✅ Scraping complete! Data saved to books.csv


In [16]:
# Install required packages (run once in Jupyter)
# !pip install selenium pandas webdriver-manager

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.firefox.options import Options

# --- Step 1: Setup Selenium WebDriver (Firefox) ---
options = Options()
options.add_argument("--headless")  # Run without opening browser window

driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)

# --- Step 2: Open IMDb Top 250 Movies page ---
url = "https://www.imdb.com/chart/top/"
driver.get(url)
time.sleep(3)

# --- Step 3: Extract Movie Data ---
movies = driver.find_elements(By.CSS_SELECTOR, "li.ipc-metadata-list-summary-item")

data = []
for movie in movies:
    # Rank
    rank = movie.find_element(By.CSS_SELECTOR, ".ipc-title__text").text.split('.')[0]

    # Title
    title = movie.find_element(By.CSS_SELECTOR, "h3").text

    # Year
    year = movie.find_element(By.CSS_SELECTOR, ".cli-title-metadata-item").text

    # Rating
    rating = movie.find_element(By.CSS_SELECTOR, "span.ipc-rating-star--rating").text

    data.append([int(rank), title, year, float(rating)])

# --- Step 4: Store in Pandas DataFrame ---
df = pd.DataFrame(data, columns=["Rank", "Title", "Year", "IMDB Rating"])

# --- Step 5: Save to CSV ---
df.to_csv("imdb_top250.csv", index=False)

driver.quit()
print("Scraping complete! Data saved as imdb_top250.csv")
df.head()


ValueError: response body:
{"message":"API rate limit exceeded for 112.196.126.3. (But here's the good news: Authenticated requests get a higher rate limit. Check out the documentation for more details.)","documentation_url":"https://docs.github.com/rest/overview/resources-in-the-rest-api#rate-limiting"}

request url:
https://api.github.com/repos/mozilla/geckodriver/releases/latest
response headers:
{'Date': 'Wed, 03 Sep 2025 06:32:49 GMT', 'Server': 'Varnish', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-XSS-Protection': '1; mode=block', 'Content-Security-Policy': "default-src 'none'; style-src 'unsafe-inline'", 'Access-Control-Allow-Origin': '*', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-RateLimit-Used, X-RateLimit-Resource, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, Deprecation, Sunset', 'Content-Type': 'application/json; charset=utf-8', 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin', 'X-GitHub-Media-Type': 'github.v3; format=json', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1756881457', 'X-RateLimit-Resource': 'core', 'X-RateLimit-Used': '60', 'Content-Length': '279', 'X-GitHub-Request-Id': 'D248:316DBB:2E6051:3AFC6F:68B7E111'}


In [13]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

URL = "https://www.timeanddate.com/weather/"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

resp = requests.get(URL, headers=HEADERS, timeout=15)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")

# Pick the first <table> that contains a degree symbol — likely the weather table
table = next((t for t in soup.find_all("table") if "°" in t.get_text()), None)
if table is None:
    raise SystemExit("Could not find the weather table. Page layout may have changed.")

rows = table.find_all("tr")

records = []
for tr in rows:
    text = tr.get_text(" ", strip=True)
    if "°" not in text:           # skip non-weather rows
        continue

    # City: first <a> text in the row (fallback to first cell text)
    a = tr.find("a")
    city = a.get_text(strip=True) if a else (tr.find("td").get_text(strip=True) if tr.find("td") else "")

    # Temperature: find e.g. 24 °C or 24°C
    temp_match = re.search(r"(-?\d+)\s*°\s*([CF])", text, flags=re.I)
    temperature = f"{temp_match.group(1)} °{temp_match.group(2).upper()}" if temp_match else ""

    # Condition: prefer icon alt text, else look for common words
    img = tr.find("img", alt=True)
    if img and img["alt"].strip():
        condition = img["alt"].strip()
    else:
        cond_match = re.search(r"\b(Clear|Cloudy|Sunny|Rain|Showers|Snow|Mist|Fog|Overcast|Thunder|Windy)\b", text, flags=re.I)
        condition = cond_match.group(0) if cond_match else ""

    records.append({"City Name": city, "Temperature": temperature, "Weather Condition": condition})

df = pd.DataFrame(records)
df = df[df["City Name"].astype(bool)]   # drop any rows without a city name

print(f"Found {len(df)} rows. Sample:")
print(df.head(10).to_string(index=False))

df.to_csv("weather.csv", index=False)
print("Saved to weather.csv")


Found 70 rows. Sample:
  City Name Temperature       Weather Condition
      Accra       24 °C   Passing clouds. Mild.
Addis Ababa       15 °C              Fog. Cool.
   Adelaide       17 °C                   Cool.
    Algiers       24 °C   Passing clouds. Mild.
     Almaty       19 °C Scattered clouds. Mild.
      Amman       22 °C Scattered clouds. Mild.
  Amsterdam       17 °C   Passing clouds. Mild.
     Anadyr       11 °C     Partly sunny. Cool.
  Anchorage       13 °C            Clear. Cool.
     Ankara       21 °C            Sunny. Mild.
Saved to weather.csv
