In [46]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [47]:

BASE_URL = "https://books.toscrape.com/catalogue/page-{}.html"

books_data = []

for page in range(1, 51):
    url = BASE_URL.format(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    books = soup.find_all("article", class_="product_pod")

    for book in books:

        title = book.h3.a["title"]

        price = book.find("p", class_="price_color").text.strip()

        availability = book.find("p", class_="instock availability").text.strip()

        star_tag = book.find("p", class_="star-rating")
        star_rating = star_tag.get("class")[1] if star_tag else "No Rating"

        books_data.append({
            "Title": title,
            "Price": price,
            "Availability": availability,
            "Star Rating": star_rating
        })

print(f"Total books scraped: {len(books_data)}")


Total books scraped: 1000


In [48]:
df = pd.DataFrame(books_data)


df.to_csv("books.csv", index=False, encoding="utf-8")

df.head()


Unnamed: 0,Title,Price,Availability,Star Rating
0,A Light in the Attic,√Ç¬£51.77,In stock,Three
1,Tipping the Velvet,√Ç¬£53.74,In stock,One
2,Soumission,√Ç¬£50.10,In stock,One
3,Sharp Objects,√Ç¬£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,√Ç¬£54.23,In stock,Five


In [49]:
from google.colab import files
files.download("books.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [50]:
!pip install selenium pandas
!apt-get update
!apt-get install -y chromium-browser chromium-chromedriver


Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:7 http://security.ubuntu.com/ubuntu jammy-security InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 10.2 kB in 2s (5,903 B/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to pr

Question 2

In [51]:
!pip install selenium webdriver-manager pandas



In [52]:
import pandas as pd
import time
import sys
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import re
import requests
from bs4 import BeautifulSoup

print("All libraries imported successfully!")

os.environ['PATH'] += ':/usr/bin'


All libraries imported successfully!


In [54]:
def setup_driver():
    """Set up Chrome driver with Colab-specific options"""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-features=VizDisplayCompositor')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--disable-plugins')
    chrome_options.add_argument('--disable-images')
    chrome_options.add_argument('--disable-javascript')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36')

    service = Service('/usr/bin/chromedriver')
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)
    return driver








In [55]:
def scrape_imdb_top250():
    """Scrape IMDB Top 250 movies list"""
    url = "https://www.imdb.com/chart/top/"


    driver = setup_driver()

    try:
        print("Loading IMDB Top 250 page...")
        driver.get(url)

        wait = WebDriverWait(driver, 15)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "ipc-metadata-list-summary-item")))

        movie_containers = driver.find_elements(By.CLASS_NAME, "ipc-metadata-list-summary-item")

        movies_data = []

        print(f"Found {len(movie_containers)} movies. Starting extraction...")

        for i, container in enumerate(movie_containers, 1):
            try:

                rank = i

                title_element = container.find_element(By.CLASS_NAME, "ipc-title-link-wrapper")
                title_text = title_element.find_element(By.TAG_NAME, "h3").text

                title = re.sub(r'^\d+\.\s*', '', title_text)


                metadata_elements = container.find_elements(By.CLASS_NAME, "dli-title-metadata-item")

                year = None
                rating = None

                for element in metadata_elements:
                    text = element.text.strip()

                    if re.match(r'^\d{4}$', text):
                        year = int(text)
                        break


                try:
                    rating_element = container.find_element(By.CLASS_NAME, "ipc-rating-star--rating")
                    rating = float(rating_element.text)
                except (NoSuchElementException, ValueError):

                    try:
                        rating_container = container.find_element(By.CLASS_NAME, "ratingGroup--imdb-rating")
                        rating_text = rating_container.find_element(By.TAG_NAME, "span").text
                        rating = float(rating_text)
                    except (NoSuchElementException, ValueError):
                        rating = None


                movie_data = {
                    'Rank': rank,
                    'Title': title,
                    'Year': year,
                    'Rating': rating
                }

                movies_data.append(movie_data)
                if i % 25 == 0:
                    print(f"Extracted {i} movies...")

            except Exception as e:
                print(f"Error extracting data for movie {i}: {str(e)}")
                continue

        return movies_data

    except TimeoutException:
        print("Timeout: Page took too long to load")
        return []
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []
    finally:
        driver.quit()

print("Scraping function created!")


Scraping function created!


In [56]:
def save_to_csv(movies_data, filename="imdb_top250.csv"):
    """Save the scraped data to a CSV file"""
    if not movies_data:
        print("No data to save!")
        return None

    df = pd.DataFrame(movies_data)


    df = df.sort_values('Rank').reset_index(drop=True)


    df.to_csv(filename, index=False, encoding='utf-8')

    print(f"\n Data saved to {filename}")
    print(f"Total movies scraped: {len(df)}")

    return df

def display_summary(df):
    """Display summary of scraped data"""
    if df is None or df.empty:
        print("No data to display!")
        return

    print("\n" + "="*50)
    print("DATA SUMMARY")
    print("="*50)

    # Display first few rows
    print("\n First 10 movies:")
    print(df.head(10).to_string(index=False))

    # Display basic statistics
    print(f"\nüìä Statistics:")
    print(f"Year range: {df['Year'].min()} - {df['Year'].max()}")
    print(f"Rating range: {df['Rating'].min():.1f} - {df['Rating'].max():.1f}")
    print(f"Average rating: {df['Rating'].mean():.2f}")
    print(f"Movies with missing data: {df.isnull().any(axis=1).sum()}")

    # Top rated movies
    print(f"\n‚≠ê Top 5 highest rated movies:")
    top_rated = df.nlargest(5, 'Rating')[['Rank', 'Title', 'Year', 'Rating']]
    print(top_rated.to_string(index=False))

print("Data processing functions created!")


Data processing functions created!


In [61]:
print(" Starting IMDB Top 250 Movies Scraper")
print("="*40)

print("Attempting to scrape using Selenium...")
try:
    movies_data = scrape_imdb_top250()
except Exception as e:
    print(f"Selenium method failed: {str(e)}")
    movies_data = []

if movies_data:
  df = save_to_csv(movies_data)
  display_summary(df)
else:
  print("No data scraped. Exiting...")

 Starting IMDB Top 250 Movies Scraper
Attempting to scrape using Selenium...
Selenium method failed: Message: Service /usr/bin/chromedriver unexpectedly exited. Status code was: 1

No data scraped. Exiting...


In [66]:

df.to_csv("imdb_top250.csv", index=False, encoding="utf-8")

from google.colab import files
files.download("imdb_top250.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Question 3

In [67]:
!pip install requests beautifulsoup4 pandas




In [76]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [80]:
def scrape_world_weather():
    url = "https://www.timeanddate.com/weather/"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    weather_list = []

    city_blocks = soup.select("div.tb-scroll tbody tr")

    if not city_blocks:
        print("No city blocks found, scraping the popular cities manually")
        city_blocks = soup.select("div#qlook div")

    for block in city_blocks:
        try:
            # City name
            city_tag = block.find("a")
            if city_tag:
                city_name = city_tag.text.strip()
            else:
                continue

            temp_tag = block.find("div", class_="h2")
            temp = temp_tag.text.strip() if temp_tag else "N/A"

            cond_tag = block.find("p")
            cond = cond_tag.text.strip() if cond_tag else "N/A"

            weather_list.append({
                "City Name": city_name,
                "Temperature": temp,
                "Weather Condition": cond
            })
        except:
            continue

    return weather_list


In [78]:
data = scrape_world_weather()
df = pd.DataFrame(data)
df.to_csv("weather.csv", index=False, encoding="utf-8")
df.head(10)


No city blocks found, scraping the popular cities manually


In [79]:
from google.colab import files
files.download("weather.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>