In [1]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_data(url, data_type):
    data = []
    page = 1

    while True:
        print(f"Scraping page {page}...")
        response = requests.get(url.format(page=page))
        if response.status_code != 200:
            print("Failed to retrieve data. Exiting...")
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        if data_type == 'headlines':
            headlines = soup.find_all('h2')  # Adjust the tag based on the website structure
            for headline in headlines:
                data.append({'headline': headline.get_text(strip=True)})

        elif data_type == 'product_details':
            products = soup.find_all('div', class_='product')  # Adjust the class based on the website structure
            for product in products:
                title = product.find('h3').get_text(strip=True)
                price = product.find('span', class_='price').get_text(strip=True)
                data.append({'title': title, 'price': price})

        elif data_type == 'job_listings':
            jobs = soup.find_all('div', class_='job')  # Adjust the class based on the website structure
            for job in jobs:
                title = job.find('h3').get_text(strip=True)
                company = job.find('span', class_='company').get_text(strip=True)
                location = job.find('span', class_='location').get_text(strip=True)
                data.append({'title': title, 'company': company, 'location': location})

        else:
            print("Invalid data type specified. Exiting...")
            break

        # Check for pagination
        next_page = soup.find('a', class_='next')  # Adjust the class based on the website structure
        if next_page:
            page += 1
        else:
            break

    return data

def save_to_csv(data, filename):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Data saved to {filename}")

if __name__ == "__main__":
    url = input("Enter the URL (use {page} for pagination): ")
    data_type = input("Enter the type of data to scrape (headlines, product_details, job_listings): ")

    scraped_data = scrape_data(url, data_type)
    if scraped_data:
        save_to_csv(scraped_data, 'scraped_data.csv')

Enter the URL (use {page} for pagination):  https://www.amazon.in/
Enter the type of data to scrape (headlines, product_details, job_listings):  watch


Scraping page 1...
Failed to retrieve data. Exiting...


In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_data(url, data_type):
    data = []
    page = 1
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    while True:
        formatted_url = url.format(page=page)
        print(f"Scraping page {page}: {formatted_url}...")
        
        response = requests.get(formatted_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to retrieve data from {formatted_url}. Status Code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        if data_type == 'headlines':
            headlines = soup.find_all('h2')  # Adjust this based on the website
            for headline in headlines:
                data.append({'headline': headline.get_text(strip=True)})

        elif data_type == 'product_details':
            products = soup.find_all('div', class_='product')  # Adjust this
            for product in products:
                title = product.find('h3').get_text(strip=True) if product.find('h3') else "N/A"
                price = product.find('span', class_='price').get_text(strip=True) if product.find('span', class_='price') else "N/A"
                data.append({'title': title, 'price': price})

        elif data_type == 'job_listings':
            jobs = soup.find_all('div', class_='job')  # Adjust this
            for job in jobs:
                title = job.find('h3').get_text(strip=True) if job.find('h3') else "N/A"
                company = job.find('span', class_='company').get_text(strip=True) if job.find('span', class_='company') else "N/A"
                location = job.find('span', class_='location').get_text(strip=True) if job.find('span', class_='location') else "N/A"
                data.append({'title': title, 'company': company, 'location': location})

        else:
            print("Invalid data type specified. Exiting...")
            break

        # Handle pagination
        next_page = soup.find('a', class_='next')  # Adjust this
        if next_page:
            page += 1
        else:
            break

    return data

def save_to_csv(data, filename):
    if not data:
        print("No data found. Exiting...")
        return
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Data saved to {filename}")

if __name__ == "__main__":
    url = input("Enter the URL (use {page} for pagination): ")
    data_type = input("Enter the type of data to scrape (headlines, product_details, job_listings): ")

    scraped_data = scrape_data(url, data_type)
    save_to_csv(scraped_data, 'scraped_data.csv')

Enter the URL (use {page} for pagination):  https://remotive.io/remote-jobs?page=
Enter the type of data to scrape (headlines, product_details, job_listings):  job_listings


Scraping page 1: https://remotive.io/remote-jobs?page=...
Failed to retrieve data from https://remotive.io/remote-jobs?page=. Status Code: 403
No data found. Exiting...
