In [None]:
!pip install requests beautifulsoup4 pandas
import requests
from bs4 import BeautifulSoup
import csv

# Function to get the article details from the press release page
def get_article_details(url, timeout=30):
    """Fetch one press release page and extract its key fields.

    Args:
        url: Address of the press release page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the caller indefinitely.

    Returns:
        A 5-tuple ``(date_of_publication, headline, ministry,
        article_content, url)``. The date, headline and ministry are
        ``'N/A'`` when their element is not found on the page; the article
        content is ``''`` when no paragraphs are found.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the date of publication
    date_tag = soup.find('div', class_='ReleaseDateSubHeaddateTime text-center pt20')
    date_of_publication = date_tag.text.strip() if date_tag else 'N/A'

    # The headline and the body paragraphs live in the same container, so
    # look it up once. find() returns None when the div is absent, and
    # calling .find() on that None would raise AttributeError, so guard it.
    content_tag = soup.find('div', class_='innner-page-main-about-us-content-right-part')

    # Extract the headline
    headline_tag = content_tag.find('h2') if content_tag else None
    headline = headline_tag.text.strip() if headline_tag else 'N/A'

    # Extract the ministry
    ministry_tag = soup.find('div', class_='MinistryNameSubhead text-center')
    ministry = ministry_tag.text.strip() if ministry_tag else 'N/A'

    # Extract the article content: every paragraph's text, joined by spaces
    paragraphs = content_tag.find_all('p') if content_tag else []
    article_content = ' '.join(para.get_text(separator=' ', strip=True) for para in paragraphs)

    return date_of_publication, headline, ministry, article_content, url

# Function to generate the list of URLs based on the range of PRIDs
def generate_urls(start_prid, end_prid):
    """Return the press release URL for each PRID from start_prid to end_prid.

    Both ends of the range are included. The result is an empty list when
    end_prid is smaller than start_prid.
    """
    prefix = "https://pib.gov.in/PressReleasePage.aspx?PRID="
    return [prefix + str(release_id) for release_id in range(start_prid, end_prid + 1)]

# Main function to scrape the data and write to a CSV file
def scrape_pib_data(start_prid, end_prid, output_file):
    """Scrape every press release in a PRID range into a CSV file.

    The output file is overwritten. A header row is written first, then one
    row per page that was scraped successfully. A page that raises any
    exception is reported on stdout and skipped, so one bad page does not
    stop the run.

    Args:
        start_prid: First PRID to scrape.
        end_prid: Last PRID to scrape (included).
        output_file: Path of the CSV file to create.
    """
    header = ['Date of Publication', 'Headline', 'Ministry', 'Article Content', 'Article Link']
    targets = generate_urls(start_prid, end_prid)

    # newline='' leaves line-ending handling to the csv module
    with open(output_file, mode='w', newline='', encoding='utf-8') as csv_handle:
        rows = csv.writer(csv_handle)
        rows.writerow(header)

        for page_url in targets:
            try:
                published, title, ministry_name, body, link = get_article_details(page_url)
                rows.writerow([published, title, ministry_name, body, link])
                print(f"Scraped data from {page_url}")
            except Exception as err:
                # Best-effort: report the failure and move on to the next page
                print(f"Failed to scrape {page_url}: {err}")

# PRID range to scrape and the CSV file that receives the results
start_prid, end_prid = 1992019, 2016839
output_file = 'pib_data.csv'
# Kick off the scrape with the settings above
scrape_pib_data(start_prid=start_prid, end_prid=end_prid, output_file=output_file)
