In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time 

def extract_PRIDs_from_html(html_content):
    """Map each language link on a release page to the PRID it points to.

    Looks at every ``<a>`` element nested inside a
    ``<div class="ReleaseLang">`` container.

    Args:
        html_content: HTML markup of the page, as accepted by BeautifulSoup.

    Returns:
        A dict keyed by the stripped link text. Each value is the run of
        digits following ``PRID=`` in the link's markup (as a string), or
        ``None`` when the link contains no such pattern. If two links share
        the same text, the later one wins. Returns ``None`` instead of an
        empty dict when no links were found at all.
    """
    parsed_page = BeautifulSoup(html_content, 'html.parser')

    # Flatten "containers -> anchors" into a single stream, in page order.
    anchors = (
        anchor
        for container in parsed_page.find_all('div', class_='ReleaseLang')
        for anchor in container.find_all('a')
    )

    prid_by_language = {}
    for anchor in anchors:
        # The pattern is searched in the serialized tag, not just one attribute.
        found = re.search(r'PRID=(\d+)', str(anchor))
        prid_by_language[anchor.text.strip()] = found.group(1) if found else None

    return prid_by_language or None

# Crawl a contiguous range of PRIDs on pib.gov.in, record the language links
# found on each release page, and write the result to extracted_PRIDs.csv.
start_time = time.time()
base_url = 'https://pib.gov.in/PressReleasePage.aspx?PRID='
start_prid = 1900000
end_prid = 1900200
# Seconds to wait on a single request. Without a timeout, requests can block
# forever on a stalled connection and the whole crawl would hang.
request_timeout = 30

# Column-oriented accumulator; the three lists always grow in lockstep.
data = {'Main_PRID': [], 'Language': [], 'Extracted_PRID': []}

# PRIDs (as strings) that were either fetched directly or already seen as a
# language link of an earlier page; these are not fetched again.
all_extracted_PRIDs = set()

# A single Session reuses the underlying connection across requests instead
# of opening a new one for every page.
with requests.Session() as session:
    for prid_number in range(start_prid, end_prid + 1):
        prid = str(prid_number)
        if prid in all_extracted_PRIDs:
            continue
        all_extracted_PRIDs.add(prid)

        try:
            response = session.get(f'{base_url}{prid}', timeout=request_timeout)
        except requests.RequestException:
            # Treat a network failure like a non-200 response so that one bad
            # request does not discard everything collected so far.
            response = None

        extracted_PRIDs = None
        if response is not None and response.status_code == 200:
            extracted_PRIDs = extract_PRIDs_from_html(response.text)

        if extracted_PRIDs is None:
            # Fetch failed or the page had no language links: keep a
            # placeholder row so every attempted PRID appears in the output.
            data['Main_PRID'].append(prid)
            data['Language'].append('NaN')
            data['Extracted_PRID'].append('NaN')
            continue

        for language, extracted_prid in extracted_PRIDs.items():
            data['Main_PRID'].append(prid)
            data['Language'].append(language)
            data['Extracted_PRID'].append(extracted_prid)
            if extracted_prid is not None:
                all_extracted_PRIDs.add(extracted_prid)

df = pd.DataFrame(data)
df.to_csv('extracted_PRIDs.csv', index=False)

end_time = time.time()

# Total wall-clock duration of the crawl, in seconds.
print(end_time - start_time)

128.43269562721252
