In [2]:
import re

def extract_number_and_text(input_string):
    """Split a file name such as ``"12- BELGAUM DAKSHIN.pdf"`` into number and text.

    The name must start with digits, followed by one or more separator
    characters (whitespace or hyphens), then the text, then a ``.pdf`` suffix.

    Args:
        input_string: File name to parse. Leading/trailing whitespace is ignored.

    Returns:
        A ``(number, text)`` tuple of strings, e.g. ``("12", "BELGAUM DAKSHIN")``.
        Both items are ``None`` when the name does not follow the pattern.
    """
    # `[\s-]+` swallows the whole separator run, so "10- YAMAKNMARADI.pdf" no
    # longer leaves a leading space in the text. `\S` forces the text to start
    # on a real character, and the lazy `.*?` followed by `\s*` drops any
    # whitespace sitting right before the ".pdf" suffix.
    match = re.match(r"^(\d+)[\s-]+(\S.*?)\s*\.pdf$", input_string.strip())

    if match is None:
        return None, None

    return match.group(1), match.group(2)

In [3]:
import requests
from bs4 import BeautifulSoup

# URL of the page to be scraped
url = "https://ceo.karnataka.gov.in/304/_gallery_/en"

# Perform the HTTP request to get the page content.
# An explicit timeout is passed because requests otherwise waits indefinitely
# on an unresponsive server, which would hang the cell.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raises an HTTPError for bad responses

# Parse the HTML content of the page with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all 'a' tags with class 'filelist'
pdf_links = soup.find_all('a', class_='filelist')


# Maps the number taken from each PDF URL to the constituency id and name
# parsed from the link text.
mapping_dict = {}
# Loop over each link to extract the PDF names and the associated numbers
for pdf_link in pdf_links:
    pdf_url = pdf_link.get('href')  # None when the anchor has no href attribute
    if not pdf_url:
        # Without a URL there is no number to key the entry on, so skip the link
        continue

    pdf_name = pdf_link.text.strip()  # Get the text part and strip any extra whitespace
    pdf_number = pdf_url.split('/')[-1].replace('.pdf', '')  # Last path segment without '.pdf'
    print(f"{pdf_name} - {pdf_number}")

    # Both values are None when the link text does not look like "<number>-<name>.pdf"
    constituency_id, constituency_name = extract_number_and_text(pdf_name)

    mapping_dict[pdf_number] = {
        "constituency_id": constituency_id,
        "constituency_name": constituency_name
    }

1-NIPPANI.pdf - 13801687259760
2-CHIKKODI.pdf - 43171687259760
3-ATHANI.pdf - 97891687259760
4-KAGAWADA.pdf - 62171687259760
5-KUDACHI(SC).pdf - 65291687259760
6-RAYBHAG.pdf - 49771687259760
7-HUKKEREI.pdf - 30221687259760
8-ARABHVI.pdf - 88131687259760
9-GOKAK.pdf - 50901687259760
10- YAMAKNMARADI.pdf - 33821687259760
11-BELGAUM.pdf - 70841687259760
12- BELGAUM DAKSHIN.pdf - 47771687259760
13-BELGAUM RURAL.pdf - 77181687259760
14-KHANAPUR.pdf - 87491687259760
15-KITTUR.pdf - 38341687259760
16-BAILAHONGAL.pdf - 57251687259760
17-SAVADATTI YELLAMMA.pdf - 69561687259760
18-RAMDURG.pdf - 34361687259760
19-MUDHOL.pdf - 72851687259760
20-TERADAL.pdf - 26001687259760
21-JAMAKHANDI.pdf - 79501687260056
22-BILGI.pdf - 91121687260056
23-BADAMI.pdf - 89621687260056
24-BHAGALKOT.pdf - 73081687260056
25-HUNGUND.pdf - 21991687260056
26-MUDDEBIHAL.pdf - 59471687260056
27-DEVAR HIPPARGI.pdf - 19971687260056
28-BASAVANA BAGEVADI.pdf - 74871687260056
29-BABALESHWAR.pdf - 59871687260056
30-BIJAPUR CITY.

In [5]:
# dump this to json file

import json

# Serialise the mapping with 4-space indentation and write it out in one call
with open('constituency_mapping.json', 'w') as mapping_file:
    mapping_file.write(json.dumps(mapping_dict, indent=4))

In [None]:
import os
import requests
from bs4 import BeautifulSoup

# URL of the page to be scraped
url = "https://ceo.karnataka.gov.in/304/_gallery_/en"

# Directory containing the JSON files
json_directory = "/path/to/your/json/files"

# Perform the HTTP request to get the page content.
# An explicit timeout is passed because requests otherwise waits indefinitely
# on an unresponsive server, which would hang the cell.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content of the page with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all 'a' tags with class 'filelist'
pdf_links = soup.find_all('a', class_='filelist')

# Dictionary to hold the number as key and corresponding PDF name as value
pdf_mapping = {}

for pdf_link in pdf_links:
    pdf_url = pdf_link.get('href')  # None when the anchor has no href attribute
    if not pdf_url:
        # Without a URL there is no number to key the entry on, so skip the link
        continue
    pdf_name = pdf_link.text.strip()
    pdf_number = pdf_url.split('/')[-1].replace('.pdf', '')
    pdf_mapping[pdf_number] = pdf_name

# Iterate over files in the JSON directory
for file_name in os.listdir(json_directory):
    if not file_name.endswith('.json'):
        continue

    # The number is whatever follows the last underscore, minus the '.json' suffix
    number = file_name.split('_')[-1].replace('.json', '')

    # Files whose number is not present on the scraped page are left untouched
    if number not in pdf_mapping:
        continue

    # NOTE(review): only spaces are removed from the link text, so a text such as
    # "1-NIPPANI.pdf" yields "..._1-NIPPANI.pdf.json" - confirm the ".pdf" part is wanted.
    new_file_name = f"JSON_karnataka_AssemblyElection_2023_{pdf_mapping[number].replace(' ', '')}.json"
    original_path = os.path.join(json_directory, file_name)
    new_path = os.path.join(json_directory, new_file_name)

    # os.rename silently replaces an existing destination on POSIX systems,
    # so refuse to clobber a file that is already there.
    if os.path.exists(new_path):
        print(f"Skipped '{file_name}': '{new_file_name}' already exists")
        continue

    # Rename the file
    os.rename(original_path, new_path)
    print(f"Renamed '{file_name}' to '{new_file_name}'")
