### GETTING EXAM CENTERS LIST

In [1]:
# To read filenames from the Folder
import os 
import shutil

# Web-scraping
import requests
from bs4 import BeautifulSoup

import pandas as pd # Handling Data

# PDF Handling
import fitz  # PyMuPDF

from tqdm import tqdm # Progressbar

In [2]:
def extract_table_from_url(url, div_class, timeout=30):
    """Scrape the table inside a given div on a web page into a DataFrame.

    Parameters
    ----------
    url : str
        Page to fetch with an HTTP GET request.
    div_class : str
        CSS class of the <div> that wraps the table.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook forever.

    Returns
    -------
    pandas.DataFrame or None
        One row per table row after the first (the first row is treated as
        the header row and skipped). Columns keep pandas' default integer
        labels. Returns None when the request does not answer with HTTP 200
        or when no div with ``div_class`` exists on the page.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is reported and treated as "no data"
    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Use the div the caller asked for (no second hard-coded lookup, and no
    # exit(): a helper must not terminate the kernel)
    table_div = soup.find('div', class_=div_class)
    if table_div is None:
        print(f"No div found with class '{div_class}' on the page.")
        return None

    # html.parser does not synthesise a <tbody>, so fall back to the div itself
    row_container = table_div.find('tbody')
    if row_container is None:
        row_container = table_div
    rows = row_container.find_all('tr')

    # Skip the first row (assumed to be the header row); slicing keeps an
    # empty table from raising IndexError
    data = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in rows[1:]
    ]

    return pd.DataFrame(data)

In [3]:
# Both articles wrap their centre table in a div with the same CSS class
div_class = 'table_responsive'

indian_centers_url = 'https://medicine.careers360.com/articles/neet-exam-centres'
foreign_centers_url = 'https://medicine.careers360.com/articles/neet-exam-centres-outside-india'

# Fetch the table for centres in India first, then the one for centres abroad
indian_centers_data = extract_table_from_url(indian_centers_url, div_class)
foreign_centers_data = extract_table_from_url(foreign_centers_url, div_class)

In [4]:
# Display the table scraped into foreign_centers_data
foreign_centers_data

Unnamed: 0,0,1,2
0,9901,Kuwait,Kuwait City
1,9902,United Arab Emirates,Dubai
2,9903,United Arab Emirates,Abu Dhabi
3,9904,Thailand,Bangkok
4,9905,Sri Lanka,Colombo
5,9906,Qatar,Doha
6,9907,Nepal,Kathmandu
7,9908,Malaysia,Kuala lumpur
8,9909,Nigeria,Lagos
9,9910,Bahrain,Manama


In [5]:
# Display the table scraped into indian_centers_data
indian_centers_data

Unnamed: 0,0,1,2,3,4
0,1,1101,Andaman & Nicobar Islands (UT),South Andaman,Port Blair
1,2,1211,Andhra Pradesh,Guntur,Amaravathi
2,3,1212,Andhra Pradesh,Anantapur,Anantapur
3,4,1213,Andhra Pradesh,West Godavari,Bhimavaram
4,5,1214,Andhra Pradesh,Prakasam,Chirala
...,...,...,...,...,...
549,550,4605,West Bengal,Hooghly,Hooghly
550,551,4606,West Bengal,Howrah,Howrah
551,552,4607,West Bengal,Paschim Medinipur,Kharagpur
552,553,4608,West Bengal,Kolkata,Kolkata


In [6]:
# Get Center IDs
# The ID sits in column 0 of the foreign table and column 1 of the Indian table;
# merge both columns into one sorted list.

center_ids = sorted(list(foreign_centers_data[0]) + list(indian_centers_data[1]))

In [7]:
# Check for errors: collect every center ID that is not exactly 4 characters long

error_ids = [center_id for center_id in center_ids if len(center_id) != 4]

error_ids

['37']

In [8]:
# Fix Errors

# 3704 is the ID for Puducherry (UT) Karaikal.
# Guarded so that re-running this cell after the fix is a no-op instead of
# raising ValueError from list.index('37').
if '37' in center_ids:
    center_ids[center_ids.index('37')] = '3704'

### Now, run 'multiprocessing.py' before proceeding to the next section.

### DOWNLOADING RESULTS PDF (Alternate Download for Verification)

In [None]:
# NON-PARALLEL METHOD
# NOTE(review): download_pdf_from_url is not defined anywhere in this notebook;
# it has to be defined or imported before this cell can run - confirm its source.
url_base = 'https://neetfs.ntaonline.in/NEET_2024_Result/'

save_path = 'PDF_DATA/'  # Replace with your desired save path

# Iterate the list built above (center_ids), one center ID at a time
for center_id in tqdm(center_ids):
    # Try the 100 file numbers <center_id>00 .. <center_id>99 for this center
    first_file_no = int(center_id) * 100
    for i in range(first_file_no, first_file_no + 100):
        url = f"{url_base}{i}.pdf"
        download_pdf_from_url(url, save_path)

### CHECKING FOR MISSING FILES

In [9]:
def list_files_in_folder(folder):
    """Collect the bare file names found anywhere under ``folder``.

    The tree is walked recursively; only the file name (not its directory)
    is kept, so equally named files in different sub-folders collapse into
    a single entry. Returns a set of strings.
    """
    return {
        name
        for _dirpath, _subdirs, names in os.walk(folder)
        for name in names
    }

def compare_folders(folder1, folder2):
    """Report which file names exist in only one of two folders.

    Returns a pair ``(only_in_folder1, only_in_folder2)`` of sets of file
    names. If anything goes wrong while listing or comparing, the error is
    printed and a pair of empty sets is returned instead.
    """
    try:
        names_first = list_files_in_folder(folder1)
        names_second = list_files_in_folder(folder2)
        return names_first - names_second, names_second - names_first
    except Exception as e:
        print(f"Error while comparing folders: {str(e)}")
        return set(), set()

# Compare the two download folders and report what each one is missing
folder1_path, folder2_path = 'PDF_DATA', 'PDF_DATA_NEW'

only_in_folder1, only_in_folder2 = compare_folders(folder1_path, folder2_path)

print(f"Files only in Folder 1: {only_in_folder1}")
print(f"Files only in Folder 2: {only_in_folder2}")

Files only in Folder 1: {'990201.pdf', '990601.pdf', '990301.pdf', '991101.pdf', '990101.pdf', '991001.pdf', '991201.pdf', '990401.pdf', '990901.pdf', '991301.pdf', '990501.pdf', '990701.pdf', '990801.pdf', '991401.pdf'}
Files only in Folder 2: {'200223.pdf', '200216.pdf', '200218.pdf', '462003.pdf', '200214.pdf', '200217.pdf', '200221.pdf', '313502.pdf', '200220.pdf', '462001.pdf', '200204.pdf', '200209.pdf', '200205.pdf', '200213.pdf', '200215.pdf', '200203.pdf', '200219.pdf', '200208.pdf', '362701.pdf', '462002.pdf', '200210.pdf', '200224.pdf', '200207.pdf', '313503.pdf', '200212.pdf', '313501.pdf', '313504.pdf', '200201.pdf', '313505.pdf', '200222.pdf', '200202.pdf', '200206.pdf', '462004.pdf', '200211.pdf'}


In [10]:
# Copy missing files to the folder with more files

def copy_files(source_folder, dest_folder, file_list):
    """Copy the named files from ``source_folder`` into ``dest_folder``.

    Parameters
    ----------
    source_folder : str
        Folder that holds the files to copy.
    dest_folder : str
        Target folder; created if it does not exist yet.
    file_list : iterable of str
        File names (relative to ``source_folder``) to copy.

    Errors are printed rather than raised. A failure on one file no longer
    stops the remaining files from being copied: each copy is attempted
    independently.
    """
    try:
        os.makedirs(dest_folder, exist_ok=True)
    except Exception as e:
        # Without a destination folder nothing can be copied
        print(f"Error while copying files: {str(e)}")
        return

    for file in file_list:
        source_file = os.path.join(source_folder, file)
        dest_file = os.path.join(dest_folder, file)
        try:
            # copy2 also carries over file metadata such as timestamps
            shutil.copy2(source_file, dest_file)
        except Exception as e:
            print(f"Error while copying files: {str(e)}")
            continue
        print(f"Copied {file} from {source_folder} to {dest_folder}")

# Copy the files that exist only in folder 1 over to folder 2
copy_files(source_folder=folder1_path, dest_folder=folder2_path, file_list=only_in_folder1)

Copied 990201.pdf from PDF_DATA to PDF_DATA_NEW
Copied 990601.pdf from PDF_DATA to PDF_DATA_NEW
Copied 990301.pdf from PDF_DATA to PDF_DATA_NEW
Copied 991101.pdf from PDF_DATA to PDF_DATA_NEW
Copied 990101.pdf from PDF_DATA to PDF_DATA_NEW
Copied 991001.pdf from PDF_DATA to PDF_DATA_NEW
Copied 991201.pdf from PDF_DATA to PDF_DATA_NEW
Copied 990401.pdf from PDF_DATA to PDF_DATA_NEW
Copied 990901.pdf from PDF_DATA to PDF_DATA_NEW
Copied 991301.pdf from PDF_DATA to PDF_DATA_NEW
Copied 990501.pdf from PDF_DATA to PDF_DATA_NEW
Copied 990701.pdf from PDF_DATA to PDF_DATA_NEW
Copied 990801.pdf from PDF_DATA to PDF_DATA_NEW
Copied 991401.pdf from PDF_DATA to PDF_DATA_NEW


In [11]:
def find_duplicate_files(folder):
    """Return file names that occur more than once anywhere under ``folder``.

    The folder is walked recursively and files are matched by name only.
    Each repeated name is listed once, in the order its first repeat is
    encountered during the walk.
    """
    seen_names = set()
    repeated = []

    for _dirpath, _subdirs, names in os.walk(folder):
        for name in names:
            if name not in seen_names:
                seen_names.add(name)
            elif name not in repeated:
                repeated.append(name)

    return repeated

# Look for repeated file names inside the original download folder
folder_path = 'PDF_DATA'
duplicates = find_duplicate_files(folder_path)

if not duplicates:
    print("No duplicate files found.")
else:
    print("Duplicate files found:")
    for file in duplicates:
        print(file)

No duplicate files found.


In [12]:
def delete_non_pdf_files(folder):
    """Delete every file under ``folder`` whose name does not end in ``.pdf``.

    The folder is walked recursively. The extension check is
    case-insensitive, so files such as ``REPORT.PDF`` are kept rather than
    deleted. Each deletion is printed; a failed deletion is printed and
    skipped so the remaining files are still processed.
    """
    for root, _dirs, files in os.walk(folder):
        for file in files:
            # Compare in lower case: '.PDF' / '.Pdf' are PDFs too
            if file.lower().endswith(".pdf"):
                continue

            file_path = os.path.join(root, file)
            try:
                os.remove(file_path)
                print(f"Deleted: {file_path}")
            except Exception as e:
                print(f"Failed to delete {file_path}: {e}")

# Remove every non-PDF file from PDF_DATA_NEW (this deletes files from disk)
folder_path = 'PDF_DATA_NEW'

delete_non_pdf_files(folder_path)

### PARSING & MERGING ALL PDF DATA

In [13]:
def get_data(pdf_file):
    """Extract the text of every page of a PDF.

    Parameters
    ----------
    pdf_file : str
        Path of the PDF to open with PyMuPDF.

    Returns
    -------
    list of str
        One entry per page, in page order, holding that page's text.
    """
    doc = fitz.open(pdf_file)
    try:
        return [
            doc.load_page(page_num).get_text()
            for page_num in range(len(doc))
        ]
    finally:
        # Always release the document, even if a page fails to load or parse
        doc.close()


def remove_column_name(content):
    """Split page text into lines and strip out the table scaffolding.

    Every line that is exactly 'Srlno. Marks' (the repeated column title)
    is dropped, then the first and last remaining lines are discarded.
    Returns the surviving lines as a list of strings.
    """
    lines = [line for line in content.split('\n') if line != 'Srlno. Marks']
    return lines[1:-1]


def parse_header(header):
    """Parse the header text of a result page into its center fields.

    Parameters
    ----------
    header : str
        Header text with newline-separated lines. The first line carries the
        center ID (optionally prefixed with 'Centre:'), the third line the
        center name; city and state are the last two comma-separated fields
        of the text from the third line onward.

    Returns
    -------
    dict
        Keys 'center_id', 'center_name', 'center_city', 'center_state'.

    Raises
    ------
    ValueError
        If fewer than two comma-separated fields are available for city and
        state (raised by the tuple unpacking).
    """
    header = header.split('\n')

    center_city, center_state = [field.strip() for field in "".join(header[2:]).split(',')[-2:]]

    # Remove the literal 'Centre:' label as a prefix. str.strip('Centre: ')
    # would instead strip any of those characters from both ends of the
    # string, which can eat into the ID itself.
    center_id = header[0].strip()
    label = 'Centre:'
    if center_id.startswith(label):
        center_id = center_id[len(label):].strip()

    header_data = {
        'center_id': center_id,
        'center_name': header[2],
        'center_city': center_city,
        'center_state': center_state
    }

    return header_data


def parse_data(data):
    """Turn the per-page text of one PDF into header fields and a marks map.

    Each page is split on the 'NEET (UG) 2024' marker: the part before it is
    parsed as the header, the part after it as alternating serial-number /
    marks lines. Returns ``(header_data, marks_data)`` where ``header_data``
    comes from the last page processed and ``marks_data`` maps
    int serial number -> int marks across all pages.
    """
    marks_data = {}

    for page in data:
        sections = page.split('NEET (UG) 2024')

        header_data = parse_header(sections[0])
        entries = remove_column_name(sections[1])

        # Entries come in (serial number, marks) pairs; a trailing unpaired
        # line is ignored
        for pos in range(0, len(entries) - 1, 2):
            marks_data[int(entries[pos])] = int(entries[pos + 1])

    return header_data, marks_data


def create_table(header_data, marks_data):
    """Build a DataFrame with one row per (serial number, score) pair.

    The four center fields from ``header_data`` are repeated on every row;
    'serial_no' and 'score' come from the keys and values of ``marks_data``.
    Column order: center_id, center_name, center_city, center_state,
    serial_no, score.
    """
    row_count = len(marks_data)

    # Center fields first, each repeated once per row
    columns = {
        field: [header_data[field]] * row_count
        for field in ('center_id', 'center_name', 'center_city', 'center_state')
    }
    columns['serial_no'] = list(marks_data)
    columns['score'] = list(marks_data.values())

    return pd.DataFrame(columns)

In [14]:
def get_file_names(directory):
    """List the names of the regular files directly inside ``directory``.

    Sub-directories are skipped and not descended into. Names are returned
    in the order ``os.listdir`` yields them.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

# List the regular files in PDF_DATA_NEW; pdf_files is iterated by the parsing loop
directory_path = 'PDF_DATA_NEW'  # Replace with your directory path
pdf_files = get_file_names(directory_path)

In [15]:
data_path = "PDF_DATA_NEW/"

# Parse every PDF into its own table and concatenate once at the end.
# Calling pd.concat inside the loop copies all accumulated rows on every
# iteration, which gets slower and slower as the number of files grows.
data_tables = []

for pdf_file in pdf_files:
    data = get_data(data_path + pdf_file)
    header_data, marks_data = parse_data(data)
    data_tables.append(create_table(header_data, marks_data))

# pd.concat raises on an empty list, so fall back to an empty frame
all_data = pd.concat(data_tables, ignore_index=True) if data_tables else pd.DataFrame()

all_data

Unnamed: 0,center_id,center_name,center_city,center_state,serial_no,score
0,272807,"KASTURBA GIRLS PU COLLEGE, KASTURBA GIRLS PU C...",SHIVAMOGA (SHIMOGA),KARNATAKA,1,42
1,272807,"KASTURBA GIRLS PU COLLEGE, KASTURBA GIRLS PU C...",SHIVAMOGA (SHIMOGA),KARNATAKA,2,172
2,272807,"KASTURBA GIRLS PU COLLEGE, KASTURBA GIRLS PU C...",SHIVAMOGA (SHIMOGA),KARNATAKA,3,42
3,272807,"KASTURBA GIRLS PU COLLEGE, KASTURBA GIRLS PU C...",SHIVAMOGA (SHIMOGA),KARNATAKA,4,222
4,272807,"KASTURBA GIRLS PU COLLEGE, KASTURBA GIRLS PU C...",SHIVAMOGA (SHIMOGA),KARNATAKA,5,129
...,...,...,...,...,...,...
2333157,250301,"AYESHA ALI ACADEMY, KANIPORA KULGAM, J&K, ANAN...",ANANTNAG,JAMMU & KASHMIR,555,137
2333158,250301,"AYESHA ALI ACADEMY, KANIPORA KULGAM, J&K, ANAN...",ANANTNAG,JAMMU & KASHMIR,556,83
2333159,250301,"AYESHA ALI ACADEMY, KANIPORA KULGAM, J&K, ANAN...",ANANTNAG,JAMMU & KASHMIR,557,513
2333160,250301,"AYESHA ALI ACADEMY, KANIPORA KULGAM, J&K, ANAN...",ANANTNAG,JAMMU & KASHMIR,558,135


In [17]:
# Write the merged table to all_data.csv; index=False leaves the row index out of the file
all_data.to_csv("all_data.csv", index=False)