In [20]:
import re
import csv
from PyPDF2 import PdfReader

# Function to extract student data from the PDF
def extract_student_data_from_pdf(pdf_file, session_year, probidhan, semester_num):
    """Extract one record per student result found in a result PDF.

    Every page's text is scanned in document order for two kinds of tokens:
    polytechnic headers (``<5-digit code> - <name>``) and student results
    (``<6-digit roll> ( <GPA> )``). Each student result is attributed to the
    most recent header seen before it; the current polytechnic carries over
    to following pages until another header appears.

    Args:
        pdf_file: Anything accepted by ``PdfReader`` (e.g. a file path).
        session_year: Copied unchanged into every record.
        probidhan: Copied unchanged into every record.
        semester_num: 1-based semester number (1..8); selects which
            ``*_semester`` key holds the GPA.

    Returns:
        A list of dicts with the keys ``polytechnic_name``,
        ``polytechnic_code``, ``probidhan``, ``session_year``,
        ``student_roll`` and one ``<nth>_semester`` key whose value is the
        GPA as the string captured from the PDF text. ``polytechnic_name``
        and ``polytechnic_code`` are ``None`` for results that appear before
        any polytechnic header has been seen.

    Raises:
        ValueError: If ``semester_num`` is outside 1..8.
    """
    # Define semester column names
    semester_names = [
        "first_semester", "second_semester", "third_semester", "fourth_semester",
        "fifth_semester", "sixth_semester", "seventh_semester", "eighth_semester"
    ]

    # Validate before touching the PDF: semester_num == 0 would otherwise
    # index -1 and silently file the GPA under "eighth_semester".
    if not 1 <= semester_num <= len(semester_names):
        raise ValueError(
            f"semester_num must be between 1 and {len(semester_names)}, got {semester_num!r}"
        )
    semester_column = semester_names[semester_num - 1]

    # Regex patterns to capture relevant information (compiled once, reused per page)
    polytechnic_re = re.compile(r"(\d{5}) - ([A-Za-z\s,]+)")  # 'polytechnic_code - polytechnic_name'
    student_result_re = re.compile(r"(\d{6})\s*\(\s*([\d\.]+)\s*\)")  # 'student_roll ( GPA )'

    reader = PdfReader(pdf_file)
    student_data = []

    # Current polytechnic; stays None until the first header is encountered so
    # that results preceding any header no longer raise UnboundLocalError.
    polytechnic_code = None
    polytechnic_name = None

    for page in reader.pages:
        # Guard against a page yielding no text at all.
        text = page.extract_text() or ""

        # Merge both kinds of matches and walk them in document order, so a
        # page listing several polytechnics attributes each student to the
        # header that actually precedes it.
        matches = sorted(
            list(polytechnic_re.finditer(text)) + list(student_result_re.finditer(text)),
            key=lambda match: match.start(),
        )

        for match in matches:
            if match.re is polytechnic_re:
                polytechnic_code = match.group(1)
                polytechnic_name = match.group(2).strip()
                continue

            student_roll, gpa = match.groups()
            student_data.append({
                "polytechnic_name": polytechnic_name,
                "polytechnic_code": polytechnic_code,
                "probidhan": probidhan,
                "session_year": session_year,
                "student_roll": student_roll,
                semester_column: gpa
            })

    return student_data

# Function to write the extracted data to a CSV file
def write_to_csv(file_path, student_data):
    """Write student records to a CSV file with one column per semester.

    The file is opened in write mode, so an existing file at ``file_path``
    is replaced. Semester columns absent from a record are written as the
    literal string ``"null"``. The records passed in are not modified.

    Args:
        file_path: Destination path of the CSV file.
        student_data: Iterable of dicts keyed by the header column names.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` if a record contains a key
            that is not one of the header columns.
    """
    identity_columns = ["polytechnic_name", "polytechnic_code", "probidhan", "session_year", "student_roll"]
    semester_columns = [
        "first_semester", "second_semester", "third_semester", "fourth_semester",
        "fifth_semester", "sixth_semester", "seventh_semester", "eighth_semester"
    ]
    header = identity_columns + semester_columns

    # Placeholder values for semesters a record does not carry.
    semester_defaults = dict.fromkeys(semester_columns, "null")

    # Open the CSV file for writing
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)

        # Write the header row
        writer.writeheader()

        for data in student_data:
            # Build a fresh row so the caller's dict is left untouched; keys
            # present in the record override the "null" placeholders.
            writer.writerow({**semester_defaults, **data})


In [36]:

# Example usage: inputs for a single extraction run.
# The path is a raw string with backslash separators; it is relative, so it is
# resolved against the kernel's current working directory.
pdf_file = r'Test_Result\2022_Probidhan\Data\Session_2023_2024\Result_1st_2022_Regulation.pdf'  # Replace with your actual file path
session_year = '2023-2024'  # Copied unchanged into every extracted record
probidhan = '2022'  # Copied unchanged into every extracted record
semester_num = 1 # 1-based semester number; 1 stores the GPA under "first_semester"

# Extract student data from the PDF; the result is a list of per-student dicts
# that the CSV-writing cell consumes.
student_data = extract_student_data_from_pdf(pdf_file, session_year, probidhan, semester_num)


In [37]:

# Write extracted data to a CSV file.
# write_to_csv opens the path in 'w' mode, so an existing file is overwritten.
# Relies on `student_data` produced by the extraction cell.
csv_file_path = r'Test_Result\2022_Probidhan\Regular_CSV\2023_2024_first_Semester.csv'
write_to_csv(csv_file_path, student_data)

# Reached only if write_to_csv returned without raising.
print(f"✅ Data successfully written to: {csv_file_path}")


✅ Data successfully written to: Test_Result\2022_Probidhan\Regular_CSV\2023_2024_first_Semester.csv
