# Column names for one student result row: identity fields, overall CGPA,
# the eight per-semester columns in ascending order, then the referred-subject
# column ("refard_subject" is the spelling used for that column name).
# NOTE(review): write_to_csv builds its own column list in a different order,
# and nothing visible in this file reads this module-level list.
header = [
    "polytechnic_name",
    "polytechnic_code",
    "probidhan",
    "session_year",
    "student_roll",
    "cgpa",
    "first_semester",
    "second_semester",
    "third_semester",
    "fourth_semester",
    "fifth_semester",
    "sixth_semester",
    "seventh_semester",
    "eighth_semester",
    "refard_subject",
]

In [20]:
import re
import csv
from PyPDF2 import PdfReader

# Per-semester result columns, first semester through eighth.
_SEMESTER_NAMES = (
    "first_semester", "second_semester", "third_semester", "fourth_semester",
    "fifth_semester", "sixth_semester", "seventh_semester", "eighth_semester",
)

# Placeholder stored for every grade that is absent or reported as "ref".
_NO_GRADE = "0.0"

# Institute line: "<5-digit code> - <name>".
_POLYTECHNIC_PATTERN = re.compile(r"(\d{5}) - ([A-Za-z\s,]+)")

# One student record: a 6-digit roll followed by optional "cgpa:", optional
# "gpaN:" items listed from gpa8 down to gpa1, and an optional "ref_sub:" list.
# Everything after the roll is optional, so any 6-digit run yields a match.
# NOTE(review): the ref_sub character class contains ")" and whitespace, so in
# a parenthesis-delimited record the capture runs past the closing ")" —
# confirm that records carrying ref_sub are always brace-delimited.
_STUDENT_RESULT_PATTERN = re.compile(
    r"(\d{6})\s*(?:cgpa:\s*([\d\.]+))?\s*[({]?\s*"
    r"(?:gpa8:\s*([\d\.]+|ref),\s*)?"
    r"(?:gpa7:\s*([\d\.]+|ref),\s*)?"
    r"(?:gpa6:\s*([\d\.]+|ref),\s*)?"
    r"(?:gpa5:\s*([\d\.]+|ref),\s*)?"
    r"(?:gpa4:\s*([\d\.]+|ref),\s*)?"
    r"(?:gpa3:\s*([\d\.]+|ref),\s*)?"
    r"(?:gpa2:\s*([\d\.]+|ref),\s*)?"
    r"(?:gpa1:\s*([\d\.]+|ref),\s*)?"
    r"(?:ref_sub:\s*([\dA-Za-z(),\s]*))?"
    r"[})]?"
)

# "Exceptional" record: a roll followed only by a braced subject list.
_EXCEPTIONAL_CASE_PATTERN = re.compile(r"(\d{6})\s*[{]([\dA-Za-z(),\s]*)[}]")


def _blank_student_entry(student_roll, polytechnic_name, polytechnic_code,
                         probidhan, session_year):
    """Return a result row for *student_roll* with every grade set to "0.0"."""
    entry = {
        "polytechnic_name": polytechnic_name,
        "polytechnic_code": polytechnic_code,
        "probidhan": probidhan,
        "session_year": session_year,
        "student_roll": student_roll,
        "cgpa": _NO_GRADE,
        "refard_subject": "",
    }
    entry.update(dict.fromkeys(_SEMESTER_NAMES, _NO_GRADE))
    return entry


def extract_student_data_from_pdf(pdf_file, session_year, probidhan):
    """Parse student results out of a result PDF into a list of row dicts.

    Each page's text is scanned for an institute line, for student records,
    and for "exceptional" records (roll followed by a braced subject list).
    The most recently seen institute code/name is attached to every student
    found from that point on, including on later pages.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts, or an already-open reader
            object exposing ``.pages`` whose items provide ``extract_text()``.
        session_year: Value copied verbatim into each row's "session_year".
        probidhan: Value copied verbatim into each row's "probidhan".

    Returns:
        A list of dicts, one per distinct roll in first-seen order, with the
        keys polytechnic_name, polytechnic_code, probidhan, session_year,
        student_roll, cgpa, refard_subject and the eight ``*_semester``
        columns. All grade values are strings; a grade that is absent from
        the record or reported as "ref" is stored as "0.0". If a roll is
        matched more than once, the later record replaces the earlier one.
    """
    # Accept a ready-made reader so callers can reuse one they already opened.
    reader = pdf_file if hasattr(pdf_file, "pages") else PdfReader(pdf_file)

    students = {}  # keyed by roll so each student appears once
    polytechnic_code, polytechnic_name = None, None

    for page in reader.pages:
        text = page.extract_text()
        if not text:
            continue

        polytechnic_match = _POLYTECHNIC_PATTERN.search(text)
        if polytechnic_match:
            polytechnic_code, polytechnic_name = polytechnic_match.groups()
            polytechnic_name = polytechnic_name.strip()

        # findall yields "" for every optional group that did not take part.
        for roll, cgpa, *gpas_desc, ref_subs in _STUDENT_RESULT_PATTERN.findall(text):
            entry = _blank_student_entry(
                roll, polytechnic_name, polytechnic_code, probidhan, session_year
            )
            if cgpa.strip():
                entry["cgpa"] = cgpa
            entry["refard_subject"] = ref_subs.strip()

            # Groups arrive as gpa8..gpa1; reverse them to line up with
            # first..eighth. Missing ("") and "ref" grades keep the "0.0"
            # placeholder instead of leaking an empty string into the row.
            for semester, gpa in zip(_SEMESTER_NAMES, reversed(gpas_desc)):
                if gpa and gpa != "ref":
                    entry[semester] = gpa

            students[roll] = entry

        for roll, ref_subs in _EXCEPTIONAL_CASE_PATTERN.findall(text):
            entry = students.get(roll)
            if entry is None:
                entry = students[roll] = _blank_student_entry(
                    roll, polytechnic_name, polytechnic_code, probidhan, session_year
                )
            # Replaces (does not append to) any previously stored subject list.
            entry["refard_subject"] = ref_subs.strip()

    return list(students.values())

def write_to_csv(file_path, student_data):
    """Write student result rows to a UTF-8 CSV file with a header row.

    Args:
        file_path: Destination path; the file is created or overwritten.
        student_data: Iterable of dicts keyed by the column names below.

    Columns are written as: identity fields, "cgpa", "refard_subject", then
    the eight semester columns from first to eighth.
    """
    leading_columns = [
        "polytechnic_name",
        "polytechnic_code",
        "probidhan",
        "session_year",
        "student_roll",
        "cgpa",
        "refard_subject",
    ]
    semester_columns = [
        "first_semester",
        "second_semester",
        "third_semester",
        "fourth_semester",
        "fifth_semester",
        "sixth_semester",
        "seventh_semester",
        "eighth_semester",
    ]
    columns = leading_columns + semester_columns

    # newline='' lets the csv module control line endings itself.
    with open(file_path, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        for row in student_data:
            writer.writerow(row)

# # Example Usage
# pdf_path = "RESULT__2020_2021_session_8th_2016_Regulation.pdf"
# csv_output_path = "extracted_results.csv"
# session_year = "2020-2021"
# probidhan = "2016"

# students = extract_student_data_from_pdf(pdf_path, session_year, probidhan)
# write_to_csv(csv_output_path, students)

# print("CSV file has been successfully created!")


In [21]:

# Example run: convert one result PDF into a CSV (paths are relative,
# Windows-style, and taken as-is).
probidhan = "2016"
session_year = "2020-2021"
pdf_path = r"2016 Probidhan\Data\2020-2021\RESULT__2020_2021_session_8th_2016_Regulation.pdf"
csv_output_path = r"CSV_Result\2025_result\APP_CSV\2020_2021_eighth_Semester_Updated.csv"

# Parse the PDF into per-student rows, then write those rows out.
students = extract_student_data_from_pdf(
    pdf_path,
    session_year=session_year,
    probidhan=probidhan,
)
write_to_csv(file_path=csv_output_path, student_data=students)

print("CSV file has been successfully created!")


CSV file has been successfully created!
