Optimized code that removes unexpected text from failed subjects

In [17]:
import re
import csv
from PyPDF2 import PdfReader

# Function to clean extracted subjects by removing unwanted prefixes
def clean_subjects(subjects):
    """Strip "<word>_sub-" prefixes from a raw subjects string.

    Every occurrence of a word followed by "_sub-" (plus any whitespace
    right after it) is deleted, then surrounding whitespace is trimmed.

    Args:
        subjects: Raw text captured from between the braces of a student entry.

    Returns:
        The text with the prefixes removed and outer whitespace stripped.
    """
    # One word, the literal "_sub-", and whatever whitespace trails it.
    prefix_re = re.compile(r'\b\w+_sub-\s*')
    without_prefixes = prefix_re.sub('', subjects)
    return without_prefixes.strip()

# Function to extract student data from the PDF for a given semester
def extract_student_data_from_pdf(pdf_file, session_year, probidhan, semester_num):
    """Build one record per student entry found in a result PDF.

    Each page is scanned for a polytechnic heading ("<5-digit code> - <name>")
    and for student entries ("<6-digit roll> { subjects }"). The subjects text
    is passed through clean_subjects and stored under the column selected by
    semester_num; every other semester column holds the string "null".

    NOTE: a later cell in this notebook defines another function with this
    name; the definition from whichever cell ran last is the one in effect.

    Args:
        pdf_file: Passed straight to PdfReader (the notebook passes a path string).
        session_year: Copied unchanged into every record.
        probidhan: Copied unchanged into every record.
        semester_num: 1-based semester number, 1 through 8.

    Returns:
        A list of dicts with the keys polytechnic_name, polytechnic_code,
        probidhan, session_year, student_roll and the eight
        "<ordinal>_semester" keys.

    Raises:
        ValueError: If semester_num is outside 1..8.
    """
    semester_keys = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth"]
    # Reject 0 and negatives explicitly: semester_keys[semester_num - 1] would
    # otherwise wrap around and silently fill a later semester column.
    if not 1 <= semester_num <= len(semester_keys):
        raise ValueError(
            f"semester_num must be between 1 and {len(semester_keys)}, got {semester_num!r}"
        )
    target_column = f"{semester_keys[semester_num - 1]}_semester"
    blank_semesters = {f"{key}_semester": "null" for key in semester_keys}

    # Regex patterns to capture relevant information
    polytechnic_pattern = r"(\d{5}) - ([A-Za-z\s,]+)"  # Matches 'polytechnic_code - polytechnic_name'
    student_subject_pattern = r"(\d{6})\s*{\s*([^}]*)}"  # Matches 'student_roll { subjects }'

    student_data = []
    # Start as None so a student entry that appears before any heading produces
    # a record with empty polytechnic fields instead of an UnboundLocalError.
    polytechnic_code = None
    polytechnic_name = None

    reader = PdfReader(pdf_file)
    for page in reader.pages:
        text = page.extract_text()
        if not text:
            # Nothing to scan on this page.
            continue

        # A page without a heading keeps the polytechnic of the previous page.
        polytechnic_match = re.search(polytechnic_pattern, text)
        if polytechnic_match:
            polytechnic_code = polytechnic_match.group(1)
            polytechnic_name = polytechnic_match.group(2).strip()

        for student_roll, subjects in re.findall(student_subject_pattern, text):
            semesters = dict(blank_semesters)
            # All subjects of this entry go to the requested semester column.
            semesters[target_column] = clean_subjects(subjects)

            student_data.append({
                "polytechnic_name": polytechnic_name,
                "polytechnic_code": polytechnic_code,
                "probidhan": probidhan,
                "session_year": session_year,
                "student_roll": student_roll,
                **semesters  # Store all semester data
            })

    return student_data

# Function to write the extracted data to CSV
def write_to_csv(file_path, student_data):
    """Write student records to a CSV file with a fixed column order.

    NOTE: later cells in this notebook redefine write_to_csv (one of them
    with a different parameter list); the definition from whichever cell
    ran last is the one in effect.

    Args:
        file_path: Destination path; an existing file is overwritten.
        student_data: Iterable of dicts keyed by the header names below.

    Raises:
        ValueError: Raised by csv.DictWriter when a record contains a key
            that is not in the header.
    """
    header = ["polytechnic_name", "polytechnic_code", "probidhan", "session_year", "student_roll"] + \
             ["first_semester", "second_semester", "third_semester", "fourth_semester",
              "fifth_semester", "sixth_semester", "seventh_semester", "eighth_semester"]

    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding, matching the pdfplumber-based write_to_csv in this notebook.
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        writer.writerows(student_data)


In [None]:

# Example usage
# Inputs for one extraction run; edit these four values to process a different result PDF.
pdf_file = r'2022 Probidhan\Data\Session_2022_2023\RESULT_2022_2023_4_session_4th_2022_Regulation.pdf'  # Replace with your actual file path
session_year = '2022-2023'  # Example session year
probidhan = '2022'  # Example probidhan
semester_num=4  # 1-based semester number; selects which *_semester column receives the subjects

# Extract the student data from the PDF
# NOTE(review): extract_student_data_from_pdf is defined in two cells of this notebook;
# this call uses whichever definition was executed last in the kernel.
student_data = extract_student_data_from_pdf(pdf_file, session_year, probidhan,semester_num)


In [14]:

# Write the extracted data to a CSV file
# NOTE(review): this file name says 2021_2022 / sixth semester, while the example-usage
# cell above sets session_year '2022-2023' and semester_num=4 — confirm which run this
# output belongs to before relying on the file.
csv_file_path = r'Test_Result\2025_result\Refard_Result\2021_2022_sixth_semester.csv'  # Replace with your actual output file path
# Uses the two-argument write_to_csv(file_path, student_data); another cell defines a
# five-argument write_to_csv with a different parameter order.
write_to_csv(csv_file_path, student_data)

print(f"Data successfully written to {csv_file_path}")

Data successfully written to Test_Result\2025_result\Refard_Result\2021_2022_sixth_semester.csv


Referred-subject extraction for the 2025 result

In [4]:
import pdfplumber
import re
import csv

# File paths
# pdf_path is the result PDF to parse; csv_path is the output file handed to write_to_csv.
# Both are relative paths written with backslash separators.
pdf_path = r"2022 Probidhan\Data\session_2021_2022\RESULT_2021_2022_6th_2022_Regulation.pdf"
csv_path = r"Test_Result\2025_result\Refard_Result\2021_2022_6th_semester.csv"

# Regex patterns (precompiled for better performance)
# Polytechnic heading: five digits, " - ", then the rest of the line (group 1 = code, group 2 = name).
polytechnic_pattern = re.compile(r"(\d{5}) - (.+)")
# Student entry: six-digit roll, exactly one space, then a "{...}" body (group 2).
# "." does not match newlines here (no re.DOTALL), so only entries whose braces
# open and close on the same line are matched.
student_pattern = re.compile(r"(\d{6}) \{(.*?)\}")
# Referred subjects: the run of digits, commas, parentheses, "T"/"P" and whitespace
# that follows "ref_sub:".
ref_subjects_pattern = re.compile(r"ref_sub:\s*([\d,()TP\s]+)")

# Function to extract data from PDF efficiently
def extract_data_from_pdf(pdf_path):
    """Extract [polytechnic_name, polytechnic_code, student_roll, ref_subjects] rows.

    The text of all pages is joined and scanned with the module-level
    polytechnic_pattern, student_pattern and ref_subjects_pattern. Each
    student entry is attributed to the closest polytechnic heading that
    precedes it in the text, so a PDF that lists several polytechnics no
    longer labels every student with the first heading. Entries that appear
    before any heading fall back to the first heading in the document
    (or None when there is no heading at all).

    Args:
        pdf_path: Path of the PDF, opened with pdfplumber.

    Returns:
        A list of four-item lists. ref_subjects is the text captured after
        "ref_sub:", or the whole entry body when it contains neither
        "ref_sub:" nor "gpa"/"GPA", or None otherwise.
    """
    # Call extract_text() once per page: the previous comprehension called it
    # twice (once for the filter, once for the value), doubling the extraction work.
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    full_text = "\n".join(page_texts)

    headings = list(polytechnic_pattern.finditer(full_text))
    polytechnic_name, polytechnic_code = None, None
    if headings:
        polytechnic_code = headings[0].group(1)
        polytechnic_name = headings[0].group(2).strip()
    next_heading = 0  # index of the first heading not yet passed

    data = []
    for student_match in student_pattern.finditer(full_text):
        # Advance to the last heading located before this student entry.
        while (next_heading < len(headings)
               and headings[next_heading].start() < student_match.start()):
            polytechnic_code = headings[next_heading].group(1)
            polytechnic_name = headings[next_heading].group(2).strip()
            next_heading += 1

        student_roll, details = student_match.groups()
        ref_subjects = None

        # Case 1: Extract referred subjects if ref_sub exists
        ref_match = ref_subjects_pattern.search(details)
        if ref_match:
            ref_subjects = ref_match.group(1).strip()
        elif not any(keyword in details for keyword in ["gpa", "GPA"]):
            # Case 2: Check if only referred subjects exist (no GPA)
            ref_subjects = details.strip()

        # Store extracted data
        data.append([polytechnic_name, polytechnic_code, student_roll, ref_subjects])

    return data

# Function to write data to CSV
def write_to_csv(data, csv_path, probidhan, session_year, semester_number):
    """Write extracted rows to a UTF-8 CSV, one line per student.

    Each row's referred subjects are placed in the column of the given
    semester; every other semester column (and the target column when the
    row has no referred subjects) holds the string "NULL".

    NOTE: other cells in this notebook define a two-argument write_to_csv;
    the definition from whichever cell ran last is the one in effect.

    Args:
        data: Iterable of [polytechnic_name, polytechnic_code, student_roll,
            ref_subjects] rows.
        csv_path: Destination path; an existing file is overwritten.
        probidhan: Written unchanged into every line.
        session_year: Written unchanged into every line.
        semester_number: 1-based semester number (anything int() accepts), 1 through 8.

    Raises:
        ValueError: If semester_number is outside 1..8.
    """
    header = ["polytechnic_name", "polytechnic_code", "probidhan", "session_year", "student_roll",
              "first_semester", "second_semester", "third_semester", "fourth_semester",
              "fifth_semester", "sixth_semester", "seventh_semester", "eighth_semester"]
    semester_count = 8

    # Validate before opening the file: an index of -1 (semester_number 0) would
    # otherwise silently land in the eighth-semester column, and failing here
    # avoids truncating an existing output file.
    semester_index = int(semester_number) - 1
    if not 0 <= semester_index < semester_count:
        raise ValueError(
            f"semester_number must be between 1 and {semester_count}, got {semester_number!r}"
        )

    with open(csv_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(header)

        for polytechnic_name, polytechnic_code, student_roll, ref_subjects in data:
            semester_data = ["NULL"] * semester_count  # Default all semesters to NULL
            if ref_subjects:
                semester_data[semester_index] = ref_subjects  # Assign referred subjects to the correct semester

            writer.writerow([polytechnic_name, polytechnic_code, probidhan, session_year, student_roll] + semester_data)


In [5]:

# Parameters
# Values written unchanged into the probidhan and session_year columns of every CSV line.
probidhan = "2022"
session_year = "2021-2022"
semester_number = 6  # Assign referred subjects to the 6th semester

# Run extraction and CSV generation
# pdf_path and csv_path come from the cell that defines the file paths and regex patterns.
data = extract_data_from_pdf(pdf_path)
# NOTE(review): needs the five-argument write_to_csv(data, csv_path, ...); a later cell
# redefines write_to_csv with two arguments, so this call fails if that cell ran last.
write_to_csv(data, csv_path, probidhan, session_year, semester_number)

print(f"Optimized CSV file saved: {csv_path}")


KeyboardInterrupt: 

In [18]:
import re
import csv
from PyPDF2 import PdfReader

# Function to extract only referred subjects
def extract_ref_subjects(subjects):
    """Return only the referred subjects contained in a student entry body.

    Args:
        subjects: Text captured from between the braces of a student entry.

    Returns:
        The stripped text after "ref_sub:" when that tag is present;
        otherwise the whole stripped text when it holds no "gpa<digit>: <x.y>"
        value; otherwise an empty string.
    """
    tagged = re.search(r'ref_sub:\s*([^}]*)', subjects)
    if tagged is not None:
        # Everything after the tag counts as referred subjects.
        return tagged.group(1).strip()

    # No tag: an entry without any GPA value is taken to be subjects only.
    has_gpa = re.search(r'gpa\d:\s*\d+\.\d+', subjects) is not None
    return "" if has_gpa else subjects.strip()

# Function to extract student data from the PDF

def extract_student_data_from_pdf(pdf_file, session_year, probidhan, semester_num):
    """Build one record per student entry, keeping only referred subjects.

    Each page is scanned for a polytechnic heading ("<5-digit code> - <name>")
    and for student entries ("<6-digit roll> { ... }"). The entry body is
    reduced by extract_ref_subjects and stored under the column selected by
    semester_num; empty results and all other semester columns hold "null".

    NOTE: an earlier cell in this notebook defines another function with this
    name; the definition from whichever cell ran last is the one in effect.

    Args:
        pdf_file: Passed straight to PdfReader (the notebook passes a path string).
        session_year: Copied unchanged into every record.
        probidhan: Copied unchanged into every record.
        semester_num: 1-based semester number, 1 through 8.

    Returns:
        A list of dicts with the keys polytechnic_name, polytechnic_code,
        probidhan, session_year, student_roll and the eight
        "<ordinal>_semester" keys.

    Raises:
        ValueError: If semester_num is outside 1..8.
    """
    semester_keys = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth"]
    # Reject 0 and negatives explicitly: semester_keys[semester_num - 1] would
    # otherwise wrap around and silently fill a later semester column.
    if not 1 <= semester_num <= len(semester_keys):
        raise ValueError(
            f"semester_num must be between 1 and {len(semester_keys)}, got {semester_num!r}"
        )
    target_column = f"{semester_keys[semester_num - 1]}_semester"
    blank_semesters = {f"{key}_semester": "null" for key in semester_keys}

    # Regex patterns
    polytechnic_pattern = r"(\d{5}) - ([A-Za-z\s,]+)"
    student_subject_pattern = r"(\d{6})\s*{\s*([^}]*)}"  # Matches 'student_roll { subjects }'

    student_data = []
    # Start as None so a student entry that appears before any heading produces
    # a record with empty polytechnic fields instead of an UnboundLocalError.
    polytechnic_code = None
    polytechnic_name = None

    reader = PdfReader(pdf_file)
    for page in reader.pages:
        text = page.extract_text()
        if not text:
            continue

        # A page without a heading keeps the polytechnic of the previous page.
        polytechnic_match = re.search(polytechnic_pattern, text)
        if polytechnic_match:
            polytechnic_code = polytechnic_match.group(1)
            polytechnic_name = polytechnic_match.group(2).strip()

        for student_roll, subjects in re.findall(student_subject_pattern, text):
            semesters = dict(blank_semesters)
            # An empty extraction result is stored as the "null" placeholder.
            semesters[target_column] = extract_ref_subjects(subjects) or "null"

            student_data.append({
                "polytechnic_name": polytechnic_name,
                "polytechnic_code": polytechnic_code,
                "probidhan": probidhan,
                "session_year": session_year,
                "student_roll": student_roll,
                **semesters
            })

    return student_data

# Function to write extracted data to CSV
def write_to_csv(file_path, student_data):
    """Write student records to a CSV file with a fixed column order.

    NOTE: earlier cells in this notebook also define write_to_csv (one of
    them with a different parameter list); the definition from whichever
    cell ran last is the one in effect.

    Args:
        file_path: Destination path; an existing file is overwritten.
        student_data: Iterable of dicts keyed by the header names below.

    Raises:
        ValueError: Raised by csv.DictWriter when a record contains a key
            that is not in the header.
    """
    header = ["polytechnic_name", "polytechnic_code", "probidhan", "session_year", "student_roll"] + \
             ["first_semester", "second_semester", "third_semester", "fourth_semester",
              "fifth_semester", "sixth_semester", "seventh_semester", "eighth_semester"]

    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding, matching the pdfplumber-based write_to_csv in this notebook.
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        writer.writerows(student_data)


In [22]:

# Example usage
# Inputs for one extraction run; edit these values to process a different result PDF.
pdf_file = r'2016 Probidhan\Data\2020-2021\RESULT__2020_2021_session_8th_2016_Regulation.pdf'  # Path to uploaded PDF
session_year = '2020-2021'
probidhan = '2016'
semester_num = 8  # 1-based semester number; selects the eighth_semester column

# Extract data
# NOTE(review): extract_student_data_from_pdf and write_to_csv are each defined in more
# than one cell of this notebook; these calls use whichever definitions ran last.
student_data = extract_student_data_from_pdf(pdf_file, session_year, probidhan, semester_num)

# Write data to CSV
csv_file_path = r'Test_Result\2025_result\Refard_Result\2020_2021_eighth_semester.csv'
write_to_csv(csv_file_path, student_data)

print(f"Data successfully written to {csv_file_path}")


Data successfully written to Test_Result\2025_result\Refard_Result\2020_2021_eighth_semester.csv
