Optimize the code that removes unexpected text from failed subjects

In [1]:
import re
import csv
from PyPDF2 import PdfReader

# Strip unwanted "<word>_sub-" prefixes from an extracted subject string
def clean_subjects(subjects):
    """Return ``subjects`` with every ``<word>_sub-`` prefix removed.

    Args:
        subjects: Raw subject text captured from between the braces of a
            student entry.

    Returns:
        The text with each run of word characters ending in ``_sub-`` (plus
        any whitespace directly after it) deleted, and with leading/trailing
        whitespace trimmed.
    """
    # A word boundary, one or more word characters, the literal "_sub-",
    # then any whitespace that trails the marker.
    prefix_regex = re.compile(r'\b\w+_sub-\s*')
    without_prefixes = prefix_regex.sub('', subjects)
    return without_prefixes.strip()

# Function to extract student data from the PDF for a given semester
def extract_student_data_from_pdf(pdf_file, session_year, probidhan, semester_num):
    """Collect one record per ``roll { subjects }`` entry found in a result PDF.

    Every page's text is searched for a ``<5-digit code> - <name>`` polytechnic
    header and for ``<6-digit roll> { ... }`` student entries. The most recent
    header seen so far is attached to each student entry, so a page without a
    header reuses the one from an earlier page.

    Args:
        pdf_file: Passed unchanged to ``PdfReader``.
        session_year: Copied verbatim into every record.
        probidhan: Copied verbatim into every record.
        semester_num: 1-based semester number (1-8). The cleaned subject
            string is stored under that semester's column; the other seven
            semester columns hold the placeholder string ``"null"``.

    Returns:
        A list of dicts with the keys ``polytechnic_name``,
        ``polytechnic_code``, ``probidhan``, ``session_year``,
        ``student_roll`` and ``first_semester`` ... ``eighth_semester``.
        ``polytechnic_name`` / ``polytechnic_code`` are ``None`` for entries
        that appear before any polytechnic header has been seen.

    Raises:
        ValueError: If ``semester_num`` is not between 1 and 8. Without this
            check 0 and negative values would index from the end of the
            semester list and silently fill the wrong column.
    """
    semester_keys = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth"]
    if not 1 <= semester_num <= len(semester_keys):
        raise ValueError(
            f"semester_num must be between 1 and {len(semester_keys)}, got {semester_num!r}"
        )
    # The destination column is the same for every student, so resolve it once.
    target_column = f"{semester_keys[semester_num - 1]}_semester"

    # Regex patterns to capture relevant information (compiled once, reused per page)
    polytechnic_pattern = re.compile(r"(\d{5}) - ([A-Za-z\s,]+)")  # 'polytechnic_code - polytechnic_name'
    student_subject_pattern = re.compile(r"(\d{6})\s*{\s*([^}]*)}")  # 'student_roll { subjects }'

    # Start with "no header seen yet" so a student entry that precedes the
    # first header yields a record instead of raising UnboundLocalError.
    polytechnic_code = None
    polytechnic_name = None

    student_data = []
    reader = PdfReader(pdf_file)
    for page in reader.pages:
        # Treat a page that yields no text as empty rather than handing None
        # to the regex engine.
        text = page.extract_text() or ""

        # Update the current polytechnic when this page carries a header.
        polytechnic_match = polytechnic_pattern.search(text)
        if polytechnic_match:
            polytechnic_code = polytechnic_match.group(1)
            polytechnic_name = polytechnic_match.group(2).strip()

        # One record per 'roll { subjects }' entry on the page.
        for student_roll, subjects in student_subject_pattern.findall(text):
            # All eight semester columns default to the string 'null';
            # only the requested semester receives the cleaned subjects.
            semesters = {f"{key}_semester": "null" for key in semester_keys}
            semesters[target_column] = clean_subjects(subjects)

            student_data.append({
                "polytechnic_name": polytechnic_name,
                "polytechnic_code": polytechnic_code,
                "probidhan": probidhan,
                "session_year": session_year,
                "student_roll": student_roll,
                **semesters  # Store all semester data
            })

    return student_data

# Dump the extracted student records into a CSV file
def write_to_csv(file_path, student_data):
    """Write ``student_data`` to ``file_path`` as CSV with a fixed header row.

    Args:
        file_path: Destination path; the file is opened in ``'w'`` mode, so
            existing content is replaced.
        student_data: Iterable of dicts keyed by the column names listed
            below. Rows are handed to ``csv.DictWriter`` as-is.

    Returns:
        None.
    """
    identity_columns = ["polytechnic_name", "polytechnic_code", "probidhan", "session_year", "student_roll"]
    semester_columns = [
        "first_semester", "second_semester", "third_semester", "fourth_semester",
        "fifth_semester", "sixth_semester", "seventh_semester", "eighth_semester",
    ]

    # newline='' lets the csv module control line endings itself.
    with open(file_path, mode='w', newline='') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=identity_columns + semester_columns)
        csv_writer.writeheader()
        csv_writer.writerows(student_data)


In [18]:

# Example run: describe which result PDF to parse and how to label its rows
pdf_file = r'Test_Result\2022_Probidhan\Data\Session_2023_2024\Result_1st_2022_Regulation.pdf'  # Replace with your actual file path
session_year = '2023-2024'  # Session label stored in every record
probidhan = '2022'  # Probidhan label stored in every record
semester_num = 1  # 1-based semester whose column receives the subjects

# Parse the PDF into a list of per-student records
student_data = extract_student_data_from_pdf(
    pdf_file=pdf_file,
    session_year=session_year,
    probidhan=probidhan,
    semester_num=semester_num,
)


In [19]:

# Persist the records extracted above as a CSV file
csv_file_path = r'Test_Result\2022_Probidhan\Refard_CSV\2023_2024_first_semester.csv'  # Replace with your actual output file path
write_to_csv(file_path=csv_file_path, student_data=student_data)

# Confirm where the output landed
print(f"Data successfully written to {csv_file_path}")

Data successfully written to Test_Result\2022_Probidhan\Refard_CSV\2023_2024_first_semester.csv
