In [14]:
import pdfplumber
import re
import pandas as pd
import os
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation/runtime warnings coming from the
# libraries imported above — consider narrowing the filter to a category.
warnings.filterwarnings("ignore")

In [34]:


def check(subheading, subheadings):
    for sh in subheadings:
        if subheading.upper() == sh[:-1].upper() or subheading.upper() == sh.upper():
            return True
    return False

def categorize_text(text):
    """Split resume text into seven coarse categories based on section headings.

    Headings are recognised case-insensitively. Every known heading is also
    accepted with its final character dropped, so singular spellings such as
    "Project" or "Skill" match too. The first heading may be found anywhere in
    the text; every later heading must begin a line, because a section only
    ends at a newline that is immediately followed by a known heading (or at
    the end of the text).

    Args:
        text: Plain text of a resume.

    Returns:
        dict with the keys 'EXTRA', 'COURSEWORK', 'SKILLS', 'AWARDS',
        'COMPETITIONS', 'PROJECTS' and 'EDUCATION' (in that order). Each value
        holds the stripped body of the sections filed under that category;
        when several sections land in the same category their bodies are
        joined with a newline. Categories with no section map to ''.
    """
    # Known heading spellings, grouped by the category they are reported under.
    # Listed in lookup priority: if a spelling ever appears under two
    # categories, the one listed first wins.
    headings_by_category = {
        'EDUCATION': ["EDUCATION", "ACADEMIC PROFILES", "ACADEMIC QUALIFICATIONS", "SCHOLASTIC ACHIEVEMENTS"],
        'PROJECTS': ["INTERNSHIPS", "PROJECTS", "SELF PROJECTS", "INTERNSHIP & PROJECTS", "INTERNSHIPS AND PROJECTS",
                     "Professional Experiences", "Experiences", "WORK EXPERIENCES", "KEY PROJECTS",
                     "work background", "workshops", "trainings", "jobs", "PUBLICATIONS"],
        'COMPETITIONS': ["COMPETITION/CONFERENCES", "COMPETITIONS/CONFERENCES", "CERTIFICATIONS", "COMPETITIONS"],
        'AWARDS': ["AWARDS & ACHIEVEMENTS", "accomplishments", "ACHIEVEMENTS/AWARDS", "AWARDS AND ACHIEVEMENTS", "Awards and Honours"],
        'SKILLS': ["SKILLS AND EXPERTISE", "Skills", "Technical Skills", "Technical Proficiencies", "Relevant Skills and Coursework"],
        'COURSEWORK': ["COURSEWORK INFORMATION", "COURSES & CERTIFICATIONS", "RELEVANT COURSES"],
        'EXTRA': ["EXTRA CURRICULAR ACTIVITIES", "EXTRACURRICULAR ACTIVITIES", "EXTRA-CURRICULAR ACTIVITIES", "other activities", "POSITIONS OF RESPONSIBILITY"],
    }
    # Key order of the returned dict.
    features = ['EXTRA', 'COURSEWORK', 'SKILLS', 'AWARDS', 'COMPETITIONS', 'PROJECTS', 'EDUCATION']

    # Upper-cased heading variant -> category. Both the full heading and the
    # heading minus its last character are registered.
    category_of = {}
    for category, headings in headings_by_category.items():
        for heading in headings:
            for variant in (heading, heading[:-1]):
                if variant:  # an empty alternative would match everywhere
                    category_of.setdefault(variant.upper(), category)

    # Regex alternation takes the first alternative that matches, so the
    # longest spellings must come first; otherwise "INTERNSHIPS" would win over
    # "INTERNSHIPS AND PROJECTS" and leave " AND PROJECTS" in the section body.
    # Headings are escaped so they are always matched literally.
    alternation = "|".join(
        re.escape(variant) for variant in sorted(category_of, key=len, reverse=True)
    )
    # The look-ahead carries the same trailing \b as the heading match itself.
    # Without it, a line such as "Educational trip" would end the current
    # section yet never start a new one, and its text would be dropped.
    pattern = re.compile(
        rf"(?P<subheading>{alternation})\b(.*?)(?=\n(?:{alternation})\b|\Z)",
        re.S | re.IGNORECASE
    )

    sections = {feature: [] for feature in features}
    for match in pattern.finditer(text):
        content = match.group(2).strip()
        if not content:
            continue
        category = category_of.get(match.group("subheading").upper())
        if category is None:
            # Defensive: case-insensitive regex matching can accept spellings
            # that str.upper() does not fold to the listed heading. Skip such
            # a section instead of failing the whole resume with a KeyError.
            continue
        sections[category].append(content)

    # Newline-join so the last word of one section is not glued to the first
    # word of the next section in the same category.
    return {feature: "\n".join(parts) for feature, parts in sections.items()}

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Pages are joined with a newline so that the first line of a page is not
    glued to the last line of the previous one; pages that yield no text are
    skipped.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        str: The concatenated page texts ('' when no page yields any text).
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may give None (or '') for a page without a text
            # layer, e.g. a scanned image; `or ""` keeps that from raising a
            # TypeError during concatenation.
            page_text = page.extract_text() or ""
            if page_text:
                page_texts.append(page_text)
    return "\n".join(page_texts)

In [37]:
resumes_folder = "resumes"
# One record per resume: the source file name and the full extracted text.
data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of resumes.csv identical from run to run.
for filename in sorted(os.listdir(resumes_folder)):
    # Compare the extension case-insensitively so a file such as "CV.PDF" is
    # not silently skipped.
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(resumes_folder, filename)

        # Extract text from the PDF
        resume_text = extract_text_from_pdf(pdf_path)

        # CANDIDATE_NAME is the file name (extension included); RESUME is the
        # raw, uncategorized text of the whole document.
        resume_dict = {'CANDIDATE_NAME': filename}
        resume_dict['RESUME'] = resume_text
        # Append the dictionary to the data list
        data.append(resume_dict)

# Write all records in one go; index=False keeps the row index out of the CSV.
df = pd.DataFrame(data)
df.to_csv('resumes.csv', index=False)
