# In [1]:
import os
import re
import pdfplumber
import spacy
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Load the spaCy pipeline once at import time; parse_resume() reuses this
# shared instance to run named-entity recognition on each resume's text.
nlp = spacy.load("en_core_web_sm")

# Regex patterns for extracting contact and academic details.
#
# EMAIL_REGEX: the domain must contain at least one dot-separated label and
# cannot end in a dot, so a sentence-ending period ("a@b.com.") is not
# swallowed and bare "user@host" fragments are not reported as emails.
EMAIL_REGEX = r"[a-zA-Z0-9+_.-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+"
# PHONE_REGEX: an optional leading "+" followed by 10-15 contiguous digits.
PHONE_REGEX = r"\+?[0-9]{10,15}"
# CGPA_REGEX: a grade keyword, any non-digit filler, then the number in
# group 1. The fractional part is only captured when digits follow the dot,
# so "CGPA: 9." yields "9" rather than "9.".
CGPA_REGEX = r"(?:CGPA|GPA|CPI|Percentage)[^0-9]*([0-9]+(?:\.[0-9]+)?)"

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with each page's text followed by a newline. Pages
        that yield no text contribute only the newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None (or "") for a page without a text
        # layer, e.g. a scanned image; `or ""` keeps such pages from raising
        # TypeError on concatenation.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

# Function to extract information from resume
# Keyword -> score tiers used by parse_resume(); a higher score is meant to
# indicate deeper experience.
_GEN_AI_TIERS = (("ChatGPT", 1), ("LLM", 2), ("RAG", 3))
_AI_ML_TIERS = (("Machine Learning", 1), ("Neural Network", 2), ("Transformer", 3))


def _highest_tier_score(text, tiers):
    """Return the largest score whose keyword occurs in text, or 0 if none do."""
    return max((score for keyword, score in tiers if keyword in text), default=0)


def _first_entity_text(entities, predicate, default="Unknown"):
    """Return the text of the first entity satisfying predicate, else default."""
    return next((ent.text for ent in entities if predicate(ent)), default)


def parse_resume(text):
    """Extract candidate details from the plain text of a resume.

    Contact and grade fields come from the module-level regex patterns; the
    remaining fields come from spaCy named entities and keyword checks.

    Args:
        text: Full resume text as a single string.

    Returns:
        A dict with the keys "Name", "Contact Details", "Email",
        "University", "Year of Study", "Course", "Discipline", "CGPA",
        "Key Skills", "Gen AI Experience Score" and "AI/ML Experience Score".
        Regex fields that do not match are "Not Found"; entity-based fields
        that do not match are "Unknown"; scores default to 0.
    """
    doc = nlp(text)
    entities = doc.ents

    # Prefer an entity labelled PERSON; the first entity of any label is
    # frequently an organisation or date. Fall back to the first entity so a
    # resume with no PERSON entity still gets the previous best guess.
    name = _first_entity_text(
        entities,
        lambda ent: ent.label_ == "PERSON",
        default=entities[0].text if entities else "Unknown",
    )

    email_match = re.search(EMAIL_REGEX, text)
    phone_match = re.search(PHONE_REGEX, text)
    cgpa_match = re.search(CGPA_REGEX, text)
    # First four-digit year of the form 20xx anywhere in the text.
    year_match = re.search(r"\b(20\d{2})\b", text)

    # Organisation entities are used as a rough stand-in for skills.
    skills = [ent.text for ent in entities if ent.label_ == "ORG"]
    university = _first_entity_text(entities, lambda ent: "University" in ent.text)
    course = _first_entity_text(
        entities, lambda ent: "Bachelor" in ent.text or "Master" in ent.text
    )
    discipline = _first_entity_text(
        entities, lambda ent: "Engineering" in ent.text or "Science" in ent.text
    )

    # Report the highest matching tier: a resume mentioning both "ChatGPT"
    # and "RAG" must score 3, not stop at the first (lowest) keyword found.
    gen_ai_score = _highest_tier_score(text, _GEN_AI_TIERS)
    ai_ml_score = _highest_tier_score(text, _AI_ML_TIERS)

    return {
        "Name": name,
        "Contact Details": phone_match.group() if phone_match else "Not Found",
        "Email": email_match.group() if email_match else "Not Found",
        "University": university,
        "Year of Study": year_match.group() if year_match else "Unknown",
        "Course": course,
        "Discipline": discipline,
        "CGPA": cgpa_match.group(1) if cgpa_match else "Not Found",
        "Key Skills": ", ".join(skills),
        "Gen AI Experience Score": gen_ai_score,
        "AI/ML Experience Score": ai_ml_score,
    }

# Function to process resumes in batch
def process_resumes(pdf_files):
    """Extract and parse a batch of resume PDFs.

    Text extraction is fanned out over a thread pool; parsing then runs
    sequentially in the calling thread, consuming texts in input order.

    Args:
        pdf_files: Iterable of PDF file paths.

    Returns:
        A list with one parse_resume() dict per input file, in input order.
        An empty input returns an empty list without starting a thread pool.
    """
    if not pdf_files:
        return []
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in the same order as pdf_files.
        texts = executor.map(extract_text_from_pdf, pdf_files)
        return [parse_resume(text) for text in texts]

# Function to save results to Excel
def save_to_excel(results, output_path):
    """Write parsed resume records to an Excel file and report the location.

    Args:
        results: Records to tabulate, passed directly to pandas.DataFrame.
        output_path: Destination path of the Excel file.
    """
    # index=False leaves the DataFrame's row index out of the sheet.
    pd.DataFrame(results).to_excel(output_path, index=False)
    print(f"Results saved to {output_path}")

# Main execution
def main():
    """Parse every PDF in the "resumes" folder and write the results to Excel."""
    input_folder = "resumes"  # Folder containing PDFs
    output_file = "resume_analysis.xlsx"

    # Match the extension case-insensitively so "CV.PDF" is not skipped, and
    # sort because os.listdir() returns entries in arbitrary order, which
    # would otherwise make the output row order vary between runs.
    pdf_files = sorted(
        os.path.join(input_folder, filename)
        for filename in os.listdir(input_folder)
        if filename.lower().endswith(".pdf")
    )
    results = process_resumes(pdf_files)
    save_to_excel(results, output_file)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

# Output: Results saved to resume_analysis.xlsx
