# In [1]:
import spacy
from PyPDF2 import PdfReader
import os
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  

# In [10]:
# **1. Preprocessing**
def extract_text_from_pdf(pdf_path):
    """Extract the raw text of every page of a PDF resume.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with a newline. A page for which the
        reader yields nothing contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # Pages are read while the file handle is still open, as before.
        # `or ""` guards against a page that yields None instead of a string.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next; later whitespace normalisation collapses it.
    return "\n".join(page_texts)

def preprocess_text(text):
    """Return *text* lowercased, with each whitespace run replaced by one space.

    Leading and trailing whitespace is collapsed to a single space, not
    removed.
    """
    lowered = text.lower()
    return re.sub(r'\s+', ' ', lowered)

def create_features(resumes, vectorizer=None):
    """Fit a text vectoriser on the resumes and return the feature matrix.

    Args:
        resumes: Iterable of dicts, each holding the document under ``'text'``.
        vectorizer: Optional vectoriser exposing ``fit_transform`` and
            ``get_feature_names_out`` (for example a ``TfidfVectorizer`` or a
            pre-configured ``CountVectorizer``). Passing one lets the caller
            keep the fitted instance to transform new documents later.
            Defaults to a fresh ``CountVectorizer()``.

    Returns:
        A ``(features, feature_names)`` tuple: the result of
        ``fit_transform`` and of ``get_feature_names_out``.
    """
    if vectorizer is None:
        vectorizer = CountVectorizer()
    documents = [resume['text'] for resume in resumes]
    features = vectorizer.fit_transform(documents)
    return features, vectorizer.get_feature_names_out()

def extract_skills(text, nlp, skill_keywords=None):
    """Collect skills mentioned in *text* via NER and keyword matching.

    Args:
        text: Resume text to search.
        nlp: Callable that takes the text and returns an object whose
            ``ents`` have ``text`` and ``label_`` attributes (e.g. a spaCy
            pipeline). Only entities labelled ``'SKILL'`` are used.
        skill_keywords: Optional iterable of keyword strings or phrases to
            look for. Non-string or blank entries are ignored. Defaults to a
            small built-in list that should be extended for real use.

    Returns:
        A sorted list of unique skills, spelled as they appear in *text*
        (whitespace inside multi-word matches is normalised to one space).
    """
    if skill_keywords is None:
        skill_keywords = ("python", "java", "data analysis", "machine learning")

    # NER for skill-like entities.
    found = {ent.text for ent in nlp(text).ents if ent.label_ == 'SKILL'}

    # Keyword matching: case-insensitive and phrase-aware. Matching tokens
    # from text.split() could never find multi-word keywords.
    phrases = [
        r'\s+'.join(re.escape(part) for part in keyword.split())
        for keyword in skill_keywords
        if isinstance(keyword, str) and keyword.strip()
    ]
    if phrases:
        # Longest first so a longer phrase wins over a keyword it starts with.
        phrases.sort(key=len, reverse=True)
        # Lookarounds instead of \b so keywords ending in symbols ("C++")
        # work, while "Java" still does not match inside "JavaScript".
        keyword_re = re.compile(
            r'(?<!\w)(?:' + '|'.join(phrases) + r')(?!\w)',
            re.IGNORECASE,
        )
        for match in keyword_re.finditer(text):
            found.add(' '.join(match.group(0).split()))

    # Sorted for a deterministic result order.
    return sorted(found)

def extract_experience(text, nlp):
    """Pair job titles found by NER with dates or date ranges found by regex.

    Args:
        text: Resume text to search.
        nlp: Callable that takes the text and returns an object whose
            ``ents`` have ``text`` and ``label_`` attributes. Entities
            labelled ``'JOB_TITLE'`` are treated as job titles.

    Returns:
        A list of ``{'title': str, 'date_range': str}`` dicts. Titles and
        dates are paired purely by order of appearance (``zip``), so the
        shorter of the two lists limits the result. ``date_range`` is the
        matched text: a single date ("2019", "Jan 2020") or a range
        ("Jan 2020 - Mar 2021", "2018 to present").
    """
    doc = nlp(text)
    # Job title search (NER label may need customising for the model in use).
    job_titles = [ent.text for ent in doc.ents if ent.label_ == 'JOB_TITLE']

    month = (
        r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?'
        r'|dec(?:ember)?)\.?'
    )
    date = r'(?:' + month + r'\s+)?\d{4}'
    # Non-capturing groups only: with capture groups re.findall would return
    # tuples of group contents instead of the matched text. The \b anchors
    # keep four digits inside a longer number from counting as a year.
    range_tail = (
        r'(?:\s*(?:[-\u2013\u2014]|to)\s*(?:' + date + r'|present|current)\b)?'
    )
    date_range_re = re.compile(r'\b' + date + r'\b' + range_tail, re.IGNORECASE)
    date_ranges = [match.group(0) for match in date_range_re.finditer(text)]

    # Combine into potential experiences, in order of appearance.
    return [
        {'title': job_title, 'date_range': date_range}
        for job_title, date_range in zip(job_titles, date_ranges)
    ]

def extract_education(text, nlp):
    """Find institutions (NER) and degree phrases (regex) in *text*.

    Args:
        text: Resume text to search. Degree matching relies on
            capitalisation, so the text should not be lowercased first.
        nlp: Callable that takes the text and returns an object whose
            ``ents`` have ``text`` and ``label_`` attributes. Entities
            labelled ``'ORG'`` are assumed to be institutions.

    Returns:
        A list with one ``{'institution': str, 'degrees': list[str]}`` dict
        per ``'ORG'`` entity. Every entry receives its own copy of all degree
        phrases found in the text; no proximity analysis is done. If no
        ``'ORG'`` entity is found the result is empty even when degrees
        matched.
    """
    doc = nlp(text)

    # Degree pattern matching. Two shapes are recognised:
    #   "<Capitalised words> Degree"            e.g. "Computer Science Degree"
    #   "[<Caps words>] Bachelor|Master|Doctorate['s] [of|in <Caps words>]"
    # Only non-capturing groups are used so the full matched phrase is kept;
    # with capture groups re.findall returns tuples of group fragments.
    word = r'[A-Z][a-z]+'
    titled_degree = r'(?:' + word + r'\s)+Degree'
    level = r"(?:[Bb]achelor|[Mm]aster|[Dd]octorate)(?:'?s)?"
    field = r'(?:\s(?:of|in)\s' + word + r'(?:\s' + word + r')*)*'
    named_degree = r'(?:' + word + r'\s)*' + level + field
    degree_re = re.compile(
        r'\b(?:' + titled_degree + r'|' + named_degree + r')\b'
    )
    degrees = [match.group(0) for match in degree_re.finditer(text)]

    # University/institution search (may need NER tuning).
    return [
        {
            'institution': ent.text,
            # Copy so that mutating one entry's list does not affect others.
            'degrees': list(degrees),
        }
        for ent in doc.ents
        if ent.label_ == 'ORG'
    ]


def load_and_label_data(data_dir, nlp_model=None):
    """Parse every PDF resume in *data_dir* into a structured record.

    Args:
        data_dir: Directory to scan (non-recursively) for PDF files.
        nlp_model: Optional NLP pipeline passed to the extractors. When
            omitted, the module-level ``nlp`` object is used, which must
            then be defined before this function is called.

    Returns:
        A list of dicts, one per PDF in file-name order, with keys
        ``'text'`` (lowercased, whitespace-normalised text), ``'skills'``,
        ``'education'`` and ``'experience'``. No class label is attached
        here.
    """
    model = nlp if nlp_model is None else nlp_model

    resumes = []
    # Sorted so the record order is reproducible; os.listdir order is arbitrary.
    for file_name in sorted(os.listdir(data_dir)):
        # Case-insensitive so "resume.PDF" is not silently skipped.
        if not file_name.lower().endswith('.pdf'):
            continue

        raw_text = extract_text_from_pdf(os.path.join(data_dir, file_name))
        # The extractors get whitespace-normalised text with its original
        # casing: the degree pattern depends on capital letters, which the
        # lowercased text used for vectorisation no longer has.
        cased_text = re.sub(r'\s+', ' ', raw_text)

        resumes.append({
            'text': preprocess_text(raw_text),
            'skills': extract_skills(cased_text, model),
            'education': extract_education(cased_text, model),
            'experience': extract_experience(cased_text, model),
        })
    return resumes