# 📌 Cell 1: List All PDF Files in the Dataset

In [None]:
import os

# Collect the full path of every PDF found anywhere under the Kaggle input tree.
pdf_files = []
for folder, _subdirs, names in os.walk('/kaggle/input'):
    # The extension test is case-insensitive, so ".PDF" files are included too.
    pdf_files.extend(
        os.path.join(folder, name)
        for name in names
        if name.lower().endswith(".pdf")
    )

print(f"\nTotal PDF files found: {len(pdf_files)}")


In [None]:
pip install pdfplumber

# 📌 Cell 2: Improved Text Extraction from PDFs

In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """
    Pull the text of every page out of a PDF using pdfplumber.

    Pages are read in order and each page's text is followed by a newline.
    Extraction is best-effort: any exception is printed instead of raised,
    and whatever text was gathered before the failure is still returned.

    Parameters:
        pdf_path (str): Path to the PDF file.

    Returns:
        text (str): Extracted text with surrounding whitespace stripped;
        an empty string when nothing could be extracted.
    """
    page_chunks = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for page in document.pages:
                # Fall back to an empty string when a page yields no text.
                page_chunks.append((page.extract_text() or "") + "\n")
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    # Joining outside the try block keeps the partial result after a failure.
    return "".join(page_chunks).strip()

# Test the function on the first PDF file if needed:
# print(extract_text_from_pdf(pdf_files[0])[:500])


# 📌 Cell 3: Load and Process All PDF Resumes

In [None]:
def read_resumes_from_files(file_list):
    """
    Extract the text of each PDF in ``file_list`` and keep the non-empty ones.

    Files whose extraction yields an empty string are skipped, so positions
    in the returned list do not necessarily match positions in ``file_list``.
    The number of resumes kept is printed before returning.

    Parameters:
        file_list (list): List of PDF file paths.

    Returns:
        resumes (list): List of extracted resume texts, in input order.
    """
    # Lazily extract each file, then drop the ones that produced no text.
    extracted = (extract_text_from_pdf(path) for path in file_list)
    resumes = [content for content in extracted if content]
    print(f"Total resumes processed: {len(resumes)}")
    return resumes

# Load all resumes from the collected PDF file paths.
# NOTE: read_resumes_from_files skips PDFs that yield no text, so an index
# into all_resumes does not necessarily match the same index in pdf_files.
all_resumes = read_resumes_from_files(pdf_files)


# Cell 4: Enhanced Preprocessing with spaCy (Lemmatization & Stopword Removal)

In [None]:
import re
import spacy

# Load spaCy's small English model once; the functions below that call `nlp`
# all share this object. The "en_core_web_sm" package must already be
# installed in the environment (make sure it is available on Kaggle).
nlp = spacy.load("en_core_web_sm")

def preprocess_text_spacy(text):
    """
    Normalise resume text with spaCy.

    Runs of whitespace are collapsed to single spaces, the text is parsed
    with the module-level ``nlp`` pipeline, stopwords and punctuation are
    dropped, and the lowercased lemma of every remaining token is kept.

    Parameters:
        text (str): Original text.

    Returns:
        processed_text (str): Space-separated lowercased lemmas.
    """
    # Collapse newlines/tabs/multiple spaces before tokenising.
    collapsed = re.sub(r'\s+', ' ', text)

    lemmas = []
    for tok in nlp(collapsed):
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_.lower())
    return " ".join(lemmas)

# Run every extracted resume through the spaCy preprocessing step, in order.
processed_resumes = list(map(preprocess_text_spacy, all_resumes))
print("✅ Text preprocessing complete with spaCy!")


# Cell 5: Compute Sentence‑BERT Embeddings (Replace Word2Vec/TF‑IDF)

In [None]:
!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer
import torch

# Load a pre-trained Sentence-BERT model.
# The model is referenced by name; presumably it is downloaded on first use,
# so confirm that internet access is enabled for the notebook.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for each processed resume.
# convert_to_tensor=True requests a tensor result (later cells call
# .cpu().numpy() on tensors produced the same way).
# NOTE(review): resume_embeddings is not read again in the visible cells; the
# ranking uses debiased_embeddings instead.
resume_embeddings = sbert_model.encode(processed_resumes, convert_to_tensor=True)
print("✅ Sentence-BERT embeddings computed!")


# 📌 Cell 6: Bias Mitigation – Remove Demographic Indicators

In [None]:
def remove_demographic_indicators(text):
    """
    Strip tokens that spaCy's NER labels as people or geopolitical entities.

    The text is parsed with the module-level ``nlp`` pipeline; every token
    whose entity type is "PERSON" or "GPE" is dropped and the remaining
    token texts are joined with single spaces.

    Parameters:
        text (str): Input text.

    Returns:
        cleaned_text (str): Text with demographic entities removed.
    """
    blocked_labels = ("PERSON", "GPE")
    kept_words = []
    for tok in nlp(text):
        if tok.ent_type_ in blocked_labels:
            continue
        kept_words.append(tok.text)
    return " ".join(kept_words)

# Apply bias mitigation on the raw resume text, then normalise the result.
# The entity recogniser has to see the original text: once it has been
# lowercased, lemmatised and stripped of stopwords (as processed_resumes is),
# the capitalisation and context that mark names and places are gone and most
# PERSON/GPE entities go undetected. Preprocessing afterwards keeps
# debiased_resumes in the same normalised form the later cells expect, and the
# list stays index-aligned with all_resumes.
debiased_resumes = [
    preprocess_text_spacy(remove_demographic_indicators(text))
    for text in all_resumes
]
print("✅ Bias mitigation applied on resume texts!")

# Embeddings of the debiased resumes. These are required (not optional): the
# ranking cell computes similarities against debiased_embeddings.
debiased_embeddings = sbert_model.encode(debiased_resumes, convert_to_tensor=True)


# 📌 Cell 7: Resume Ranking with Sentence‑BERT and Feedback Generation

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Free-text description of the role that every resume is scored against.
job_description = "We are seeking a skilled designer with strong experience in graphic design, UI/UX, and creative problem solving."

# Embed the job description with the same model used for the resumes.
job_embedding = sbert_model.encode(job_description, convert_to_tensor=True)

# scikit-learn works on NumPy arrays, so move both tensors to the CPU first.
debiased_embeddings_np = debiased_embeddings.cpu().numpy()
job_embedding_np = job_embedding.cpu().numpy()

# One query row against all resume rows; row 0 holds the per-resume scores.
score_matrix = cosine_similarity([job_embedding_np], debiased_embeddings_np)
similarities = score_matrix[0]

# Resume positions ordered from most to least similar.
ranked_indices = np.argsort(similarities)[::-1]

print("Ranking complete. Top 5 similarity scores:")
for position, resume_idx in enumerate(ranked_indices[:5], start=1):
    print(f"Rank {position}: Resume Index {resume_idx} with similarity {similarities[resume_idx]:.4f}")

# Feedback functions (you can later extend these with more advanced interpretable methods).
def recruiter_feedback(resume_text):
    """
    Return a short recruiter-facing remark about how detailed a resume looks.

    Parameters:
        resume_text (str): Resume text to inspect.

    Returns:
        str: A warning when the text has fewer than 50 whitespace-separated
        words, a hint when the substring "experience" is absent
        (case-sensitive), otherwise a positive note.
    """
    # Very short resumes are flagged before any keyword check.
    if len(resume_text.split()) < 50:
        return "This resume may lack sufficient details."
    if "experience" in resume_text:
        return "Resume appears well-detailed."
    return "Consider looking for resumes with clear experience details."

def job_seeker_feedback(resume_text):
    """
    Tell a candidate which essential resume sections appear to be missing.

    Each section is detected by its singular stem, case-insensitively, so
    "Skills", "skills" and the lemmatised form "skill" all count. This
    matters because the pipeline passes lemmatised, lowercased text, in
    which the plural forms "skills" and "projects" never occur.

    Parameters:
        resume_text (str): Resume text to inspect (raw or preprocessed).

    Returns:
        str: "Consider adding: ..." listing the missing sections in a fixed
        order, or a positive note when all four sections are present.
    """
    # (name shown to the candidate, stem searched for in the text)
    essential_keywords = [
        ("experience", "experience"),
        ("skills", "skill"),
        ("education", "education"),
        ("projects", "project"),
    ]
    haystack = resume_text.lower()
    missing_keywords = [
        label for label, stem in essential_keywords if stem not in haystack
    ]
    if missing_keywords:
        return f"Consider adding: {', '.join(missing_keywords)}."
    return "Your resume appears comprehensive!"


# 📌 Cell 8: Main Execution – Process, Generate Feedback, and Save Results

In [None]:
import pandas as pd

# Build one record per resume, walking the resumes from best to worst match
# according to the cosine-similarity ranking on the debiased embeddings.
results = [
    {
        "Rank": position,
        "Resume Index": resume_idx + 1,  # shown 1-based; resume_idx itself is 0-based
        "Similarity Score": similarities[resume_idx],
        "Recruiter Feedback": recruiter_feedback(debiased_resumes[resume_idx]),
        "Job Seeker Feedback": job_seeker_feedback(debiased_resumes[resume_idx]),
        # First 500 characters of the debiased text, always followed by "...".
        "Resume Snippet": debiased_resumes[resume_idx][:500] + "...",
    }
    for position, resume_idx in enumerate(ranked_indices, start=1)
]

# Tabulate the records.
results_df = pd.DataFrame(results)

# Persist the table (without the DataFrame index) to the Kaggle working directory.
output_path = "/kaggle/working/resume_feedback_results_with_ranking.csv"
results_df.to_csv(output_path, index=False)
print(f"✅ Results saved to {output_path}")

# Show the first few rows as the cell output.
results_df.head()


In [None]:
# Display the first rows of the results table (same call that ends the previous cell).
results_df.head()

# 📌 Cell 9: View a Specific CV (e.g., Top-Ranked Resume)

In [None]:
from IPython.display import IFrame

# To view the top-ranked resume, use the first element from ranked_indices.
top_resume_index = ranked_indices[0]

# ranked_indices index into all_resumes, which leaves out every PDF that
# produced no text. Indexing pdf_files directly would therefore show the wrong
# file as soon as one PDF was skipped. When the lengths differ, rebuild the
# list of files that actually contributed a resume by applying the same
# "non-empty text" filter (this re-runs extraction and may re-print errors).
if len(all_resumes) == len(pdf_files):
    resume_files = pdf_files
else:
    resume_files = [path for path in pdf_files if extract_text_from_pdf(path)]
top_resume_file = resume_files[top_resume_index]

print(f"Displaying the top-ranked resume from file: {top_resume_file}")
IFrame(top_resume_file, width=800, height=600)


In [None]:
!lscpu
