In [1]:
!pip install reportlab python-docx

import os
from docx import Document
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import zipfile
from google.colab import files

# -------------------------
# Create folders
# -------------------------
os.makedirs("resumes_docx", exist_ok=True)
os.makedirs("resumes_pdf", exist_ok=True)

# -------------------------
# DOCX Helper
# -------------------------
def create_docx(path, name, email, phone, location, age, gender, education, exp, skills, summary):
    doc = Document()
    doc.add_heading(name, 0)
    doc.add_paragraph(f"Email: {email} | Phone: {phone}")
    doc.add_paragraph(f"Location: {location}")
    doc.add_paragraph(f"Age: {age} | Gender: {gender}")
    doc.add_heading("Education", level=1)
    doc.add_paragraph(education)
    doc.add_heading("Experience", level=1)
    doc.add_paragraph(exp)
    doc.add_heading("Skills", level=1)
    doc.add_paragraph(skills)
    doc.add_heading("Summary", level=1)
    doc.add_paragraph(summary)
    doc.save(path)

# -------------------------
# PDF Helper
# -------------------------
def create_pdf(path, lines):
    c = canvas.Canvas(path, pagesize=A4)
    width, height = A4
    y = height - 50
    for line in lines:
        c.drawString(50, y, line)
        y -= 20
    c.save()

# -------------------------
# Resume 1: Data Scientist
# -------------------------
name, email, phone, loc, age, gender = "John Doe", "john.doe@gmail.com", "+91 9876543210", "Bangalore, India", "28", "Male"
edu = "M.Tech in Computer Science, IIT Delhi (2019)"
exp = "5 years of experience in Data Science and Machine Learning. Worked on predictive modeling, NLP, and cloud deployment."
skills = "Python, SQL, TensorFlow, AWS, Data Analysis, Machine Learning"
summary = "Data Scientist with 5 years of experience in building AI solutions for finance and e-commerce domains."

create_docx("resumes_docx/resume1.docx", name, email, phone, loc, age, gender, edu, exp, skills, summary)
create_pdf("resumes_pdf/resume1.pdf", [name, f"Email: {email} | Phone: {phone}", f"Location: {loc}", f"Age: {age} | Gender: {gender}", f"Education: {edu}", f"Experience: {exp}", f"Skills: {skills}", f"Summary: {summary}"])

# -------------------------
# Resume 2: Full Stack Developer
# -------------------------
name, email, phone, loc, age, gender = "Priya K", "priya.k@outlook.com", "+91 9123456789", "Chennai, India", "26", "Female"
edu = "B.Tech in Information Technology, Anna University (2020)"
exp = "3 years in full stack web development using React, Node.js, and AWS. Developed scalable e-commerce platforms."
skills = "React, Node.js, MongoDB, AWS, Docker, REST APIs"
summary = "Full Stack Developer with strong expertise in JavaScript frameworks and cloud platforms."

create_docx("resumes_docx/resume2.docx", name, email, phone, loc, age, gender, edu, exp, skills, summary)
create_pdf("resumes_pdf/resume2.pdf", [name, f"Email: {email} | Phone: {phone}", f"Location: {loc}", f"Age: {age} | Gender: {gender}", f"Education: {edu}", f"Experience: {exp}", f"Skills: {skills}", f"Summary: {summary}"])

# -------------------------
# Resume 3: Cloud Engineer
# -------------------------
name, email, phone, loc, age, gender = "Ramesh P", "ramesh.p@gmail.com", "+91 9001234567", "Hyderabad, India", "30", "Male"
edu = "B.E in Computer Engineering, Osmania University (2017)"
exp = "6 years in cloud infrastructure and DevOps. Specialized in automation, Kubernetes, and CI/CD pipelines."
skills = "AWS, Azure, Kubernetes, Docker, Terraform, Jenkins, Linux"
summary = "Cloud Engineer with expertise in designing and maintaining scalable cloud environments."

create_docx("resumes_docx/resume3.docx", name, email, phone, loc, age, gender, edu, exp, skills, summary)
create_pdf("resumes_pdf/resume3.pdf", [name, f"Email: {email} | Phone: {phone}", f"Location: {loc}", f"Age: {age} | Gender: {gender}", f"Education: {edu}", f"Experience: {exp}", f"Skills: {skills}", f"Summary: {summary}"])

# -------------------------
# Zip all resumes
# -------------------------
zip_path = "sample_resumes.zip"
with zipfile.ZipFile(zip_path, 'w') as zipf:
    for folder in ["resumes_docx", "resumes_pdf"]:
        for file in os.listdir(folder):
            zipf.write(os.path.join(folder, file), arcname=file)

# -------------------------
# Download Zip
# -------------------------
files.download(zip_path)


Collecting reportlab
  Downloading reportlab-4.4.4-py3-none-any.whl.metadata (1.7 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading reportlab-4.4.4-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab, python-docx
Successfully installed python-docx-1.2.0 reportlab-4.4.4


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
# ==============================
# Install required packages
# ==============================
!pip install langchain langchain-openai langchain-community pypdf python-docx unstructured

# ==============================
# Imports
# ==============================
import os
import pandas as pd
import zipfile
from google.colab import files
from getpass import getpass
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader

# ==============================
# Step 0: Set OpenAI API Key
# ==============================
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API Key: ")
print("Key loaded:", os.environ["OPENAI_API_KEY"][:8] + "...")

# ==============================
# Step 1: Upload the zip file
# ==============================
print("📂 Please upload sample_resumes.zip")
uploaded = files.upload()

# Extract resumes
zip_path = list(uploaded.keys())[0]
os.makedirs("resumes", exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("resumes")

print("✅ Resumes extracted into 'resumes/' folder")

# ==============================
# Step 2: LangChain + OpenAI Setup
# ==============================
# Define schema
schemas = [
    ResponseSchema(name="name", description="Full name of the candidate"),
    ResponseSchema(name="age", description="Age in numbers"),
    ResponseSchema(name="gender", description="Gender of the candidate"),
    ResponseSchema(name="location", description="City or location"),
    ResponseSchema(name="email", description="Email ID"),
    ResponseSchema(name="phone", description="Phone number"),
    ResponseSchema(name="qualification", description="Highest qualification"),
    ResponseSchema(name="experience_years", description="Years of work experience"),
    ResponseSchema(name="skills", description="List of skills"),
    ResponseSchema(name="summary", description="Short professional summary")
]

parser = StructuredOutputParser.from_response_schemas(schemas)
format_instructions = parser.get_format_instructions()

# Prompt template
prompt = PromptTemplate(
    template="""
    Extract candidate details from this resume text:

    {resume_text}

    {format_instructions}
    """,
    input_variables=["resume_text"],
    partial_variables={"format_instructions": format_instructions},
)

# Resume loader
def load_resume(file_path):
    if file_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif file_path.endswith(".docx"):
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError("Unsupported file format")
    return loader.load()[0].page_content

# LLM
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

def extract_resume_data(resume_text):
    prompt_text = prompt.format(resume_text=resume_text)
    response = llm.predict(prompt_text)
    return parser.parse(response)

# ==============================
# Step 3: Process All Resumes
# ==============================
def process_resumes(resume_folder):
    all_candidates = []
    for file in os.listdir(resume_folder):
        if not (file.endswith(".pdf") or file.endswith(".docx")):
            continue
        path = os.path.join(resume_folder, file)
        print(f"Processing {file} ...")
        text = load_resume(path)
        candidate_data = extract_resume_data(text)
        all_candidates.append(candidate_data)

    df = pd.DataFrame(all_candidates)
    csv_path = "extracted_candidates.csv"
    df.to_csv(csv_path, index=False)
    print("✅ Data saved to", csv_path)

    # Automatically download CSV
    files.download(csv_path)

    return df

# ==============================
# Step 4: Run Extraction
# ==============================
df = process_resumes("resumes/")
df.head()


Collecting langchain-openai
  Downloading langchain_openai-0.3.33-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting unstructured
  Downloading unstructured-0.18.15-py3-none-any.whl.metadata (24 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Do

Saving sample_resumes.zip to sample_resumes (1).zip
✅ Resumes extracted into 'resumes/' folder
Processing resume3.pdf ...


  response = llm.predict(prompt_text)


Processing resume2.docx ...
Processing resume2.pdf ...
Processing resume1.docx ...
Processing resume3.docx ...
Processing resume1.pdf ...
✅ Data saved to extracted_candidates.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,name,age,gender,location,email,phone,qualification,experience_years,skills,summary
0,Ramesh P,30,Male,"Hyderabad, India",ramesh.p@gmail.com,+91 9001234567,"B.E in Computer Engineering, Osmania Universit...",6 years,"AWS, Azure, Kubernetes, Docker, Terraform, Jen...",Cloud Engineer with expertise in designing and...
1,Priya K,26,Female,"Chennai, India",priya.k@outlook.com,+91 9123456789,"B.Tech in Information Technology, Anna Univers...",3 years,"React, Node.js, MongoDB, AWS, Docker, REST APIs",Full Stack Developer with strong expertise in ...
2,Priya K,26,Female,"Chennai, India",priya.k@outlook.com,+91 9123456789,"B.Tech in Information Technology, Anna Univers...",3 years,"React, Node.js, MongoDB, AWS, Docker, REST APIs",Full Stack Developer with strong expertise in ...
3,John Doe,28,Male,"Bangalore, India",john.doe@gmail.com,+91 9876543210,"M.Tech in Computer Science, IIT Delhi (2019)",5,"Python, SQL, TensorFlow, AWS, Data Analysis, M...",Data Scientist with 5 years of experience in b...
4,Ramesh P,30,Male,"Hyderabad, India",ramesh.p@gmail.com,+91 9001234567,"B.E in Computer Engineering, Osmania Universit...",6 years,"AWS, Azure, Kubernetes, Docker, Terraform, Jen...",Cloud Engineer with expertise in designing and...


In [7]:
# ================================
# Validation Methods for Resume ↔ JD - sprint2
# ================================

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import re
from google.colab import files  # for auto-download

# Load candidates (from previous pipeline)
df = pd.read_csv("extracted_candidates.csv")

# Load JD text
with open("job_description.txt", "r") as f:
    jd_text = f.read().lower()

# Clean function
def clean_text(text):
    if isinstance(text, str):
        return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    return ""

# ================================
# 1. Cosine Similarity (Embeddings)
# ================================
model = SentenceTransformer("all-MiniLM-L6-v2")

def cosine_embedding(candidate_text, jd_text):
    embeddings = model.encode([candidate_text, jd_text])
    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return round(score * 100, 2)

# ================================
# 2. TF-IDF + Cosine Similarity
# ================================
def tfidf_similarity(candidate_text, jd_text):
    vect = TfidfVectorizer()
    tfidf_matrix = vect.fit_transform([candidate_text, jd_text])
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return round(score * 100, 2)

# ================================
# 3. Jaccard Similarity
# ================================
def jaccard_similarity(candidate_text, jd_text):
    set1, set2 = set(candidate_text.split()), set(jd_text.split())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return round(len(intersection) / len(union) * 100, 2) if union else 0

# ================================
# 4. Hybrid Score (Weighted Average)
# ================================
def hybrid_score(candidate_text, jd_text, w1=0.5, w2=0.3, w3=0.2):
    e_score = cosine_embedding(candidate_text, jd_text)
    t_score = tfidf_similarity(candidate_text, jd_text)
    j_score = jaccard_similarity(candidate_text, jd_text)
    return round(w1*e_score + w2*t_score + w3*j_score, 2)

# ================================
# Apply Methods
# ================================
df["cosine_emb"] = df["skills"].apply(lambda s: cosine_embedding(clean_text(str(s)), jd_text))
df["tfidf_cosine"] = df["skills"].apply(lambda s: tfidf_similarity(clean_text(str(s)), jd_text))
df["jaccard"] = df["skills"].apply(lambda s: jaccard_similarity(clean_text(str(s)), jd_text))
df["hybrid"] = df["skills"].apply(lambda s: hybrid_score(clean_text(str(s)), jd_text))

# Save results
output_file = "candidates_with_validations.csv"
df.to_csv(output_file, index=False)
print(f"✅ Validation scores saved to {output_file}")

# Auto-download in Colab
files.download(output_file)


✅ Validation scores saved to candidates_with_validations.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
# Load candidates
import pandas as pd
df = pd.read_csv("candidates_with_validations.csv")

# Define missing skills finder
def find_missing(candidate_skills):
    if isinstance(candidate_skills, str):
        candidate_list = [s.strip().lower() for s in candidate_skills.split(",")]
    else:
        candidate_list = []

    jd_skills = ["python", "machine learning", "sql", "deep learning", "nlp"]  # <-- customize
    missing = [s for s in jd_skills if s not in candidate_list]
    return ", ".join(missing) if missing else "None"

# Apply to dataframe
df["missing_skills"] = df["skills"].apply(find_missing)

# Save again
df.to_csv("candidates_with_missing.csv", index=False)
print("✅ Missing skills calculated and saved to candidates_with_missing.csv")

from google.colab import files
files.download("candidates_with_missing.csv")


✅ Missing skills calculated and saved to candidates_with_missing.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>