In [1]:
# ================================
# Step 0 — Project Setup
# ================================

import os
import random
import numpy as np
import tensorflow as tf

# Define project structure
project_dirs = ["data", "models", "notebooks", "src"]

for d in project_dirs:
    os.makedirs(d, exist_ok=True)

print("✅ Project structure created:", project_dirs)

# Set random seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

print("✅ Seeds fixed to:", seed)

# Verify packages (if running fresh environment)
required_packages = [
    "numpy", "pandas", "scikit-learn", "tensorflow", 
    "nltk", "spacy", "sentence-transformers", "shap", "lime", "matplotlib"
]
print("📦 Suggested packages:", required_packages)


✅ Project structure created: ['data', 'models', 'notebooks', 'src']
✅ Seeds fixed to: 42
📦 Suggested packages: ['numpy', 'pandas', 'scikit-learn', 'tensorflow', 'nltk', 'spacy', 'sentence-transformers', 'shap', 'lime', 'matplotlib']


In [2]:
# ================================
# Step 1 — Generate / Collect Dataset
# ================================

import json
import pandas as pd
import numpy as np

# ---------- 1.2 Domain Requirements ----------
domain_requirements = {
    "Data Science": {
        "domain": "Data Science",
        "required_skills": [
            "Python", "Pandas", "NumPy", "Scikit-learn", "PyTorch", "Docker", "Deep Learning"
        ]
    },
    "Web Development": {
        "domain": "Web Development",
        "required_skills": [
            "HTML", "CSS", "JavaScript", "React", "Node.js", "Express", "SQL"
        ]
    },
    "Cloud Engineering": {
        "domain": "Cloud Engineering",
        "required_skills": [
            "AWS", "Azure", "Docker", "Kubernetes", "Linux", "Terraform", "CI/CD"
        ]
    }
}

# Save domain requirement files
os.makedirs("data/domain_requirements", exist_ok=True)
for domain, req in domain_requirements.items():
    with open(f"data/domain_requirements/{domain.lower().replace(' ','_')}.json", "w") as f:
        json.dump(req, f, indent=4)

print("✅ Domain requirement files saved.")


# ---------- 1.3 Synthetic Resume Generator ----------
skills_pool = list(set(sum([req["required_skills"] for req in domain_requirements.values()], []))) + [
    "C++", "Java", "SQL", "Tableau", "Hadoop", "Spark", "Flask"
]

job_titles = ["Data Scientist", "Data Analyst", "ML Engineer", "Backend Developer", "Frontend Developer", "Cloud Engineer"]

projects_pool = [
    "Image Classification using CNN", "Web Scraping with Python", "Portfolio Website",
    "Cloud Infrastructure Setup", "ETL Pipeline with Spark", "Dashboard with React"
]

def generate_resume(idx, domains):
    domain = np.random.choice(list(domains.keys()))
    required = domains[domain]["required_skills"]

    # Randomly sample skills
    n_skills = np.random.randint(3, 10)
    skills = list(np.random.choice(skills_pool, n_skills, replace=False))

    # Projects
    n_projects = np.random.randint(0, 4)
    projects = list(np.random.choice(projects_pool, n_projects, replace=False))

    # Work experience
    n_exp = np.random.randint(1, 3)
    work_experience = [
        {"title": np.random.choice(job_titles), "years": np.random.randint(0, 6)}
        for _ in range(n_exp)
    ]

    # Test score from clipped normal distribution (mean=65, std=20)
    test_score = int(np.clip(np.random.normal(65, 20), 0, 100))

    resume = {
        "id": f"candidate_{idx:04d}",
        "skills": skills,
        "projects": projects,
        "work_experience": work_experience,
        "test_score": test_score,
        "preferred_domain": domain
    }
    return resume


# Generate N=2000 synthetic resumes
N = 2000
synthetic_resumes = [generate_resume(i, domain_requirements) for i in range(N)]

# Save to JSON file
with open("data/synthetic_resumes.json", "w") as f:
    json.dump(synthetic_resumes, f, indent=4)

print(f"✅ Generated {N} synthetic resumes and saved to data/synthetic_resumes.json")

# Quick peek at one sample
print(json.dumps(synthetic_resumes[0], indent=2))

✅ Domain requirement files saved.
✅ Generated 2000 synthetic resumes and saved to data/synthetic_resumes.json
{
  "id": "candidate_0000",
  "skills": [
    "Express",
    "HTML",
    "Kubernetes",
    "SQL",
    "CSS",
    "SQL"
  ],
  "projects": [
    "Image Classification using CNN",
    "ETL Pipeline with Spark"
  ],
  "work_experience": [
    {
      "title": "Cloud Engineer",
      "years": 2
    },
    {
      "title": "Backend Developer",
      "years": 3
    }
  ],
  "test_score": 54,
  "preferred_domain": "Cloud Engineering"
}


In [3]:
# ================================
# Step 2 — Create Ground Truth Labels (Rule-based)
# ================================

# Load domain requirements
domain_req_map = {}
for domain, req in domain_requirements.items():
    domain_req_map[domain] = set(req["required_skills"])

def assign_label(resume, domain_req_map):
    domain = resume["preferred_domain"]
    required_skills = domain_req_map[domain]
    candidate_skills = set(resume["skills"])

    # Matched & missing skills
    matched_skills = candidate_skills.intersection(required_skills)
    missing_skills = required_skills - candidate_skills

    # Ratios & counts
    skill_match_ratio = len(matched_skills) / len(required_skills) if required_skills else 0
    test_score_norm = resume["test_score"] / 100
    project_count = len(resume["projects"])

    # Apply labeling rules
    if (skill_match_ratio >= 0.70) and (test_score_norm >= 0.75) and (project_count >= 1):
        label = "Fit"
    elif (0.40 <= skill_match_ratio < 0.70) or (0.50 <= test_score_norm < 0.75):
        label = "Partial Fit"
    else:
        label = "Not Fit"

    # Add extra fields
    resume["matched_skills"] = list(matched_skills)
    resume["missing_skills"] = list(missing_skills)
    resume["skill_match_ratio"] = round(skill_match_ratio, 2)
    resume["test_score_norm"] = round(test_score_norm, 2)
    resume["project_count"] = project_count
    resume["label"] = label

    return resume


# Apply labeling to all resumes
labeled_resumes = [assign_label(r, domain_req_map) for r in synthetic_resumes]

# Save labeled dataset
with open("data/labeled_resumes.json", "w") as f:
    json.dump(labeled_resumes, f, indent=4)

print("✅ Rule-based labels assigned and saved to data/labeled_resumes.json")

# Quick distribution check
label_counts = pd.Series([r["label"] for r in labeled_resumes]).value_counts()
print("\nLabel Distribution:\n", label_counts)

# Peek at one labeled resume
print("\nSample Resume with Label:\n", json.dumps(labeled_resumes[0], indent=2))


✅ Rule-based labels assigned and saved to data/labeled_resumes.json

Label Distribution:
 Partial Fit    1157
Not Fit         838
Fit               5
Name: count, dtype: int64

Sample Resume with Label:
 {
  "id": "candidate_0000",
  "skills": [
    "Express",
    "HTML",
    "Kubernetes",
    "SQL",
    "CSS",
    "SQL"
  ],
  "projects": [
    "Image Classification using CNN",
    "ETL Pipeline with Spark"
  ],
  "work_experience": [
    {
      "title": "Cloud Engineer",
      "years": 2
    },
    {
      "title": "Backend Developer",
      "years": 3
    }
  ],
  "test_score": 54,
  "preferred_domain": "Cloud Engineering",
  "matched_skills": [
    "Kubernetes"
  ],
  "missing_skills": [
    "Azure",
    "CI/CD",
    "Docker",
    "AWS",
    "Terraform",
    "Linux"
  ],
  "skill_match_ratio": 0.14,
  "test_score_norm": 0.54,
  "project_count": 2,
  "label": "Partial Fit"
}


In [4]:
# ================================
# Step A — Data Cleaning after Generation
# ================================

import re
from difflib import get_close_matches

# Canonical vocab (from domain requirements)
canonical_skills = sorted(set(sum([req["required_skills"] for req in domain_requirements.values()], [])))

# Stopwords for projects
stopwords = {"and", "the", "project", "using"}

# Canonical job titles
canonical_titles = ["data scientist", "data analyst", "machine learning engineer", "intern", 
                    "backend developer", "frontend developer", "cloud engineer"]

def normalize_skill(skill, canonical_vocab):
    s = skill.strip().lower()
    # Try exact canonical match
    if s in [c.lower() for c in canonical_vocab]:
        return s
    # Try fuzzy matching (closest skill)
    match = get_close_matches(s, [c.lower() for c in canonical_vocab], n=1, cutoff=0.8)
    if match:
        return match[0]
    return s   # keep as-is if no good match

def clean_projects(projects):
    cleaned = []
    for p in projects:
        p = p.lower()
        p = re.sub(r"[^a-z0-9 ]", " ", p)  # remove punctuation
        tokens = [t for t in p.split() if t not in stopwords]
        if len(tokens) >= 2:
            cleaned.append(" ".join(tokens))
    return list(set(cleaned))  # deduplicate

def normalize_title(title):
    t = title.lower().strip()
    match = get_close_matches(t, canonical_titles, n=1, cutoff=0.7)
    return match[0] if match else t

def clean_years(years):
    try:
        y = float(re.sub("[^0-9.]", "", str(years)))
        return max(y, 0)  # clamp negatives
    except:
        return 0

def clean_resume(resume):
    # Skills
    resume["skills"] = list({normalize_skill(s, canonical_skills) for s in resume["skills"]})
    
    # Projects
    resume["projects"] = clean_projects(resume["projects"])
    
    # Work experience
    cleaned_exp = []
    for exp in resume["work_experience"]:
        cleaned_exp.append({
            "title": normalize_title(exp["title"]),
            "years": clean_years(exp["years"])
        })
    resume["work_experience"] = cleaned_exp
    
    # Test score
    score = resume.get("test_score", 0)
    score = max(0, min(score, 100))  # clamp
    resume["test_score"] = int(score)
    resume["test_score_norm"] = round(score/100, 2)
    
    return resume

# Apply cleaning
cleaned_resumes = [clean_resume(r) for r in labeled_resumes]

# Remove corrupted/duplicates
seen = set()
final_resumes = []
for r in cleaned_resumes:
    key = (tuple(sorted(r["skills"])), tuple(r["projects"]), r["test_score"], r["preferred_domain"])
    if not r["skills"] and not r["projects"] and r["test_score"] == 0:
        continue  # drop corrupted
    if r["preferred_domain"] == "":
        continue
    if key in seen:
        continue
    seen.add(key)
    final_resumes.append(r)

print(f"✅ Cleaned dataset: {len(final_resumes)} resumes (from {len(labeled_resumes)})")

# Save cleaned data
with open("data/cleaned_resumes.json", "w") as f:
    json.dump(final_resumes, f, indent=4)


✅ Cleaned dataset: 2000 resumes (from 2000)


In [5]:
# ================================
# Step B — Label Balancing
# ================================

from collections import Counter
import random

labels = [r["label"] for r in final_resumes]
counts = Counter(labels)
print("Before balancing:", counts)

max_count = max(counts.values())
balanced_resumes = []

for label, count in counts.items():
    group = [r for r in final_resumes if r["label"] == label]
    if count < max_count:
        # Oversample minority
        extra = random.choices(group, k=max_count - count)
        balanced_resumes.extend(group + extra)
    else:
        balanced_resumes.extend(group)

balanced_counts = Counter([r["label"] for r in balanced_resumes])
print("After balancing:", balanced_counts)

# Save balanced dataset
with open("data/balanced_resumes.json", "w") as f:
    json.dump(balanced_resumes, f, indent=4)

Before balancing: Counter({'Partial Fit': 1157, 'Not Fit': 838, 'Fit': 5})
After balancing: Counter({'Partial Fit': 1157, 'Not Fit': 1157, 'Fit': 1157})


In [6]:
# ================================
# Step 3 — Preprocessing & Helper Functions
# ================================

import numpy as np
from sklearn.preprocessing import StandardScaler

# 3.1 Build skill vocabulary
def build_skill_vocab(resumes, domain_requirements):
    all_skills = set()
    for r in resumes:
        all_skills.update(r["skills"])
    for domain, req in domain_requirements.items():
        all_skills.update(req["required_skills"])
    skill_vocab = sorted(all_skills)
    return skill_vocab

skill_vocab = build_skill_vocab(final_resumes, domain_requirements)
skill_index = {s: i for i, s in enumerate(skill_vocab)}
skill_vocab_size = len(skill_vocab)

print(f"✅ Built skill vocabulary of size {skill_vocab_size}")


# 3.2 Skill encoding function
def encode_skills(candidate_skills, skill_index):
    vector = np.zeros(len(skill_index), dtype=int)
    for s in candidate_skills:
        if s in skill_index:
            vector[skill_index[s]] = 1
    return vector


# 3.3 Matched & missing skills
def matched_missing_skills(candidate_skills, domain_required_skills):
    candidate_set = set(candidate_skills)
    required_set = set(domain_required_skills)
    matched = list(candidate_set.intersection(required_set))
    missing = list(required_set - candidate_set)
    ratio = len(matched) / len(required_set) if required_set else 0
    return matched, missing, round(ratio, 2)


# 3.4 Project & experience features
def extract_project_experience_features(resume):
    project_count = len(resume.get("projects", []))
    years_experience = sum(item.get("years", 0) for item in resume.get("work_experience", []))
    return project_count, years_experience


# 3.5 Test score normalization
def normalize_test_score(score):
    return round(score / 100, 2)


# 3.6 Numeric feature scaling (fit & transform on dataset)
def fit_numeric_scalers(resumes):
    project_counts = []
    years_exp = []
    
    for r in resumes:
        p, y = extract_project_experience_features(r)
        project_counts.append(p)
        years_exp.append(y)
    
    project_scaler = StandardScaler()
    years_scaler = StandardScaler()
    
    project_scaler.fit(np.array(project_counts).reshape(-1, 1))
    years_scaler.fit(np.array(years_exp).reshape(-1, 1))
    
    return project_scaler, years_scaler

def transform_numeric_features(resume, project_scaler, years_scaler):
    project_count, years_exp = extract_project_experience_features(resume)
    project_scaled = project_scaler.transform([[project_count]])[0][0]
    years_scaled = years_scaler.transform([[years_exp]])[0][0]
    return project_scaled, years_scaled


# ================================
# ✅ Test the helper functions
# ================================
# Fit scalers on dataset
project_scaler, years_scaler = fit_numeric_scalers(final_resumes)

sample = final_resumes[0]

# Encode skills
skill_vector = encode_skills(sample["skills"], skill_index)

# Matched & missing
domain = sample["preferred_domain"]
matched, missing, ratio = matched_missing_skills(sample["skills"], domain_requirements[domain]["required_skills"])

# Numeric features
p_count, y_exp = extract_project_experience_features(sample)
p_scaled, y_scaled = transform_numeric_features(sample, project_scaler, years_scaler)

# Test score norm
score_norm = normalize_test_score(sample["test_score"])

print("\n=== Sample Resume Features ===")
print("Skill vector length:", len(skill_vector))
print("Matched:", matched)
print("Missing:", missing)
print("Skill match ratio:", ratio)
print("Project count:", p_count, "→ scaled:", round(p_scaled, 2))
print("Years experience:", y_exp, "→ scaled:", round(y_scaled, 2))
print("Normalized test score:", score_norm)


✅ Built skill vocabulary of size 46

=== Sample Resume Features ===
Skill vector length: 46
Matched: []
Missing: ['Azure', 'Kubernetes', 'CI/CD', 'Docker', 'AWS', 'Terraform', 'Linux']
Skill match ratio: 0.0
Project count: 2 → scaled: 0.43
Years experience: 5.0 → scaled: 0.5
Normalized test score: 0.54
