In [27]:
import random
import json
import time
from collections import defaultdict

def generate_dataset(domain_path, output_path, total_samples, seed=42):
    random.seed(seed)
    start_time = time.time()
    print("⏳ Starting dataset generation...")

    # Load domain requirements
    with open(domain_path, "r", encoding="utf-8") as f:
        DOMAIN_REQUIREMENTS = json.load(f)
    domains = list(DOMAIN_REQUIREMENTS.keys())

    # Track label counts
    label_counts = defaultdict(int)
    domain_label_counts = {d: defaultdict(int) for d in domains}

    # --- Generate a candidate profile ---
    def generate_candidate_profile(domain):
        req = DOMAIN_REQUIREMENTS[domain]

        # Work experience
        work_exp = []
        if random.random() > 0.3:  # 30% chance to be fresher
            for _ in range(random.randint(1, 3)):
                work_exp.append({
                    "title": random.choice(req.get("job_titles", ["Professional"])),
                    "years": random.randint(1, 3)
                })

        # Skills
        skills = random.sample(
            req["skills"],
            k=random.randint(max(2, len(req["skills"]) // 2), len(req["skills"]))
        )
        other_skills = ["communication", "teamwork", "problem solving", "critical thinking"]
        skills += random.sample(other_skills, k=random.randint(0, 2))

        # Test score
        test_score = random.randint(req["min_score"], 95)

        return {
            "skills": skills,
            "work_experience": work_exp,
            "test_score": test_score,
            "preferred_domain": domain
        }

    # --- Evaluate candidate profile ---
    def evaluate_candidate(candidate):
        domain = candidate["preferred_domain"]
        req = DOMAIN_REQUIREMENTS[domain]

        skills = set(candidate["skills"])
        work_exp = candidate["work_experience"]
        total_years = sum(job["years"] for job in work_exp)
        relevant_exp = sum(job["years"] for job in work_exp if job["title"] in req.get("job_titles", []))
        test_score = candidate["test_score"]

        matched_skills = skills.intersection(req["skills"])
        skill_ratio = len(matched_skills) / max(1, len(req["skills"]))

        # --- Threshold-based label assignment ---
        if test_score >= 80 and skill_ratio > 0.35 and relevant_exp > 1:
            return "fit"
        elif test_score >= 70 and skill_ratio > 0.25 and relevant_exp >= 1:
            return "partial"
        else:
            # Check if candidate fits another domain
            for alt_domain, alt_req in DOMAIN_REQUIREMENTS.items():
                if alt_domain == domain:
                    continue
                if skills.intersection(alt_req["skills"]) and test_score >= alt_req["min_score"]:
                    return "suggest"
            return "no_fit"

    # --- Generate dataset ---
    dataset = []
    for i in range(total_samples):
        domain = random.choice(domains)
        candidate = generate_candidate_profile(domain)
        candidate["label"] = evaluate_candidate(candidate)
        dataset.append(candidate)

        label_counts[candidate["label"]] += 1
        domain_label_counts[domain][candidate["label"]] += 1

        # Progress log
        if (i + 1) % max(1, total_samples // 10) == 0:
            print(f"🔹 {i + 1}/{total_samples} samples generated")

    # Save dataset
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)

    # Summary
    print("\n📊 Label counts:", dict(label_counts))
    print("📊 Domain-wise label counts:")
    for domain, counts in domain_label_counts.items():
        print(f"{domain}: {dict(counts)}")

    print(f"\n✅ Dataset generation finished in {time.time() - start_time:.2f} seconds.")
    return dataset, label_counts, domain_label_counts


In [28]:
dataset, label_counts, domain_label_counts = generate_dataset(
    domain_path="domain_requirements.json",
    output_path="dataset.json",
    total_samples=500,
    seed=42
)

⏳ Starting dataset generation...
🔹 50/500 samples generated
🔹 100/500 samples generated
🔹 150/500 samples generated
🔹 200/500 samples generated
🔹 250/500 samples generated
🔹 300/500 samples generated
🔹 350/500 samples generated
🔹 400/500 samples generated
🔹 450/500 samples generated
🔹 500/500 samples generated

📊 Label counts: {'suggest': 121, 'no_fit': 79, 'fit': 159, 'partial': 141}
📊 Domain-wise label counts:
Data Science: {'suggest': 6, 'fit': 7, 'partial': 6, 'no_fit': 1}
Web Development: {'partial': 4, 'no_fit': 14, 'fit': 10}
UI/UX Design: {'partial': 10, 'fit': 7, 'no_fit': 7, 'suggest': 4}
Cybersecurity: {'suggest': 10, 'fit': 9, 'partial': 3}
Cloud Computing: {'fit': 13, 'partial': 11, 'suggest': 6, 'no_fit': 4}
Artificial Intelligence: {'partial': 7, 'fit': 23, 'suggest': 3, 'no_fit': 1}
Mobile App Development: {'no_fit': 10, 'partial': 5, 'fit': 7}
DevOps: {'fit': 10, 'partial': 7, 'suggest': 6}
Database Administration: {'partial': 10, 'no_fit': 13, 'fit': 7}
Networking: {'

In [29]:
import pandas as pd
import json
import time  # For timing measurements


# --- Helper to build normalization map ---
def build_normalization_map(DOMAIN_REQUIREMENTS):
    """
    Build a dictionary of skill/title aliases for normalization.
    Example: {"node.js": "node.js", "node": "node.js", "NodeJs": "node.js"}
    """
    alias_map = {}

    def add_alias(word, canonical):
        word_norm = word.strip().lower().replace(" ", "").replace("-", "").replace(".", "")
        alias_map[word_norm] = canonical.lower()

    for domain, req in DOMAIN_REQUIREMENTS.items():
        for skill in req.get("skills", []):
            canonical = skill
            # Add direct and common variations
            add_alias(skill, canonical)
            add_alias(skill.replace(".", ""), canonical)
            add_alias(skill.replace("-", ""), canonical)
            add_alias(skill.replace(" ", ""), canonical)
            add_alias(skill.lower(), canonical)

        for title in req.get("job_titles", []):
            canonical = title
            add_alias(title, canonical)
            add_alias(title.lower(), canonical)

    return alias_map


# --- Normalize a candidate sample ---
def normalize_sample(sample, alias_map):
    """
    Normalize skills and job titles in the sample according to alias_map.
    """
    # Normalize skills
    normalized_skills = []
    for s in sample.get("skills", []):
        key = s.strip().lower().replace(" ", "").replace("-", "").replace(".", "")
        if key in alias_map:
            normalized_skills.append(alias_map[key])
        else:
            normalized_skills.append(s.lower())
    sample["skills"] = list(set(normalized_skills))  # unique

    # Normalize job titles
    for job in sample.get("work_experience", []):
        title = job.get("title", "")
        key = title.strip().lower().replace(" ", "").replace("-", "").replace(".", "")
        if key in alias_map:
            job["title"] = alias_map[key]
        else:
            job["title"] = title.lower()

    return sample


# --- Main dataset update function ---
def update_dataset(dataset_path, domain_path, output_csv, output_parquet):
    start_time = time.time()
    print("⏳ Starting dataset update...")

    # Load your dataset JSON
    t0 = time.time()
    with open(dataset_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    t1 = time.time()
    print(f"📂 Loaded dataset JSON ({len(data)} samples) in {t1 - t0:.4f} seconds.")

    # Load domain requirements
    t2 = time.time()
    with open(domain_path, "r", encoding="utf-8") as f:
        DOMAIN_REQUIREMENTS = json.load(f)
    t3 = time.time()
    print(f"📂 Loaded domain requirements in {t3 - t2:.4f} seconds.")

    # Build normalization map
    alias_map = build_normalization_map(DOMAIN_REQUIREMENTS)

    # Normalize data before feature extraction
    normalized_data = []
    for i, sample in enumerate(data):
        normalized_data.append(normalize_sample(sample, alias_map))
        if (i + 1) % max(1, len(data)//10) == 0:
            print(f"🔹 Normalized {i + 1}/{len(data)} samples...")

    # Helper: convert JSON sample to ML-ready features (no projects anymore)
    def json_to_row(sample):
        skills = set(sample.get("skills", []))
        work_exp = sample.get("work_experience", [])
        total_years = sum(job.get("years", 0) for job in work_exp)
        titles = [job.get("title", "") for job in work_exp]

        domain = sample.get("preferred_domain", "")
        domain_req = DOMAIN_REQUIREMENTS.get(domain, {})

        matched_skills = skills.intersection([s.lower() for s in domain_req.get("skills", [])])
        skill_match_ratio = len(matched_skills) / max(1, len(domain_req.get("skills", [])))

        relevant_experience = sum(
            job.get("years", 0) for job in work_exp if job.get("title", "").lower() in [t.lower() for t in domain_req.get("job_titles", [])]
        )

        return {
            "test_score": sample.get("test_score", 0),
            "skill_match_ratio": skill_match_ratio,
            "relevant_experience": relevant_experience,
            "preferred_domain": domain.lower(),
            "skills_text": " ".join(skills).lower(),
            "titles_text": " ".join(titles).lower(),
            "label": sample.get("label", "").lower()
        }

    # Build dataframe
    t_build_start = time.time()
    rows = []
    for i, x in enumerate(normalized_data):
        rows.append(json_to_row(x))
        if (i + 1) % max(1, len(normalized_data)//10) == 0:
            print(f"🔹 Processed {i + 1}/{len(normalized_data)} samples...")
    df = pd.DataFrame(rows)
    t_build_end = time.time()
    print(f"⏱ Dataframe built in {t_build_end - t_build_start:.2f} seconds.")

    # Save both CSV and Parquet
    t_save_start = time.time()
    df.to_csv(output_csv, index=False)
    df.to_parquet(output_parquet, index=False)
    t_save_end = time.time()
    print(f"💾 Dataset saved as CSV and Parquet in {t_save_end - t_save_start:.4f} seconds.")

    end_time = time.time()
    print(f"✅ Total process finished in {end_time - start_time:.2f} seconds.")


In [30]:
update_dataset(
    dataset_path="dataset.json",       # input dataset generated earlier
    domain_path="domain_requirements.json",         # domain requirements
    output_csv="dataset_clean.csv",    # output CSV
    output_parquet="dataset_clean.parquet"  # output Parquet
)

⏳ Starting dataset update...
📂 Loaded dataset JSON (500 samples) in 0.0269 seconds.
📂 Loaded domain requirements in 0.0000 seconds.
🔹 Normalized 50/500 samples...
🔹 Normalized 100/500 samples...
🔹 Normalized 150/500 samples...
🔹 Normalized 200/500 samples...
🔹 Normalized 250/500 samples...
🔹 Normalized 300/500 samples...
🔹 Normalized 350/500 samples...
🔹 Normalized 400/500 samples...
🔹 Normalized 450/500 samples...
🔹 Normalized 500/500 samples...
🔹 Processed 50/500 samples...
🔹 Processed 100/500 samples...
🔹 Processed 150/500 samples...
🔹 Processed 200/500 samples...
🔹 Processed 250/500 samples...
🔹 Processed 300/500 samples...
🔹 Processed 350/500 samples...
🔹 Processed 400/500 samples...
🔹 Processed 450/500 samples...
🔹 Processed 500/500 samples...
⏱ Dataframe built in 0.01 seconds.
💾 Dataset saved as CSV and Parquet in 0.0000 seconds.
✅ Total process finished in 0.04 seconds.


In [39]:
# prepare_dataset_no_projects.py
import pandas as pd
import re
import time
from sklearn.model_selection import train_test_split

def prepare_dataset(dataset_path, train_output, val_output):
    start_time = time.time()
    print("⏳ Starting dataset preparation...")

    # -------------------------------
    # 1. Load dataset
    # -------------------------------
    t0 = time.time()
    df = pd.read_csv(dataset_path)
    t1 = time.time()
    print(f"📂 Loaded dataset ({len(df)} samples) in {t1 - t0:.4f} seconds.")

    # -------------------------------
    # 2. Basic cleaning
    # -------------------------------
    t_clean_start = time.time()
    df = df.drop_duplicates()
    valid_labels = ["fit", "partial", "suggest", "no_fit"]
    df = df[df["label"].isin(valid_labels)]

    # Fill missing values (remove project related columns)
    df = df.fillna({
        "test_score": 0,
        "skills_text": "",
        "titles_text": "",
        "preferred_domain": "",
        "skill_match_ratio": 0,
        "relevant_experience": 0
    })

    # Clip numeric columns (remove project_match)
    df["test_score"] = df["test_score"].clip(0, 100)
    df["skill_match_ratio"] = df["skill_match_ratio"].clip(0, 1)
    df["relevant_experience"] = df["relevant_experience"].clip(lower=0)

    t_clean_end = time.time()
    print(f"🧹 Basic cleaning completed in {t_clean_end - t_clean_start:.4f} seconds.")

    # -------------------------------
    # 3. Text cleaning
    # -------------------------------
    t_text_start = time.time()
    def clean_text(s):
        s = re.sub(r"[^a-z0-9\s]", "", str(s).lower())
        return s.strip()
    
    for col in ["skills_text", "titles_text", "preferred_domain"]:
        df[col] = df[col].apply(clean_text)
    t_text_end = time.time()
    print(f"📝 Text cleaning completed in {t_text_end - t_text_start:.4f} seconds.")

    # -------------------------------
    # 4. Manual oversampling to balance labels
    # -------------------------------
    t_balance_start = time.time()
    print("Before balancing:", df["label"].value_counts().to_dict())

    max_count = df["label"].value_counts().max()
    balanced_dfs = []
    for label, group in df.groupby("label"):
        n_repeat = max_count // len(group)
        remainder = max_count % len(group)
        oversampled_group = pd.concat([group]*n_repeat + [group.sample(remainder, random_state=42)])
        balanced_dfs.append(oversampled_group)

    balanced_df = pd.concat(balanced_dfs).reset_index(drop=True)
    print("After balancing:", balanced_df["label"].value_counts().to_dict())
    t_balance_end = time.time()
    print(f"⚖️ Oversampling completed in {t_balance_end - t_balance_start:.4f} seconds.")

    # -------------------------------
    # 5. Train/Validation split
    # -------------------------------
    t_split_start = time.time()
    train_df, val_df = train_test_split(
        balanced_df,
        test_size=0.2,
        random_state=42,
        stratify=balanced_df["label"],
        shuffle=True
    )
    t_split_end = time.time()
    print(f"🔹 Train/Validation split completed in {t_split_end - t_split_start:.4f} seconds.")

    # Save datasets
    t_save_start = time.time()
    train_df.to_csv(train_output, index=False)
    val_df.to_csv(val_output, index=False)
    t_save_end = time.time()
    print(f"💾 Train/Validation datasets saved in {t_save_end - t_save_start:.4f} seconds.")

    end_time = time.time()
    print(f"✅ Dataset preparation finished in {end_time - start_time:.2f} seconds.")
    print("Train size:", len(train_df), "Validation size:", len(val_df))

    return train_df, val_df


In [52]:
# If the function is already in the notebook:
train_df, val_df = prepare_dataset(
    dataset_path="dataset_clean.csv",     # Input dataset
    train_output="train_dataset.csv",    # Where train split will be saved
    val_output="val_dataset.csv"         # Where validation split will be saved
)


⏳ Starting dataset preparation...
📂 Loaded dataset (500 samples) in 0.0111 seconds.
🧹 Basic cleaning completed in 0.0065 seconds.
📝 Text cleaning completed in 0.0035 seconds.
Before balancing: {'fit': 159, 'partial': 141, 'suggest': 121, 'no_fit': 79}
After balancing: {'no_fit': 159, 'suggest': 159, 'fit': 159, 'partial': 159}
⚖️ Oversampling completed in 0.0062 seconds.
🔹 Train/Validation split completed in 0.0000 seconds.
💾 Train/Validation datasets saved in 0.0186 seconds.
✅ Dataset preparation finished in 0.05 seconds.
Train size: 508 Validation size: 128
