In [2]:
pip install pandas numpy 

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl.metadata
  Using cached pandas-2.3.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/fc/f0/74965001d231f28184d6305b8cdc1b6fcd4bf23033f6cb039cfe76c9fca7/numpy-2.4.0-cp311-cp311-win_amd64.whl.metadata
  Using cached numpy-2.4.0-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl.metadata
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Using cached pandas-2.3.3-cp311-cp311-win_amd64.whl (11.3 MB)
Using cached numpy-2.4.0-cp311-cp311-win_a


[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta

# Seed both random sources used by this cell (stdlib `random` and NumPy's
# global generator) so repeated runs produce the same dataset.
np.random.seed(7)
random.seed(7)

# =====================================================
# DATE RANGE
# =====================================================
# START_DATE anchors the job opening dates generated further down.
# END_DATE is not referenced by the generation code visible in this cell.
START_DATE, END_DATE = (
    pd.to_datetime(day) for day in ("2024-04-01", "2025-03-31")
)

# =====================================================
# COUNTS
# =====================================================
# Number of rows to generate for each table.
NUM_CANDIDATES = 1_100
NUM_JOBS = 25
NUM_APPLICATIONS = 1_650

# =====================================================
# NAME POOL (20 FIRST NAMES x 18 LAST NAMES = 360 COMBINATIONS)
# =====================================================
# Each pool is written as one whitespace-separated string and split into a
# list; none of the names contain spaces, so every token is one name.
first_names = (
    "Aarav Rahul Amit Rohit Kunal Suresh Vikas Ankit Karthik Nikhil "
    "Sneha Pooja Neha Riya Anjali Kavya Priya Swati Isha Shreya"
).split()

last_names = (
    "Sharma Verma Gupta Patel Mehta Kulkarni Iyer Reddy Naik Joshi "
    "Singh Chatterjee Banerjee Das Malhotra Kapoor Bansal Agarwal"
).split()

# =====================================================
# CANDIDATES
# =====================================================
# Columns are generated one at a time, in the order below, so the seeded
# `random` and `np.random` streams are consumed in a fixed sequence:
# all names first, then all phone numbers, then experience, then location.
candidate_ids = range(1, NUM_CANDIDATES + 1)

# Full name = one random first name + one random last name.
candidate_names = [
    " ".join((random.choice(first_names), random.choice(last_names)))
    for _ in candidate_ids
]

# Emails are derived from the candidate id, so they are unique by construction.
candidate_emails = [f"candidate{cid}@email.com" for cid in candidate_ids]

# Ten-digit phone strings: a leading "9" followed by a random nine-digit number.
candidate_phones = [
    f"9{random.randint(100_000_000, 999_999_999)}" for _ in candidate_ids
]

# Years of experience drawn uniformly from [0.5, 15), rounded to one decimal.
candidate_experience = np.random.uniform(0.5, 15, size=NUM_CANDIDATES).round(1)

candidate_locations = np.random.choice(
    ["Bangalore", "Pune", "Hyderabad", "Chennai", "Mumbai", "Noida"],
    size=NUM_CANDIDATES,
)

candidates = pd.DataFrame({
    "candidate_id": candidate_ids,
    "candidate_name": candidate_names,
    "email": candidate_emails,
    "phone": candidate_phones,
    "experience_years": candidate_experience,
    "current_location": candidate_locations,
})

# =====================================================
# JOBS
# =====================================================
job_titles = [
    "Data Analyst",
    "Data Scientist",
    "Backend Developer",
    "Frontend Developer",
    "QA Engineer",
]
job_locations = ["Bangalore", "Pune", "Hyderabad"]

# One row per job. The random draws happen in a fixed order per job
# (open offset, closed-or-not, close offset, title, location) so the
# seeded `random` stream yields the same rows on every run.
jobs_data = []
for job_number in range(1, NUM_JOBS + 1):
    # Every job opens on START_DATE or up to 90 days before it.
    opened_on = START_DATE - timedelta(days=random.randint(0, 90))

    # About 70% of jobs get a close date 90-210 days after opening;
    # the rest stay open (close date left as None).
    closed_on = None
    if random.random() < 0.7:
        closed_on = opened_on + timedelta(days=random.randint(90, 210))

    title = random.choice(job_titles)
    location = random.choice(job_locations)

    jobs_data.append((job_number, title, "IT", location, opened_on, closed_on))

jobs = pd.DataFrame(
    jobs_data,
    columns=[
        "job_id",
        "job_title",
        "department",
        "job_location",
        "open_date",
        "close_date",
    ],
)

# =====================================================
# APPLICATIONS
# =====================================================
# Each application is attached to one randomly sampled job and dated a random
# number of days after that job opened. The offset is at most 300 days and,
# for jobs that have a close date, never extends past that close date, so no
# application is recorded against a posting that had already closed.
MAX_APPLICATION_LAG_DAYS = 300

application_rows = []
for app_id in range(1, NUM_APPLICATIONS + 1):
    # DataFrame.sample with no random_state draws from NumPy's global RNG.
    job = jobs.sample(1).iloc[0]

    max_lag_days = MAX_APPLICATION_LAG_DAYS
    # close_date is missing (None / NaT) for jobs that are still open.
    if not pd.isna(job["close_date"]):
        days_open = (job["close_date"] - job["open_date"]).days
        max_lag_days = min(max_lag_days, days_open)
    # randint(1, n) needs n >= 1; keep a one-day floor for degenerate jobs.
    max_lag_days = max(max_lag_days, 1)

    app_date = job["open_date"] + timedelta(days=random.randint(1, max_lag_days))

    application_rows.append([
        app_id,
        random.randint(1, NUM_CANDIDATES),   # candidate_id
        job["job_id"],
        random.choice(["LinkedIn","Naukri","Referral","Company Website"]),
        random.randint(1, 12),               # recruiter_id
        app_date,
        None  # current_status: filled in once the stage events are simulated
    ])

applications = pd.DataFrame(application_rows, columns=[
    "application_id","candidate_id","job_id",
    "source","recruiter_id","application_date","current_status"
])

# =====================================================
# STAGE DEFINITIONS
# =====================================================
# Candidate drop reasons, keyed by pipeline stage. The keys are listed in
# pipeline order (first stage to last).
drop_reasons = {
    "Resume Screening": [
        "Skill mismatch",
        "Experience mismatch",
        "Profile not relevant",
    ],
    "Technical Interview": [
        "Technical gap",
        "Low problem-solving",
        "Domain mismatch",
    ],
    "Managerial Interview": [
        "Team fit issue",
        "Project mismatch",
    ],
    "HR Interview": [
        "Salary mismatch",
        "Location constraint",
        "Notice period issue",
    ],
    "Offer Made": [
        "Better offer",
        "Compensation issue",
        "Location issue",
    ],
    "Joined": [
        "Counter offer",
        "Personal reasons",
    ],
}

# Ordered list of stage names; dicts preserve insertion order, so this is
# exactly the key order written above.
stages = list(drop_reasons)

# Probability that an application drops out at each stage. Defined once here
# rather than being rebuilt for every stage of every application.
# NOTE(review): the "Joined" entry (and drop_reasons["Joined"]) is never
# applied, because the drop check below explicitly skips the "Joined" stage.
# Confirm whether post-offer no-shows were meant to be simulated.
stage_drop_probability = {
    "Resume Screening": 0.30,
    "Technical Interview": 0.25,
    "Managerial Interview": 0.20,
    "HR Interview": 0.15,
    "Offer Made": 0.20,
    "Joined": 0.10
}

stage_event_rows = []
final_statuses = []   # one entry per application, in row order

for app in applications.itertuples(index=False):
    stage_date = app.application_date
    final_status = "Rejected"

    for stage in stages:
        # Each stage happens 2-7 days after the previous one.
        stage_date += timedelta(days=random.randint(2, 7))

        # random.random() is drawn for every stage, including "Joined", so the
        # seeded stream is consumed exactly as before; the result is simply
        # ignored for "Joined".
        dropped = random.random() < stage_drop_probability[stage] and stage != "Joined"

        if dropped:
            # A drop at the offer stage is the candidate declining; any
            # earlier drop is a rejection.
            offer_declined = stage == "Offer Made"
            stage_event_rows.append([
                len(stage_event_rows) + 1,
                app.application_id,
                stage,
                stage_date,
                "Declined" if offer_declined else "Rejected",
                random.choice(drop_reasons[stage])
            ])
            final_status = "Offer Declined" if offer_declined else "Rejected"
            break

        if stage == "Offer Made":
            status = "Accepted"
        elif stage == "Joined":
            status = "Joined"
        else:
            status = "Selected"

        stage_event_rows.append([
            len(stage_event_rows) + 1,
            app.application_id,
            stage,
            stage_date,
            status,
            None   # no drop reason when the stage was passed
        ])

        if stage == "Joined":
            final_status = "Hired"

    final_statuses.append(final_status)

# Write all statuses in a single column assignment instead of one
# boolean-mask lookup per application inside the loop. The list is in the same
# row order as `applications`, so the assignment is positional.
applications["current_status"] = final_statuses

stage_events = pd.DataFrame(stage_event_rows, columns=[
    "stage_event_id","application_id","stage_name",
    "stage_date","stage_status","drop_reason"
])

# =====================================================
# SAVE DATA
# =====================================================
# Each table is written to a CSV file in the current working directory,
# without the DataFrame index column.
csv_outputs = {
    "candidates.csv": candidates,
    "jobs.csv": jobs,
    "applications.csv": applications,
    "recruitment_stage_events.csv": stage_events,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)

print("✅ REAL-WORLD recruitment dataset generated successfully.")


✅ REAL-WORLD recruitment dataset generated successfully.
