In [38]:
# Time: O(total_rows * total_cols) to read + write
# Space: O(total_rows * total_cols) for the combined DataFrame in memory

import pandas as pd
import glob

# point this to your folder or list files explicitly
files = [
    "ai_job_dataset.csv",
    "AI-based Career Recommendation System.csv",
    "DataScience_salaries_2024.csv",
]
# OR: files = sorted(glob.glob("data/*.csv"))

def normalize(col: str) -> str:
    """Turn a raw column header into a lower-case, underscore-separated name.

    Surrounding whitespace is stripped, the text is lower-cased, every run of
    characters outside ``[a-z0-9]`` becomes a single underscore, and leading or
    trailing underscores are removed.
    """
    import re  # imported locally, as in the original cell

    lowered = col.strip().lower()
    slug = re.sub(r"[^a-z0-9]+", "_", lowered)
    slug = re.sub(r"_+", "_", slug)
    return slug.strip("_")

# Read every input file, normalise its headers and tag each row with its origin.
dfs = []
for f in files:
    try:
        df = pd.read_csv(f)
    except UnicodeDecodeError:
        # The file is not valid UTF-8: retry with latin-1.
        df = pd.read_csv(f, encoding="latin-1")
    df.columns = [normalize(c) for c in df.columns]
    df["source_file"] = f
    dfs.append(df)

# Stack all rows into one frame with a fresh 0..n-1 index.
merged = pd.concat(dfs, ignore_index=True, sort=False)
# Save result
merged.to_csv("merged_all_rows.csv", index=False)
# Preview: only the last expression of a cell is displayed, so it goes last.
merged.head()


In [39]:
df=pd.read_csv('merged_all_rows.csv')

In [5]:
df.head()

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,...,name,age,education,skills,interests,recommended_career,recommendation_score,work_year,salary,salary_in_usd
0,AI00001,AI Research Scientist,90376.0,USD,SE,CT,China,M,China,50.0,...,,,,,,,,,,
1,AI00002,AI Software Engineer,61895.0,USD,EN,CT,Canada,M,Ireland,100.0,...,,,,,,,,,,
2,AI00003,AI Specialist,152626.0,USD,MI,FL,Switzerland,L,South Korea,0.0,...,,,,,,,,,,
3,AI00004,NLP Engineer,80215.0,USD,SE,FL,India,M,India,50.0,...,,,,,,,,,,
4,AI00005,AI Consultant,54624.0,EUR,EN,PT,France,S,Singapore,100.0,...,,,,,,,,,,


In [40]:
df = df.drop(columns=["job_id"])

In [41]:
df = df.drop(columns=["salary_in_usd"])

In [42]:
df = df.drop(columns=["recommendation_score"])

In [43]:
df = df.drop(columns=["recommended_career"])

In [44]:
!pip install faker



In [45]:
from faker import Faker
fake = Faker()

df["name"] = [fake.name() for _ in range(len(df))]


In [46]:
if "name" in df.columns:
    cols = ["name"] + [c for c in df.columns if c != "name"]
    df = df[cols]

df.head()

Unnamed: 0,name,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,...,benefits_score,company_name,source_file,candidateid,age,education,skills,interests,work_year,salary
0,Matthew Hernandez,AI Research Scientist,90376.0,USD,SE,CT,China,M,China,50.0,...,5.9,Smart Analytics,ai_job_dataset.csv,,,,,,,
1,Tyler Mcdonald,AI Software Engineer,61895.0,USD,EN,CT,Canada,M,Ireland,100.0,...,5.2,TechCorp Inc,ai_job_dataset.csv,,,,,,,
2,Stacey Brown,AI Specialist,152626.0,USD,MI,FL,Switzerland,L,South Korea,0.0,...,9.4,Autonomous Tech,ai_job_dataset.csv,,,,,,,
3,Wayne Mcdonald,NLP Engineer,80215.0,USD,SE,FL,India,M,India,50.0,...,8.6,Future Systems,ai_job_dataset.csv,,,,,,,
4,Lynn Mitchell,AI Consultant,54624.0,EUR,EN,PT,France,S,Singapore,100.0,...,6.6,Advanced Robotics,ai_job_dataset.csv,,,,,,,


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30038 entries, 0 to 30037
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    30038 non-null  object 
 1   job_title               29838 non-null  object 
 2   salary_usd              15000 non-null  float64
 3   salary_currency         29838 non-null  object 
 4   experience_level        29838 non-null  object 
 5   employment_type         29838 non-null  object 
 6   company_location        29838 non-null  object 
 7   company_size            29838 non-null  object 
 8   employee_residence      29838 non-null  object 
 9   remote_ratio            29838 non-null  float64
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  float64
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

In [48]:
import numpy as np
def fake_salary(title):
    """Draw a random integer salary whose range depends on keywords in ``title``.

    The title is converted with ``str()`` and lower-cased, then checked in this
    order: "intern", "junior"/"fresher", "senior"; anything else uses the
    default range. The upper bound passed to ``randint`` is exclusive, so each
    range below includes its round upper figure.
    """
    text = str(title).lower()
    if "intern" in text:
        low, high = 10000, 30001    # 10k–30k
    elif "junior" in text or "fresher" in text:
        low, high = 30000, 60001    # 30k–60k
    elif "senior" in text:
        low, high = 80000, 150001   # 80k–150k
    else:
        low, high = 50000, 120001   # default
    return np.random.randint(low, high)

df["salary"] = df["job_title"].apply(fake_salary)
df.head()

Unnamed: 0,name,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,...,benefits_score,company_name,source_file,candidateid,age,education,skills,interests,work_year,salary
0,Matthew Hernandez,AI Research Scientist,90376.0,USD,SE,CT,China,M,China,50.0,...,5.9,Smart Analytics,ai_job_dataset.csv,,,,,,,90886
1,Tyler Mcdonald,AI Software Engineer,61895.0,USD,EN,CT,Canada,M,Ireland,100.0,...,5.2,TechCorp Inc,ai_job_dataset.csv,,,,,,,88450
2,Stacey Brown,AI Specialist,152626.0,USD,MI,FL,Switzerland,L,South Korea,0.0,...,9.4,Autonomous Tech,ai_job_dataset.csv,,,,,,,65087
3,Wayne Mcdonald,NLP Engineer,80215.0,USD,SE,FL,India,M,India,50.0,...,8.6,Future Systems,ai_job_dataset.csv,,,,,,,62121
4,Lynn Mitchell,AI Consultant,54624.0,EUR,EN,PT,France,S,Singapore,100.0,...,6.6,Advanced Robotics,ai_job_dataset.csv,,,,,,,56739


In [49]:
df = df.drop(columns=["work_year"])


In [50]:
experience_levels = ["EN", "MI", "SE", "EX"]


In [51]:
import numpy as np

# Randomly assign experience levels to each row
df["experience_level"] = np.random.choice(experience_levels, size=len(df))


In [52]:
def fake_experience(title):
    """Derive an experience-level code from keywords in a job title.

    Returns "EN" for intern/junior titles, "SE" for senior titles, "EX" for
    manager/lead titles, and otherwise a random choice between "MI" and "SE".
    Checks run in that order, so the first matching keyword group wins.
    """
    text = str(title).lower()
    if any(word in text for word in ("intern", "junior")):
        return "EN"
    if "senior" in text:
        return "SE"
    if any(word in text for word in ("manager", "lead")):
        return "EX"
    return np.random.choice(["MI", "SE"])

df["experience_level"] = df["job_title"].apply(fake_experience)


In [53]:
df.head()


Unnamed: 0,name,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,...,job_description_length,benefits_score,company_name,source_file,candidateid,age,education,skills,interests,salary
0,Matthew Hernandez,AI Research Scientist,90376.0,USD,SE,CT,China,M,China,50.0,...,1076.0,5.9,Smart Analytics,ai_job_dataset.csv,,,,,,90886
1,Tyler Mcdonald,AI Software Engineer,61895.0,USD,SE,CT,Canada,M,Ireland,100.0,...,1268.0,5.2,TechCorp Inc,ai_job_dataset.csv,,,,,,88450
2,Stacey Brown,AI Specialist,152626.0,USD,SE,FL,Switzerland,L,South Korea,0.0,...,1974.0,9.4,Autonomous Tech,ai_job_dataset.csv,,,,,,65087
3,Wayne Mcdonald,NLP Engineer,80215.0,USD,MI,FL,India,M,India,50.0,...,1345.0,8.6,Future Systems,ai_job_dataset.csv,,,,,,62121
4,Lynn Mitchell,AI Consultant,54624.0,EUR,MI,PT,France,S,Singapore,100.0,...,1989.0,6.6,Advanced Robotics,ai_job_dataset.csv,,,,,,56739


In [25]:
df = df.drop(columns=["candidateid"])

In [54]:
import numpy as np

def fake_age(level):
    """Draw a random age for an experience-level code.

    Ranges (inclusive): EN 21–26, MI 27–35, SE 30–45, EX 35–60; any other
    value falls back to 22–49.
    """
    # (code, low, exclusive high) for each known level
    brackets = (
        ("EN", 21, 27),   # entry
        ("MI", 27, 36),   # mid
        ("SE", 30, 46),   # senior
        ("EX", 35, 61),   # executive
    )
    low, high = 22, 50    # fallback
    for code, lo, hi in brackets:
        if level == code:
            low, high = lo, hi
            break
    return np.random.randint(low, high)
'''
EN → Entry-level

MI → Mid-level

SE → Senior

EX → Executive'''

'\nEN → Entry-level\n\nMI → Mid-level\n\nSE → Senior\n\nEX → Executive'

In [55]:
df["age"] = df["experience_level"].apply(fake_age)
df.head(6)

Unnamed: 0,name,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,...,job_description_length,benefits_score,company_name,source_file,candidateid,age,education,skills,interests,salary
0,Matthew Hernandez,AI Research Scientist,90376.0,USD,SE,CT,China,M,China,50.0,...,1076.0,5.9,Smart Analytics,ai_job_dataset.csv,,41,,,,90886
1,Tyler Mcdonald,AI Software Engineer,61895.0,USD,SE,CT,Canada,M,Ireland,100.0,...,1268.0,5.2,TechCorp Inc,ai_job_dataset.csv,,41,,,,88450
2,Stacey Brown,AI Specialist,152626.0,USD,SE,FL,Switzerland,L,South Korea,0.0,...,1974.0,9.4,Autonomous Tech,ai_job_dataset.csv,,44,,,,65087
3,Wayne Mcdonald,NLP Engineer,80215.0,USD,MI,FL,India,M,India,50.0,...,1345.0,8.6,Future Systems,ai_job_dataset.csv,,31,,,,62121
4,Lynn Mitchell,AI Consultant,54624.0,EUR,MI,PT,France,S,Singapore,100.0,...,1989.0,6.6,Advanced Robotics,ai_job_dataset.csv,,33,,,,56739
5,Dana Terrell,AI Architect,123574.0,EUR,SE,CT,Germany,M,Germany,50.0,...,819.0,5.9,Neural Networks Co,ai_job_dataset.csv,,30,,,,72311


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30038 entries, 0 to 30037
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    30038 non-null  object 
 1   job_title               29838 non-null  object 
 2   salary_usd              15000 non-null  float64
 3   salary_currency         29838 non-null  object 
 4   experience_level        30038 non-null  object 
 5   employment_type         29838 non-null  object 
 6   company_location        29838 non-null  object 
 7   company_size            29838 non-null  object 
 8   employee_residence      29838 non-null  object 
 9   remote_ratio            29838 non-null  float64
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  float64
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

In [57]:
job_titles = {
    "EN": ["Intern", "Junior Data Analyst", "Trainee Engineer", "Assistant Developer"],
    "MI": ["Data Analyst", "Software Engineer", "Machine Learning Engineer", "Backend Developer"],
    "SE": ["Senior Data Scientist", "Senior Software Engineer", "AI Specialist", "Tech Lead"],
    "EX": ["Engineering Manager", "Head of AI", "CTO", "Principal Data Scientist"]
}
import numpy as np

def fake_job_title(level):
    """Pick a random title for ``level`` from the notebook-level ``job_titles``.

    Levels that are not keys of ``job_titles`` yield "Employee".
    """
    options = job_titles.get(level, ["Employee"])
    return np.random.choice(options)

df["job_title"] = df["experience_level"].apply(fake_job_title)


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30038 entries, 0 to 30037
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    30038 non-null  object 
 1   job_title               30038 non-null  object 
 2   salary_usd              15000 non-null  float64
 3   salary_currency         29838 non-null  object 
 4   experience_level        30038 non-null  object 
 5   employment_type         29838 non-null  object 
 6   company_location        29838 non-null  object 
 7   company_size            29838 non-null  object 
 8   employee_residence      29838 non-null  object 
 9   remote_ratio            29838 non-null  float64
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  float64
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

In [58]:
df.head(50)

Unnamed: 0,name,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,...,job_description_length,benefits_score,company_name,source_file,candidateid,age,education,skills,interests,salary
0,Matthew Hernandez,Senior Data Scientist,90376.0,USD,SE,CT,China,M,China,50.0,...,1076.0,5.9,Smart Analytics,ai_job_dataset.csv,,41,,,,90886
1,Tyler Mcdonald,Senior Software Engineer,61895.0,USD,SE,CT,Canada,M,Ireland,100.0,...,1268.0,5.2,TechCorp Inc,ai_job_dataset.csv,,41,,,,88450
2,Stacey Brown,Tech Lead,152626.0,USD,SE,FL,Switzerland,L,South Korea,0.0,...,1974.0,9.4,Autonomous Tech,ai_job_dataset.csv,,44,,,,65087
3,Wayne Mcdonald,Backend Developer,80215.0,USD,MI,FL,India,M,India,50.0,...,1345.0,8.6,Future Systems,ai_job_dataset.csv,,31,,,,62121
4,Lynn Mitchell,Software Engineer,54624.0,EUR,MI,PT,France,S,Singapore,100.0,...,1989.0,6.6,Advanced Robotics,ai_job_dataset.csv,,33,,,,56739
5,Dana Terrell,Tech Lead,123574.0,EUR,SE,CT,Germany,M,Germany,50.0,...,819.0,5.9,Neural Networks Co,ai_job_dataset.csv,,30,,,,72311
6,James Juarez,Backend Developer,79670.0,GBP,MI,FL,United Kingdom,S,United Kingdom,0.0,...,1936.0,6.3,DataVision Ltd,ai_job_dataset.csv,,27,,,,87382
7,Jake Davis,Backend Developer,70640.0,EUR,MI,FL,France,L,France,0.0,...,1286.0,7.6,Cloud AI Solutions,ai_job_dataset.csv,,33,,,,81060
8,Mary Hogan,Senior Data Scientist,160710.0,USD,SE,CT,Singapore,L,Singapore,0.0,...,551.0,9.3,Quantum Computing Inc,ai_job_dataset.csv,,42,,,,112168
9,Jessica Jones,AI Specialist,102557.0,USD,SE,PT,Austria,M,Austria,0.0,...,2340.0,5.8,Cloud AI Solutions,ai_job_dataset.csv,,32,,,,107634


In [59]:
# --- Consistent skills generator (keeps all existing columns) ---
import numpy as np
import pandas as pd
import re

# (Optional) reproducibility
# np.random.seed(42)

# Helper: safe getters so we don't alter your existing columns
def get_val(series, i, default):
    """Return the value at position ``i`` of ``series``, or ``default``.

    ``default`` is returned when the value is missing (``pd.isna``) or when
    the lookup fails for any reason (e.g. position out of range).
    """
    try:
        value = series.iat[i]
        return default if pd.isna(value) else value
    except Exception:
        return default

# Skill pools
FOUNDATIONAL = [
    "Python", "Git", "Linux", "SQL", "Data Analysis", "Communication", "Teamwork",
    "Problem-Solving", "Excel"
]
SOFTWARE = [
    "JavaScript", "TypeScript", "React", "Node.js", "REST APIs",
    "Java", "C++", "Testing/QA", "System Design"
]
DATA = [
    "Pandas", "NumPy", "Data Cleaning", "Data Visualization", "ETL",
    "Power BI", "Tableau"
]
ML_CORE = [
    "Machine Learning", "Scikit-learn", "Feature Engineering",
    "Model Evaluation", "MLOps Basics"
]
ML_ADV = [
    "Deep Learning", "PyTorch", "TensorFlow", "NLP", "Computer Vision",
    "Recommendation Systems"
]
CLOUD_DEVOPS = [
    "AWS", "GCP", "Azure", "Docker", "Kubernetes", "Airflow", "Spark", "Kafka"
]
LEADERSHIP = [
    "Leadership", "Mentoring", "Project Management", "Stakeholder Management",
    "Architecture", "Roadmapping"
]
SECURITY = ["Security Basics", "Auth/OAuth", "RBAC", "OWASP Top 10"]

def job_family(title: str) -> str:
    """Classify a job title into a coarse family used to choose skill pools.

    The title is converted with ``str()``, lower-cased and tested against
    whole-word keyword groups in a fixed priority order; the first group that
    matches wins: "data", then "ml", then "software", then "lead". Titles that
    match nothing are "general". Because "data" is tested first, a title such
    as "Senior Data Scientist" is classified as "data".

    The keyword groups include ``devops`` and ``director`` so this definition
    agrees with the later ``job_family`` definitions in this notebook, and the
    result does not depend on which cell was executed last.
    """
    t = str(title).lower()
    if re.search(r"\b(data|analyst|bi)\b", t):
        return "data"
    if re.search(r"\b(ml|ai|scientist|nlp|vision|recommendation)\b", t):
        return "ml"
    if re.search(r"\b(engineer|developer|backend|frontend|full\s*stack|devops)\b", t):
        return "software"
    if re.search(r"\b(manager|lead|head|cto|principal|director)\b", t):
        return "lead"
    return "general"

def skill_count(level: str, age: int) -> int:
    """Number of skills to assign: a per-level base, plus one from age 35 up.

    Bases are EN=3, MI=4, SE=5, EX=6 (4 for any other level); the result is
    clipped to the range 3–8.
    """
    base_by_level = {"EN": 3, "MI": 4, "SE": 5, "EX": 6}
    count = base_by_level.get(str(level), 4)
    if age >= 35:
        count += 1
    return int(np.clip(count, 3, 8))

def candidate_skills(level: str, salary: float, title: str):
    """Build the pool of skills a person may be assigned.

    The pool always contains FOUNDATIONAL and DATA. Further groups are added
    according to the job family of ``title`` (see ``job_family``), then for
    senior/executive levels ("SE"/"EX"), then for salary thresholds
    (>= 80000 adds CLOUD_DEVOPS; >= 120000 also adds ML_ADV and
    "System Design").

    Returns:
        A list of unique skill names in sorted order. The order is fixed so
        that position-based sampling by the caller is reproducible under a
        seeded RNG; iterating a set of strings directly can give a different
        order in every Python process because of string-hash randomisation.
    """
    fam = job_family(title)
    pool = set(FOUNDATIONAL + DATA)

    if fam == "software":
        pool.update(SOFTWARE + CLOUD_DEVOPS + SECURITY)
    elif fam == "data":
        pool.update(DATA + ML_CORE)
    elif fam == "ml":
        pool.update(ML_CORE + ML_ADV + CLOUD_DEVOPS)
    elif fam == "lead":
        pool.update(SOFTWARE + DATA + ML_CORE + LEADERSHIP)
    else:
        pool.update(DATA + SOFTWARE)

    # experience enrichment
    if str(level) in ["SE", "EX"]:
        pool.update(LEADERSHIP)
        pool.update(CLOUD_DEVOPS)
        pool.update(ML_ADV)

    # salary enrichment (tune thresholds to your currency)
    pay = float(salary)
    if pay >= 120000:
        pool.update(CLOUD_DEVOPS + ML_ADV + ["System Design"])
    elif pay >= 80000:
        pool.update(CLOUD_DEVOPS)

    # Deterministic order (see docstring).
    return sorted(pool)

def pick_skills(level: str, age: int, salary: float, title: str):
    """Sample a comma-separated skill string for one person.

    The candidate pool comes from ``candidate_skills`` and the number of
    skills from ``skill_count``. Skills are drawn without replacement;
    "advanced" skills are down-weighted for EN/MI and up-weighted for SE and
    every other level, relative to a weight of 1.0 for ordinary skills.
    """
    candidates = candidate_skills(level, salary, title)
    wanted = skill_count(level, age)

    advanced = set(ML_ADV + CLOUD_DEVOPS + LEADERSHIP + ["System Design", "Kafka", "Spark"])
    # Weight of an advanced skill per level; 1.25 covers EX and anything else.
    advanced_weight = {"EN": 0.6, "MI": 0.85, "SE": 1.15}.get(level, 1.25)

    raw = [advanced_weight if skill in advanced else 1.0 for skill in candidates]
    probs = np.array(raw, dtype=float)
    probs /= probs.sum()

    take = min(wanted, len(candidates))
    chosen = np.random.choice(len(candidates), size=take, replace=False, p=probs)
    return ", ".join([candidates[k] for k in chosen])

# Build skills WITHOUT touching any other columns
exp_series   = df["experience_level"] if "experience_level" in df.columns else pd.Series([None]*len(df))
age_series   = df["age"]               if "age"               in df.columns else pd.Series([None]*len(df))
sal_series   = df["salary"]            if "salary"            in df.columns else pd.Series([None]*len(df))
title_series = df["job_title"]         if "job_title"         in df.columns else pd.Series([None]*len(df))

# Generate one skills string per row, by position. get_val() substitutes the
# given default whenever a value is missing or cannot be read.
skills_col = []
for i in range(len(df)):
    level  = str(get_val(exp_series, i, "MI"))
    age    = int(get_val(age_series, i, 28))
    salary = float(get_val(sal_series, i, 90000))
    title  = str(get_val(title_series, i, "Software Engineer"))
    skills_col.append(pick_skills(level, age, salary, title))

# Only add the new column
df["skills"] = skills_col

print("✅ Done. Row count preserved:", len(df))
print("✅ Column count now (original + 'skills'):", len(df.columns))
# df.to_csv("employees_with_consistent_skills.csv", index=False)


✅ Done. Row count preserved: 30038
✅ Column count now (original + 'skills'): 26


In [60]:
df.head(50)

Unnamed: 0,name,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,...,job_description_length,benefits_score,company_name,source_file,candidateid,age,education,skills,interests,salary
0,Matthew Hernandez,Senior Data Scientist,90376.0,USD,SE,CT,China,M,China,50.0,...,1076.0,5.9,Smart Analytics,ai_job_dataset.csv,,41,,"ETL, Architecture, Power BI, Kafka, NLP, Spark",,90886
1,Tyler Mcdonald,Senior Software Engineer,61895.0,USD,SE,CT,Canada,M,Ireland,100.0,...,1268.0,5.2,TechCorp Inc,ai_job_dataset.csv,,41,,"AWS, NLP, Recommendation Systems, Deep Learnin...",,88450
2,Stacey Brown,Tech Lead,152626.0,USD,SE,FL,Switzerland,L,South Korea,0.0,...,1974.0,9.4,Autonomous Tech,ai_job_dataset.csv,,44,,"React, Linux, Machine Learning, Java, Power BI...",,65087
3,Wayne Mcdonald,Backend Developer,80215.0,USD,MI,FL,India,M,India,50.0,...,1345.0,8.6,Future Systems,ai_job_dataset.csv,,31,,"Communication, Data Analysis, React, Kubernetes",,62121
4,Lynn Mitchell,Software Engineer,54624.0,EUR,MI,PT,France,S,Singapore,100.0,...,1989.0,6.6,Advanced Robotics,ai_job_dataset.csv,,33,,"NumPy, Spark, Kafka, Testing/QA",,56739
5,Dana Terrell,Tech Lead,123574.0,EUR,SE,CT,Germany,M,Germany,50.0,...,819.0,5.9,Neural Networks Co,ai_job_dataset.csv,,30,,"Linux, Java, Stakeholder Management, C++, Scik...",,72311
6,James Juarez,Backend Developer,79670.0,GBP,MI,FL,United Kingdom,S,United Kingdom,0.0,...,1936.0,6.3,DataVision Ltd,ai_job_dataset.csv,,27,,"Git, Kafka, Teamwork, React",,87382
7,Jake Davis,Backend Developer,70640.0,EUR,MI,FL,France,L,France,0.0,...,1286.0,7.6,Cloud AI Solutions,ai_job_dataset.csv,,33,,"Kafka, Tableau, Git, Python",,81060
8,Mary Hogan,Senior Data Scientist,160710.0,USD,SE,CT,Singapore,L,Singapore,0.0,...,551.0,9.3,Quantum Computing Inc,ai_job_dataset.csv,,42,,"Data Visualization, ETL, Feature Engineering, ...",,112168
9,Jessica Jones,AI Specialist,102557.0,USD,SE,PT,Austria,M,Austria,0.0,...,2340.0,5.8,Cloud AI Solutions,ai_job_dataset.csv,,32,,"Feature Engineering, Project Management, Tenso...",,107634


In [61]:
# "candidateid" is also dropped by an earlier cell; errors="ignore" keeps a
# top-to-bottom run from raising KeyError when the column is already gone.
df = df.drop(columns=["candidateid"], errors="ignore")

In [62]:
df = df.drop(columns=["education"])


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30038 entries, 0 to 30037
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    30038 non-null  object 
 1   job_title               30038 non-null  object 
 2   salary_usd              15000 non-null  float64
 3   salary_currency         29838 non-null  object 
 4   experience_level        30038 non-null  object 
 5   employment_type         29838 non-null  object 
 6   company_location        29838 non-null  object 
 7   company_size            29838 non-null  object 
 8   employee_residence      29838 non-null  object 
 9   remote_ratio            29838 non-null  float64
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  float64
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

In [65]:
df = df.drop(columns=["interests"])


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30038 entries, 0 to 30037
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    30038 non-null  object 
 1   job_title               30038 non-null  object 
 2   salary_usd              15000 non-null  float64
 3   salary_currency         29838 non-null  object 
 4   experience_level        30038 non-null  object 
 5   employment_type         29838 non-null  object 
 6   company_location        29838 non-null  object 
 7   company_size            29838 non-null  object 
 8   employee_residence      29838 non-null  object 
 9   remote_ratio            29838 non-null  float64
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  float64
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

In [71]:
# --- Setup -------------------------------------------------------------------
import numpy as np
import pandas as pd
import re
from datetime import datetime, timedelta

# Optional: reproducibility
# np.random.seed(42)

# Try to use Faker for better company names (optional)
try:
    from faker import Faker
    fake = Faker("en_IN")  # pick "en_IN" for Indian-style names; change as you like
except Exception:
    fake = None

# --- Safe helpers -------------------------------------------------------------
def col_exists(name):
    """Return True when ``name`` is a column of the notebook-level ``df``."""
    return name in df.columns


def safe_series(name, default=None):
    """Return ``df[name]``, or a constant Series when the column is absent.

    The substitute Series holds ``default`` in every row and shares ``df``'s
    index.
    """
    if name not in df.columns:
        return pd.Series([default]*len(df), index=df.index)
    return df[name]

def job_family(title: str) -> str:
    """Map a job title to "data", "ml", "software", "lead" or "general".

    Whole-word keyword groups are tried in the order listed below against the
    lower-cased title; the first match decides the family.
    """
    text = str(title).lower()
    rules = (
        ("data", r"\b(data|analyst|bi)\b"),
        ("ml", r"\b(ml|ai|scientist|nlp|vision|recommendation)\b"),
        ("software", r"\b(engineer|developer|backend|frontend|full\s*stack|devops)\b"),
        ("lead", r"\b(manager|lead|head|cto|principal|director)\b"),
    )
    for family, pattern in rules:
        if re.search(pattern, text):
            return family
    return "general"

# pull existing columns safely
exp_level = safe_series("experience_level", "MI")
age_ser   = safe_series("age", 28)
salary    = safe_series("salary", 90000)
title     = safe_series("job_title", "Software Engineer")

# --- 11) education_required ---------------------------------------------------
# Map experience level → typical minimum education
edu_map = {
    "EN": ["Diploma", "Bachelor's"],
    "MI": ["Bachelor's", "Master's"],
    "SE": ["Bachelor's", "Master's"],
    "EX": ["Bachelor's", "Master's", "MBA"],
}


def pick_education(level):
    """Pick a random education label for an experience level.

    ``level`` is upper-cased before lookup in ``edu_map``; levels that are not
    in the map always yield "Bachelor's".
    """
    key = str(level).upper()
    options = edu_map.get(key, ["Bachelor's"])
    return np.random.choice(options)

if not col_exists("education_required"):
    df["education_required"] = [pick_education(l) for l in exp_level]
else:
    mask = df["education_required"].isna()
    df.loc[mask, "education_required"] = [pick_education(exp_level.iat[i]) for i in df.index[mask]]

# --- 12) years_experience -----------------------------------------------------
# Make years consistent with level and age (cap by age - 18)
def years_from_level(level, age):
    """Draw years of experience for a level, capped at ``age - 18``.

    Inclusive draw ranges: EN 0–2, MI 3–7, SE 6–15, EX 10–25, otherwise 2–8.
    A missing ``age`` is treated as 28. The draw is clipped to
    ``[0, max(0, age - 18)]`` and returned as a plain ``int``.
    """
    code = str(level).upper()
    years_old = int(age) if pd.notna(age) else 28
    ceiling = max(0, years_old - 18)

    ranges = {"EN": (0, 2), "MI": (3, 7), "SE": (6, 15), "EX": (10, 25)}
    low, high = ranges.get(code, (2, 8))

    drawn = np.random.randint(low, high + 1)
    return int(np.clip(drawn, 0, ceiling))

if not col_exists("years_experience"):
    df["years_experience"] = [years_from_level(exp_level.iat[i], age_ser.iat[i]) for i in range(len(df))]
else:
    mask = df["years_experience"].isna()
    df.loc[mask, "years_experience"] = [years_from_level(exp_level.iat[i], age_ser.iat[i]) for i in df.index[mask]]

# --- 13) industry -------------------------------------------------------------
industries_by_family = {
    "software": ["Software", "Internet", "SaaS", "E-commerce", "DevTools"],
    "data":     ["Analytics", "Consulting", "Research", "FinTech"],
    "ml":       ["AI/ML", "Healthcare AI", "Computer Vision", "NLP"],
    "lead":     ["Technology", "Enterprise Software", "Cloud"],
    "general":  ["Technology", "Education", "Media", "Telecom"]
}
def pick_industry(title):
    """Pick a random industry from the list for the title's job family.

    Families missing from ``industries_by_family`` yield "Technology".
    """
    options = industries_by_family.get(job_family(title), ["Technology"])
    return np.random.choice(options)

if not col_exists("industry"):
    df["industry"] = [pick_industry(t) for t in title]
else:
    mask = df["industry"].isna()
    df.loc[mask, "industry"] = [pick_industry(title.iat[i]) for i in df.index[mask]]

# --- 14) posting_date ---------------------------------------------------------
# Random date in the last 180 days
def random_recent_date(days=180):
    """Return a random ``datetime.date`` between ``days`` days ago and today.

    Both ends are inclusive.
    """
    days_back = np.random.randint(0, days+1)
    today = pd.Timestamp.today().normalize()
    picked = today - pd.Timedelta(days=days_back)
    return picked.date()

if not col_exists("posting_date"):
    df["posting_date"] = [random_recent_date(180) for _ in range(len(df))]
else:
    mask = df["posting_date"].isna()
    df.loc[mask, "posting_date"] = [random_recent_date(180) for _ in df.index[mask]]

# --- 15) application_deadline -------------------------------------------------
# 7–45 days after posting_date
def deadline_after(post_date):
    """Return a deadline 7–45 days (inclusive) after ``post_date``.

    Args:
        post_date: anything ``pd.to_datetime`` accepts as a single date.

    Returns:
        A ``datetime.date``. When ``post_date`` cannot be parsed, or is
        missing (None / NaN / NaT), today's date is used as the base.
    """
    try:
        base = pd.to_datetime(post_date)
    except Exception:
        base = None
    # A missing value does not raise: it parses to None or NaT, neither of
    # which supports the date arithmetic below, so apply the fallback here too.
    if base is None or base is pd.NaT:
        base = pd.Timestamp.today()
    delta = np.random.randint(7, 46)  # 7-45 days
    return (base + pd.Timedelta(days=delta)).date()

if not col_exists("application_deadline"):
    df["application_deadline"] = [deadline_after(df["posting_date"].iat[i]) for i in range(len(df))]
else:
    mask = df["application_deadline"].isna()
    df.loc[mask, "application_deadline"] = [deadline_after(df["posting_date"].iat[i]) for i in df.index[mask]]

# --- 16) job_description_length ----------------------------------------------
# If you already have a text column like 'job_description' or 'description', compute length.
# Otherwise create a plausible length based on level.
def estimate_jd_len(level):
    """Draw a random job-description length for an experience level.

    Inclusive ranges: EN 80–180, MI 150–300, SE 200–400, EX 250–500; any
    other level uses 150–300.
    """
    code = str(level).upper()
    spans = {"EN": (80, 180), "MI": (150, 300), "SE": (200, 400), "EX": (250, 500)}
    low, high = spans.get(code, (150, 300))
    return int(np.random.randint(low, high + 1))

desc_col = None
for cand in ["job_description", "description", "jd"]:
    if cand in df.columns:
        desc_col = cand
        break

if not col_exists("job_description_length"):
    if desc_col:
        df["job_description_length"] = df[desc_col].fillna("").astype(str).str.len()
    else:
        df["job_description_length"] = [estimate_jd_len(exp_level.iat[i]) for i in range(len(df))]
else:
    mask = df["job_description_length"].isna()
    if desc_col:
        df.loc[mask, "job_description_length"] = df.loc[mask, desc_col].fillna("").astype(str).str.len()
    else:
        df.loc[mask, "job_description_length"] = [estimate_jd_len(exp_level.iat[i]) for i in df.index[mask]]

# --- 17) benefits_score -------------------------------------------------------
# 0–100; bias higher for SE/EX and higher salaries
def benefits_from(level, sal):
    """Compute an integer benefits score in 0–100 from level and salary.

    Base score per level is EN=50, MI=60, SE=70, EX=80 (65 otherwise). A
    salary of at least 100000 adds 5 and at least 150000 adds 10; a missing
    salary counts as 90000. Uniform integer noise in [-10, 10] is added and
    the total is clipped to 0–100.
    """
    code = str(level).upper()
    pay = float(sal) if pd.notna(sal) else 90000.0

    score = {"EN": 50, "MI": 60, "SE": 70, "EX": 80}.get(code, 65)
    if pay >= 150000:
        score += 10
    elif pay >= 100000:
        score += 5

    score += np.random.randint(-10, 11)
    return int(np.clip(score, 0, 100))

if not col_exists("benefits_score"):
    df["benefits_score"] = [benefits_from(exp_level.iat[i], salary.iat[i]) for i in range(len(df))]
else:
    mask = df["benefits_score"].isna()
    # benefits_from() yields 0–100, but the benefits_score values previewed
    # earlier in this notebook (5.9, 9.4, ...) look like a 0–10 scale. When
    # every existing value is <= 10, rescale the generated scores so the
    # column does not mix two scales.
    known_max = df["benefits_score"].max()
    divisor = 10.0 if pd.notna(known_max) and known_max <= 10 else 1.0
    # NOTE: .iat is positional while df.index[mask] yields labels; this relies
    # on df keeping its default RangeIndex.
    df.loc[mask, "benefits_score"] = [
        benefits_from(exp_level.iat[i], salary.iat[i]) / divisor for i in df.index[mask]
    ]

# --- 18) company_name ---------------------------------------------------------
def fake_company():
    """Return a company name from Faker, or from a fixed list without it.

    ``fake`` is the notebook-level Faker instance, or None when Faker could
    not be imported.
    """
    if fake is None:
        # fallback companies if Faker unavailable
        fallback = ["TechNova Labs", "DataSpark Systems", "CloudVeda", "QuantumLeaf", "NextGen Analytics"]
        return np.random.choice(fallback)
    return fake.company()

if not col_exists("company_name"):
    df["company_name"] = [fake_company() for _ in range(len(df))]
else:
    mask = df["company_name"].isna() | (df["company_name"].astype(str).str.strip() == "")
    df.loc[mask, "company_name"] = [fake_company() for _ in df.index[mask]]

# --- Final checks & (optional) save ------------------------------------------
print("✅ Rows:", len(df), "| Columns:", len(df.columns))
print("✅ Added/filled columns: education_required, years_experience, industry, posting_date, application_deadline, job_description_length, benefits_score, company_name")

# Optional: save
# df.to_csv("employees_enriched.csv", index=False)
# df.head()


✅ Rows: 30038 | Columns: 23
✅ Added/filled columns: education_required, years_experience, industry, posting_date, application_deadline, job_description_length, benefits_score, company_name


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30038 entries, 0 to 30037
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    30038 non-null  object 
 1   job_title               30038 non-null  object 
 2   salary_usd              15000 non-null  float64
 3   salary_currency         29838 non-null  object 
 4   experience_level        30038 non-null  object 
 5   employment_type         29838 non-null  object 
 6   company_location        29838 non-null  object 
 7   company_size            29838 non-null  object 
 8   employee_residence      29838 non-null  object 
 9   remote_ratio            29838 non-null  float64
 10  required_skills         15000 non-null  object 
 11  education_required      30038 non-null  object 
 12  years_experience        30038 non-null  float64
 13  industry                30038 non-null  object 
 14  posting_date            30038 non-null

In [70]:
df.to_csv("Skill Gap Analysis & Career Path Optimization.csv", index=False)
print("✅ File saved with all original 22 columns + new 'skills' column")

✅ File saved with all original 22 columns + new 'skills' column


In [73]:
# ================== SAFE ENRICHMENT (no rows/columns dropped) ==================
import numpy as np
import pandas as pd
import re

# Optional: reproducibility
# np.random.seed(42)

# Try Faker for nicer locations; if not installed, it's fine (fallbacks used)
try:
    from faker import Faker
    fake = Faker()
except Exception:
    fake = None

def col_exists(name): return name in df.columns
def fill_if_missing(col, values, frame=None):
    """Create `col` if it is absent, otherwise fill only its NaN entries.

    Parameters
    ----------
    col : str
        Column name to create or complete.
    values : scalar or array-like
        Either one value per row (positionally aligned with the frame) or a
        single scalar used for every missing entry.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level `df`, which
        keeps existing two-argument calls working unchanged.

    Existing non-null values are never overwritten.
    """
    target = df if frame is None else frame
    if col not in target.columns:
        target[col] = values
        return

    mask = target[col].isna()
    if not mask.any():
        # Nothing to fill; skip the assignment entirely.
        return

    if np.ndim(values) == 0:
        # A scalar cannot be indexed with the boolean mask, so broadcast it.
        target.loc[mask, col] = values
    else:
        # Pick, by position, the candidate values that sit on the missing rows.
        target.loc[mask, col] = np.asarray(values)[mask.values]

# --------------------------- helpers ------------------------------------------
def job_family(title: str) -> str:
    """Classify a job title into a coarse family.

    The title is lower-cased and tested against the keyword patterns below in
    order; the first pattern that matches decides the family. Titles matching
    no pattern are "general".
    """
    lowered = str(title).lower()
    # Order matters: earlier families win when a title matches several patterns.
    ordered_patterns = (
        ("data", r"\b(data|analyst|bi)\b"),
        ("ml", r"\b(ml|ai|scientist|nlp|vision|recommendation)\b"),
        ("software", r"\b(engineer|developer|backend|frontend|full\s*stack|devops)\b"),
        ("lead", r"\b(manager|lead|head|cto|principal|director)\b"),
    )
    for family, pattern in ordered_patterns:
        if re.search(pattern, lowered):
            return family
    return "general"

# Currency used for each supported country (two-letter, ISO-style keys).
COUNTRY_TO_CUR = {
    "US": "USD",
    "IN": "INR",
    "GB": "GBP",
    "DE": "EUR",
    "FR": "EUR",
    "CA": "CAD",
    "AU": "AUD",
    "SG": "SGD",
    "JP": "JPY",
    "AE": "AED",
    "NL": "EUR",
    "SE": "SEK",
    "CH": "CHF",
    "ES": "EUR",
    "IT": "EUR",
    "BR": "BRL",
}
# Country pool for random fills: the mapping's keys, in the same order.
COUNTRIES = list(COUNTRY_TO_CUR)
# Very rough, static conversion rates to USD (illustrative only).
CUR_TO_USD = {
    "USD": 1.0,
    "INR": 0.012,
    "EUR": 1.08,
    "GBP": 1.27,
    "CAD": 0.74,
    "AUD": 0.67,
    "SGD": 0.74,
    "JPY": 0.0067,
    "AED": 0.27,
    "SEK": 0.095,
    "CHF": 1.10,
    "BRL": 0.19,
}

# Working series for title and salary. A synthetic default is built only when
# the column is absent; nothing is written back to df here.
if "job_title" in df.columns:
    job_title = df["job_title"]
else:
    job_title = pd.Series(["Software Engineer"]*len(df), index=df.index)

if "salary" in df.columns:
    salary = df["salary"]
else:
    # Random integer salaries in [50000, 120000].
    salary = pd.Series(np.random.randint(50000, 120001, len(df)), index=df.index)

# ----------------------- employee_residence -----------------------------------
# NOTE(review): fills use the two-letter codes in COUNTRIES; confirm this matches
# the format of the values already stored in this column.
if "employee_residence" in df.columns:
    # Column exists: only NaN / blank entries receive a random country code.
    mask = df["employee_residence"].isna() | (df["employee_residence"].astype(str).str.strip()=="")
    # One candidate per row so positions line up with the boolean mask.
    fill_vals = [np.random.choice(COUNTRIES) for _ in range(len(df))]
    df.loc[mask, "employee_residence"] = np.array(fill_vals)[mask.values]
elif fake:
    # Column absent and Faker available: draw codes from Faker, then replace
    # any code outside our pool with a random pool member.
    emp_res = [fake.country_code(representation="alpha-2") for _ in range(len(df))]
    emp_res = [c if c in COUNTRIES else np.random.choice(COUNTRIES) for c in emp_res]
    df["employee_residence"] = emp_res
else:
    # Column absent and no Faker: draw straight from the pool.
    emp_res = [np.random.choice(COUNTRIES) for _ in range(len(df))]
    df["employee_residence"] = emp_res

# ----------------------- company_location -------------------------------------
if "company_location" in df.columns:
    # Column exists: NaN / blank entries inherit the employee's residence.
    mask = df["company_location"].isna() | (df["company_location"].astype(str).str.strip()=="")
    df.loc[mask, "company_location"] = df.loc[mask, "employee_residence"]
else:
    # Column absent: about 70% of rows share the employee's residence; the rest
    # get a different country drawn from the pool.
    same_mask = np.random.rand(len(df)) < 0.7
    diff_vals = [np.random.choice([c for c in COUNTRIES if c != r]) for r in df["employee_residence"]]
    comp_loc = df["employee_residence"].copy()
    comp_loc.loc[~same_mask] = np.array(diff_vals)[~same_mask]
    df["company_location"] = comp_loc

# ----------------------- salary_currency --------------------------------------
# Prefer company_location -> currency; fallback employee_residence
def infer_currency(row):
    """Return the currency code for a row.

    Looks up the row's company_location in COUNTRY_TO_CUR first, then its
    employee_residence; returns "USD" when neither is a known key.
    """
    for field in ("company_location", "employee_residence"):
        code = COUNTRY_TO_CUR.get(str(row.get(field, "")))
        if code:
            return code
    return "USD"

# Infer one currency per row (by position), then fill only the gaps in
# salary_currency; existing values are kept.
cur_vals = list(map(infer_currency, (df.iloc[pos] for pos in range(len(df)))))
fill_if_missing("salary_currency", cur_vals)

# ----------------------- salary_usd -------------------------------------------
# Convert existing salary using currency; if salary missing, leave NaN
def convert_to_usd(sal, cur):
    """Convert a salary to USD using the static CUR_TO_USD table.

    Parameters
    ----------
    sal : number or numeric string
        Salary amount expressed in `cur`.
    cur : str
        Currency code; surrounding whitespace and letter case are ignored.

    Returns
    -------
    float
        The converted amount, or NaN when the salary is missing or not
        numeric, or when the currency has no entry in CUR_TO_USD.
    """
    if pd.isna(sal):
        return np.nan
    code = str(cur).strip().upper()
    rate = CUR_TO_USD.get(code)
    if rate is None:
        # Unknown currency: a silent 1:1 rate would store the local amount as
        # if it were dollars, so report "unknown" instead.
        return np.nan
    try:
        return float(sal) * float(rate)
    except (TypeError, ValueError):
        return np.nan

# Pair each salary with its row's currency (by position), convert, then fill
# only the gaps in salary_usd.
usd_vals = [convert_to_usd(amount, code) for amount, code in zip(salary, df["salary_currency"])]
fill_if_missing("salary_usd", usd_vals)

# ----------------------- remote_ratio (0 / 50 / 100) --------------------------
def pick_remote_ratio(title):
    """Draw a remote ratio (0, 50 or 100) at random, biased by job family.

    Software / ML / data titles:  0 -> 25%, 50 -> 30%, 100 -> 45%.
    All other titles:             0 -> 55%, 50 -> 25%, 100 -> 20%.
    """
    remote_friendly = job_family(title) in ["software", "ml", "data"]
    # Cumulative cut points on a single uniform draw.
    onsite_cut, hybrid_cut = (0.25, 0.55) if remote_friendly else (0.55, 0.80)
    draw = np.random.rand()
    if draw < onsite_cut:
        return 0
    if draw < hybrid_cut:
        return 50
    return 100

# One random remote ratio per title, in row order; only gaps are filled.
remote_vals = list(map(pick_remote_ratio, job_title))
fill_if_missing("remote_ratio", remote_vals)

# ----------------------- company_size (S / M / L) -----------------------------
# bias larger size for higher salaries
def pick_company_size(sal):
    """Draw a company size ("S", "M" or "L") at random, biased by salary.

    Values that cannot be converted to float are treated as 80000. Higher
    salary tiers put more weight on larger companies.
    """
    try:
        amount = float(sal)
    except Exception:
        amount = 80000

    # (salary floor, [P(S), P(M), P(L)]), checked from the highest tier down.
    tiers = (
        (140000, [0.15, 0.35, 0.50]),
        (90000, [0.25, 0.45, 0.30]),
    )
    weights = [0.45, 0.40, 0.15]
    for floor, tier_weights in tiers:
        if amount >= floor:
            weights = tier_weights
            break
    return np.random.choice(["S","M","L"], p=weights)

# One random company size per salary, in row order; only gaps are filled.
comp_size_vals = list(map(pick_company_size, salary))
fill_if_missing("company_size", comp_size_vals)

# ----------------------- employment_type (FT/PT/CT/FL) ------------------------
def pick_employment_type(title):
    """Pick an employment type code (FT / PT / CT / FL) for a job title.

    Intern and contract titles map to "CT"; freelance and consultant titles
    map to "FL". Any other title gets a random draw weighted towards
    full-time (FT 80%, PT 5%, CT 10%, FL 5%).
    """
    t = str(title).lower()
    # Match "intern" as a whole word so that "internal" / "international"
    # titles are not misclassified as internships.
    if re.search(r"\bintern(s|ship)?\b", t):
        return "CT"     # contract (intern)
    if "contract" in t:
        return "CT"
    if "freelance" in t or "consultant" in t:
        return "FL"
    # otherwise mostly full-time
    return np.random.choice(["FT","PT","CT","FL"], p=[0.8, 0.05, 0.10, 0.05])

# One employment type per title, in row order; only gaps are filled.
emp_type_vals = list(map(pick_employment_type, job_title))
fill_if_missing("employment_type", emp_type_vals)

# ----------------------- required_skills --------------------------------------
# If you already have 'skills', pick 3 from it; else generate generic ones
GENERIC_SKILLS = [
    "Python","SQL","Git","Linux","Data Analysis","Machine Learning","React","Node.js",
    "Docker","Kubernetes","AWS","Communication","Problem-Solving","Tableau","Pandas","NumPy"
]

def sample_from_skills_str(s):
    """Return a comma-separated random subset of the skills listed in `s`.

    `s` is a comma-separated skills string. Between 2 and 4 entries are
    sampled without replacement (fewer when `s` lists fewer). When `s` is
    missing (None / NaN) or holds no non-blank entries, 2-4 skills are drawn
    from GENERIC_SKILLS instead.
    """
    # str(NaN) is "nan" and str(None) is "None"; without this guard a missing
    # cell would come back as the literal skill "nan".
    is_missing = pd.api.types.is_scalar(s) and pd.isna(s)
    parts = [] if is_missing else [p.strip() for p in str(s).split(",") if p.strip()]
    if not parts:
        # fallback to generic
        k = np.random.randint(2,5)
        return ", ".join(np.random.choice(GENERIC_SKILLS, size=k, replace=False))
    k = min(len(parts), np.random.randint(2,5))
    idx = np.random.choice(len(parts), size=k, replace=False)
    return ", ".join([parts[i] for i in idx])

if "skills" in df.columns:
    # Sample each row's required skills from its own skills string.
    req_vals = list(map(sample_from_skills_str, df["skills"]))
else:
    # No skills column: draw 2-4 generic skills per row.
    req_vals = []
    for _ in range(len(df)):
        n_skills = np.random.randint(2,5)
        req_vals.append(", ".join(np.random.choice(GENERIC_SKILLS, size=n_skills, replace=False)))
# Existing required_skills values are kept; only gaps are filled.
fill_if_missing("required_skills", req_vals)

# ----------------------- company_name (fill if missing) -----------------------
# Only acts when the column already exists; NaN / blank names are replaced.
if "company_name" in df.columns:
    mask = df["company_name"].isna() | (df["company_name"].astype(str).str.strip()=="")
    if mask.any():
        if not fake:
            # Without Faker, draw from a small fixed list of names.
            fallback = ["TechNova Labs","DataSpark Systems","CloudVeda","QuantumLeaf","NextGen Analytics"]
            fill_vals = [np.random.choice(fallback) for _ in range(mask.sum())]
        else:
            fill_vals = [fake.company() for _ in range(mask.sum())]
        # One generated name per missing row.
        df.loc[mask, "company_name"] = fill_vals

# ----------------------- Final check ------------------------------------------
# Report the frame size and which of the completed columns are present.
n_rows, n_cols = df.shape
print("✅ Rows:", n_rows, "| Columns:", n_cols)

expected_cols = ["salary_currency","required_skills","salary_usd","remote_ratio",
                 "employee_residence","company_size","company_location","employment_type"]
present_cols = [c for c in expected_cols if c in df.columns]
print("✅ Completed columns now present:", present_cols)

# Optional: save
# df.to_csv("employees_completed_extra_columns.csv", index=False)
# df.head()


✅ Rows: 30038 | Columns: 23
✅ Completed columns now present: ['salary_currency', 'required_skills', 'salary_usd', 'remote_ratio', 'employee_residence', 'company_size', 'company_location', 'employment_type']


In [75]:
# Drop the source_file column. errors="ignore" keeps this cell re-runnable:
# a second run finds the column already gone instead of raising KeyError.
df = df.drop(columns=["source_file"], errors="ignore")


In [76]:
# Print the frame summary (index range, per-column non-null counts and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30038 entries, 0 to 30037
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    30038 non-null  object 
 1   job_title               30038 non-null  object 
 2   salary_usd              30038 non-null  float64
 3   salary_currency         30038 non-null  object 
 4   experience_level        30038 non-null  object 
 5   employment_type         30038 non-null  object 
 6   company_location        30038 non-null  object 
 7   company_size            30038 non-null  object 
 8   employee_residence      30038 non-null  object 
 9   remote_ratio            30038 non-null  float64
 10  required_skills         30038 non-null  object 
 11  education_required      30038 non-null  object 
 12  years_experience        30038 non-null  float64
 13  industry                30038 non-null  object 
 14  posting_date            30038 non-null

In [77]:
# Write the final frame to CSV; the row index is not written.
df.to_csv("Skill Gap Analysis & Career Path Optimization.csv", index=False)
# Report the real shape rather than a hard-coded column count, so the message
# cannot drift from what was actually written.
print(f"✅ File saved with {df.shape[0]} rows and {df.shape[1]} columns")

✅ File saved with all original 22 columns + new 'skills' column
