In [None]:
import random
import re

import joblib
import numpy as np
import pandas as pd
from faker import Faker
from google.colab import drive
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

In [None]:
def extract_skills(text):
    """Return the catalogue skills that are mentioned in ``text``.

    Matching is case-insensitive and requires that the keyword is not embedded
    inside a longer alphabetic word, so "Java" is not reported for
    "JavaScript", "Git" not for "digital" and "Excel" not for "excellent".
    Digits and punctuation next to a keyword are allowed ("Python3", "C++,").

    Args:
        text: Free-form resume text. Non-string values (for example NaN coming
            from a pandas column) are treated as empty text.

    Returns:
        A ", "-joined string of matched skills in catalogue order, or "" when
        nothing matches.
    """
    skills = [
        # Data / Tech
        "Python", "Java", "C++", "SQL", "NoSQL", "TensorFlow", "PyTorch", "Scikit-learn", "Keras",
        "Flask", "Django", "React", "Node.js", "MongoDB", "PostgreSQL", "AWS", "Azure", "GCP",
        "PowerBI", "Tableau", "Excel", "Hadoop", "Spark", "Kafka", "Docker", "Kubernetes",

        # DevOps / Infrastructure
        "CI/CD", "Git", "Jenkins", "Terraform", "Linux", "Ansible", "Bash", "Shell Scripting",

        # Cybersecurity
        "Firewalls", "VPN", "SIEM", "Penetration Testing", "Ethical Hacking", "Nmap", "Wireshark",

        # Marketing
        "SEO", "SEM", "Google Analytics", "Facebook Ads", "Copywriting", "Email Marketing",

        # Design
        "Figma", "Sketch", "Adobe XD", "Photoshop", "Illustrator", "Wireframing", "Prototyping",

        # Finance / Accounting
        "QuickBooks", "SAP", "Oracle ERP", "Financial Modeling", "GAAP", "Budgeting",

        # Project Management
        "Agile", "Scrum", "JIRA", "Kanban", "Trello", "Confluence", "Risk Management",

        # HR / Soft
        "Talent Acquisition", "Onboarding", "Conflict Resolution", "Payroll", "HRIS", "PeopleSoft",

        # Healthcare
        "Patient Care", "EMR", "HIPAA", "Clinical Trials", "Radiology", "Nursing", "CPR", "ICU",

        # Education
        "Curriculum Design", "Classroom Management", "LMS", "EdTech", "Pedagogy",

        # Legal
        "Legal Research", "Contract Drafting", "Litigation", "Case Management", "LexisNexis"
    ]

    if not isinstance(text, str):
        return ""

    haystack = text.lower()
    # Letter-only lookarounds instead of \b: \b does not work next to
    # punctuation such as the "+" in "C++".
    return ", ".join(
        s for s in skills
        if re.search(r"(?<![a-z])" + re.escape(s.lower()) + r"(?![a-z])", haystack)
    )

def estimate_experience(text):
    """Return a simulated years-of-experience value.

    ``text`` is accepted so the function can be used with ``Series.apply`` but
    it is not inspected; the result is a uniform random draw from [1, 10]
    rounded to one decimal place.
    """
    simulated_years = random.uniform(1, 10)
    return round(simulated_years, 1)

def extract_certifications(text):
    """Return the known certifications that are mentioned in ``text``.

    Matching is case-insensitive and rejects hits embedded in a longer
    alphabetic word, so "CPA" is not reported for "CPAP".

    Args:
        text: Free-form resume text. Non-string values (for example NaN) are
            treated as empty text.

    Returns:
        A ", "-joined string of matched certifications in list order, or "".
    """
    certs = [
        "AWS Certified", "Azure Certified", "GCP Certified", "PMP", "CAPM", "Scrum Master",
        "CompTIA A+", "CompTIA Network+", "CCNA", "MCSE", "CFA", "CPA", "Six Sigma", "CISSP"
    ]
    if not isinstance(text, str):
        return ""

    haystack = text.lower()
    # Letter-only lookarounds keep names ending in "+" matchable.
    return ", ".join(
        c for c in certs
        if re.search(r"(?<![a-z])" + re.escape(c.lower()) + r"(?![a-z])", haystack)
    )

def extract_tools(text):
    """Return the known workplace tools that are mentioned in ``text``.

    Matching is case-insensitive and rejects hits embedded in a longer
    alphabetic word, so "Excel" is no longer reported for "excellent".

    Args:
        text: Free-form resume text. Non-string values (for example NaN) are
            treated as empty text.

    Returns:
        A ", "-joined string of matched tools in list order, or "".
    """
    tools = [
        "JIRA", "Trello", "Confluence", "Salesforce", "HubSpot", "GitHub", "GitLab",
        "Tableau", "PowerBI", "Excel", "Outlook", "Slack", "Asana", "Notion", "Zapier", "MS Office", "Matlab"
    ]
    if not isinstance(text, str):
        return ""

    haystack = text.lower()
    return ", ".join(
        t for t in tools
        if re.search(r"(?<![a-z])" + re.escape(t.lower()) + r"(?![a-z])", haystack)
    )

def extract_soft_skills(text):
    """Return the known soft skills that are mentioned in ``text``.

    Matching is case-insensitive and rejects hits embedded in a longer
    alphabetic word, so "communication" is not reported for
    "telecommunication".

    Args:
        text: Free-form resume text. Non-string values (for example NaN) are
            treated as empty text.

    Returns:
        A ", "-joined string of matched soft skills in list order, or "".
    """
    soft_skills = [
        "leadership", "teamwork", "adaptability", "communication", "problem solving",
        "time management", "collaboration", "creativity", "critical thinking", "resilience"
    ]
    if not isinstance(text, str):
        return ""

    haystack = text.lower()
    return ", ".join(
        s for s in soft_skills
        if re.search(r"(?<![a-z])" + re.escape(s.lower()) + r"(?![a-z])", haystack)
    )

def map_domain(title):
    """Map a job title to a coarse domain label using ordered keyword rules.

    Rules are checked top to bottom and the first hit wins, so for example
    "Data Engineer" resolves to "Data Tech" before the generic engineer rule
    is reached.

    Args:
        title: Job title or category string. Non-string values (for example
            NaN) map to "General".

    Returns:
        The domain label of the first matching rule, or "General" when no
        rule matches.
    """
    if not isinstance(title, str):
        return "General"

    title = title.lower()
    # Short acronyms are matched as whole words: as plain substrings "hr",
    # "ux" and "ui" would hit unrelated titles such as "chronic" or "building".
    words = set(re.findall(r"[a-z]+", title))

    if "data" in title or "analyst" in title:
        return "Data Tech"
    if "devops" in title or "infrastructure" in title:
        return "DevOps / Infrastructure"
    if "security" in title:
        return "Cybersecurity"
    if "engineer" in title or "developer" in title:
        return "Software Engineering"
    if "scientist" in title:
        # Trailing space kept on purpose: it is the label value already in use.
        return "Research "
    if "product" in title:
        return "Product Management"
    if "marketing" in title or "seo" in title:
        return "Marketing"
    if "sales" in title:
        return "Sales"
    if "accountant" in title or "finance" in title:
        return "Finance"
    # Each keyword needs its own `in title`; a bare string literal inside an
    # `or` chain is always truthy and would make this rule match everything.
    if "nurse" in title or "clinical" in title or "healthcare" in title or "doctor" in title:
        return "Healthcare"
    if "teacher" in title or "education" in title:
        return "Education"
    if "hr" in words or "recruiter" in title:
        return "Human Resources"
    if "legal" in title or "lawyer" in title or "attorney" in title:
        return "Legal"
    if "aero" in title or "mechanical" in title or "electrical" in title:
        return "Engineering"
    if "consultant" in title:
        return "Consulting"
    if "designer" in title or "ux" in words or "ui" in words:
        return "Design"

    # Default fallback domain
    return "General"


def simulate_future_title(title, exp):
    """Simulate the next career title from the current title and experience.

    Keyword checks are case-insensitive, so "Data Analyst", "DATA ANALYST" and
    "data analyst" all take the same branch. The original casing of ``title``
    is preserved in the returned string.

    Args:
        title: Current job title.
        exp: Years of experience (number).

    Returns:
        "Mid-level <title>" when ``exp`` is below 5; otherwise
        "Senior Data Scientist" for data titles, "Lead <title>" for
        developer/engineer titles, "Marketing Manager" for marketing titles,
        and "Senior <title>" for everything else.
    """
    if exp < 5:
        return "Mid-level " + title

    # Compare against a lowercased copy; the keywords below are lowercase.
    lowered = title.lower()
    if "data" in lowered:
        return "Senior Data Scientist"
    if "developer" in lowered or "engineer" in lowered:
        return "Lead " + title
    if "marketing" in lowered:
        return "Marketing Manager"
    return "Senior " + title


In [None]:
# Load the first resume dataset and rename its text/label columns to the
# names shared by the rest of the notebook (resume_text, current_title).
df_1 = pd.read_csv("/content/Resume.csv")
df_1 = df_1.rename(columns={"Resume_str": "resume_text", "Category": "current_title"})
# Preview the first five rows.
df_1.head(5)

Unnamed: 0,ID,resume_text,Resume_html,current_title
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [None]:
# Load the second dataset (structured resume / job-matching fields).
df_2 = pd.read_csv("/content/resume_data.csv")
# NOTE(review): the preview of this frame shows neither a Resume_str nor a
# Category column (the column list is truncated, so confirm). If they are
# absent, this rename leaves the frame unchanged.
df_2 = df_2.rename(columns={"Resume_str": "resume_text", "Category": "current_title"})
# Preview the first five rows.
df_2.head(5)

Unnamed: 0,address,career_objective,skills,educational_institution_name,degree_names,passing_years,educational_results,result_types,major_field_of_studies,professional_company_names,...,online_links,issue_dates,expiry_dates,﻿job_position_name,educationaL_requirements,experiencere_requirement,age_requirement,responsibilities.1,skills_required,matched_score
0,,Big data analytics working and database wareho...,"['Big Data', 'Hadoop', 'Hive', 'Python', 'Mapr...",['The Amity School of Engineering & Technology...,['B.Tech'],['2019'],['N/A'],[None],['Electronics'],['Coca-COla'],...,,,,Senior Software Engineer,B.Sc in Computer Science & Engineering from a ...,At least 1 year,,Technical Support\nTroubleshooting\nCollaborat...,,0.85
1,,Fresher looking to join as a data analyst and ...,"['Data Analysis', 'Data Analytics', 'Business ...","['Delhi University - Hansraj College', 'Delhi ...","['B.Sc (Maths)', 'M.Sc (Science) (Statistics)']","['2015', '2018']","['N/A', 'N/A']","['N/A', 'N/A']","['Mathematics', 'Statistics']",['BIB Consultancy'],...,,,,Machine Learning (ML) Engineer,M.Sc in Computer Science & Engineering or in a...,At least 5 year(s),,Machine Learning Leadership\nCross-Functional ...,,0.75
2,,,"['Software Development', 'Machine Learning', '...","['Birla Institute of Technology (BIT), Ranchi']",['B.Tech'],['2018'],['N/A'],['N/A'],['Electronics/Telecommunication'],['Axis Bank Limited'],...,,,,"Executive/ Senior Executive- Trade Marketing, ...",Master of Business Administration (MBA),At least 3 years,,"Trade Marketing Executive\nBrand Visibility, S...",Brand Promotion\nCampaign Management\nField Su...,0.416667
3,,To obtain a position in a fast-paced business ...,"['accounts payables', 'accounts receivables', ...","['Martinez Adult Education, Business Training ...",['Computer Applications Specialist Certificate...,['2008'],[None],[None],['Computer Applications'],"['Company Name ï¼ City , State', 'Company Name...",...,,,,Business Development Executive,Bachelor/Honors,1 to 3 years,Age 22 to 30 years,Apparel Sourcing\nQuality Garment Sourcing\nRe...,Fast typing skill\nIELTSInternet browsing & on...,0.76
4,,Professional accountant with an outstanding wo...,"['Analytical reasoning', 'Compliance testing k...",['Kent State University'],['Bachelor of Business Administration'],[None],['3.84'],[None],['Accounting'],"['Company Name', 'Company Name', 'Company Name...",...,[None],[None],"['February 15, 2021']",Senior iOS Engineer,Bachelor of Science (BSc) in Computer Science,At least 4 years,,iOS Lifecycle\nRequirement Analysis\nNative Fr...,iOS\niOS App Developer\niOS Application Develo...,0.65


In [None]:
# Parse the ":::"-delimited text file into records of resume text + title.
data = []
with open("/content/skills_it.txt", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(":::")
        # Only lines with exactly three fields are used; others are skipped.
        if len(parts) == 3:
            resume_id, occupations, resume_text = parts
            # The occupations field is ";"-separated; keep the first entry.
            current_title = occupations.split(";")[0]
            data.append({
                "resume_text": resume_text,
                "current_title": current_title
            })
df_3 = pd.DataFrame(data)
df_3 = df_3.dropna(subset=["resume_text", "current_title"])
# Drop rows whose text or title is empty/whitespace-only. The index is not
# reset, so surviving rows keep their original positions.
df_3 = df_3[~df_3["resume_text"].str.strip().eq("") & ~df_3["current_title"].str.strip().eq("")]

# Preview the first five rows.
df_3.head(5)

Unnamed: 0,resume_text,current_title
1,"Senior Systems Administrator Brownsville, TX A...",Senior Systems Administrator
4,"Systems Administrator Metairie, LA Driven Tech...",Systems Administrator
7,Systems Administrator Systems Administrator - ...,Systems Administrator
13,Systems Administrator Systems Administrator - ...,Systems Administrator
16,Systems Administrator / Engineer Systems Admin...,Systems Administrator / Engineer


In [None]:
# Columns derived from the raw resume text, in the same order as before
# (years_experience last, since it is the only random draw).
for column, extractor in (
    ("skills", extract_skills),
    ("certifications", extract_certifications),
    ("tools_used", extract_tools),
    ("soft_skills", extract_soft_skills),
    ("years_experience", estimate_experience),
):
    df_1[column] = df_1["resume_text"].apply(extractor)

# Columns derived from the title (and the simulated experience).
df_1["domain"] = df_1["current_title"].apply(map_domain)
df_1["future_title"] = df_1.apply(
    lambda record: simulate_future_title(
        record["current_title"], record["years_experience"]
    ),
    axis=1,
)


In [None]:
# Normalise the candidate skills to strings, then append the job's required
# skills so both end up in a single text column.
# NOTE: the second line appends to the column in place, so re-running this
# cell appends skills_required again.
df_2["skills"] = df_2["skills"].fillna("").astype(str)
df_2["skills"] = df_2["skills"] + ", " + df_2["skills_required"].fillna("").astype(str)
df_2["current_title"] = df_2["﻿job_position_name"].fillna("Unknown")
# df_2 has no free-text resume column in this notebook; build a stand-in by
# joining several text fields with spaces so the keyword extractors can run.
df_2["pseudo_resume"] = (
    df_2["career_objective"].fillna("") + " " +
    df_2["skills"].fillna("") + " " +
    df_2["responsibilities.1"].fillna("") + " " +
    df_2["educationaL_requirements"].fillna("")
)

# Keyword-derived columns computed from the stand-in text.
df_2["certifications"] = df_2["pseudo_resume"].apply(extract_certifications)
df_2["tools_used"] = df_2["pseudo_resume"].apply(extract_tools)
df_2["soft_skills"] = df_2["pseudo_resume"].apply(extract_soft_skills)

# Use experience field or estimate if invalid
def safe_estimate(exp):
    """Return years of experience parsed from ``exp``, or a random fallback.

    Args:
        exp: Any value; it is converted with ``str`` and the first run of
            digits is used (for example "1 to 3 years" gives 1.0).

    Returns:
        The first integer found, as a float. When the value contains no
        digits (for example NaN or an empty string), a uniform random value
        in [1, 10] rounded to one decimal place.
    """
    first_number = re.search(r"\d+", str(exp))
    # Explicit no-match check instead of a bare `except:`, which would also
    # hide unrelated errors and KeyboardInterrupt.
    if first_number is None:
        return round(random.uniform(1, 10), 1)
    return float(first_number.group())

# Parse years of experience from the job's requirement text; values with no
# digits fall back to a random number inside safe_estimate.
df_2["years_experience"] = df_2["experiencere_requirement"].apply(safe_estimate)

# Infer domain from title
df_2["domain"] = df_2["current_title"].apply(map_domain)

# Simulate a future title
df_2["future_title"] = df_2.apply(lambda row: simulate_future_title(row["current_title"], row["years_experience"]), axis=1)


In [None]:
# Columns derived from the raw resume text, in the same order as before
# (years_experience last, since it is the only random draw).
for column, extractor in (
    ("skills", extract_skills),
    ("certifications", extract_certifications),
    ("tools_used", extract_tools),
    ("soft_skills", extract_soft_skills),
    ("years_experience", estimate_experience),
):
    df_3[column] = df_3["resume_text"].apply(extractor)

# Columns derived from the title (and the simulated experience).
df_3["domain"] = df_3["current_title"].apply(map_domain)
df_3["future_title"] = df_3.apply(
    lambda record: simulate_future_title(
        record["current_title"], record["years_experience"]
    ),
    axis=1,
)


In [None]:
pip install pandas faker



In [None]:
# Faker instance used to produce names for the simulated records.
fake = Faker()

# Titles the simulated records are drawn from.
job_titles = ["Software Engineer", "Data Analyst", "Marketing Associate", "Business Analyst",
              "Mechanical Engineer", "Sales Representative", "UX Designer"]

# Candidate skills per title; every title in job_titles has an entry here.
skills_pool = {
    "Software Engineer": ["Python", "Java", "C++", "Flask", "Django", "SQL"],
    "Data Analyst": ["SQL", "Excel", "PowerBI", "Python", "Tableau"],
    "Marketing Associate": ["SEO", "Google Ads", "Email Marketing", "Brand Strategy"],
    "Business Analyst": ["SQL", "UML", "Excel", "Business Strategy"],
    "Mechanical Engineer": ["SolidWorks", "MATLAB", "ANSYS", "Thermodynamics"],
    "Sales Representative": ["CRM", "Negotiation", "Cold Calling", "HubSpot"],
    "UX Designer": ["Figma", "Sketch", "User Research", "Prototyping"]
}

# Industry labels for simulated records. These differ from the labels that
# map_domain returns for the real datasets.
domains = ["Fintech", "Healthcare", "E-commerce", "Education", "Real Estate", "Entertainment", "Energy"]

# Categorical pools for the remaining simulated attributes; tiers are strings.
education_levels = ["High School", "Bachelor's", "Master's", "PhD"]
university_tiers = ["1", "2", "3"]
company_tiers = ["1", "2", "3"]

In [None]:
def generate_sample():
    """Build one synthetic career record as a dict.

    The record is drawn from the notebook-level pools (``job_titles``,
    ``skills_pool``, ``domains``, ``education_levels``, ``company_tiers``,
    ``university_tiers``) and carries a name produced by ``fake``.

    Returns:
        dict with keys name, current_title, skills, domain, years_experience,
        education_level, company_tier, university_tier and future_title.
    """
    title = random.choice(job_titles)
    # Two to four skills for the chosen title; the count is drawn first.
    skill_count = random.randint(2, 4)
    picked_skills = random.sample(skills_pool[title], k=skill_count)
    sector = random.choice(domains)
    experience = round(random.uniform(1, 10), 1)
    # Weighted so that Bachelor's and Master's are the most common levels.
    (education,) = random.choices(education_levels, weights=[1, 5, 3, 1])
    employer_tier = random.choice(company_tiers)
    school_tier = random.choice(university_tiers)

    # Simulate future title based on current title and experience
    if experience < 5:
        next_title = "Mid-level " + title
    elif title.startswith("Data"):
        next_title = "Senior Data Scientist"
    elif "Engineer" in title:
        next_title = "Lead " + title
    elif "Marketing" in title:
        next_title = "Marketing Manager"
    else:
        next_title = "Senior " + title

    return {
        "name": fake.name(),
        "current_title": title,
        "skills": ", ".join(picked_skills),
        "domain": sector,
        "years_experience": experience,
        "education_level": education,
        "company_tier": employer_tier,
        "university_tier": school_tier,
        "future_title": next_title
    }


In [None]:
# Generate 1000 simulated records and persist them as CSV.
# NOTE: `data` and `df` are reused notebook-level names; `data` previously
# held the parsed text-file records.
data = [generate_sample() for _ in range(1000)]
df = pd.DataFrame(data)
df.to_csv("simulated_career_trajectory_dataset.csv", index=False)

In [None]:
# Load the simulated dataset written by the previous cell. The frame is used
# exactly as stored on disk, so df_4 always matches the saved CSV (it was
# previously overwritten straight away by a freshly generated sample).
df_4 = pd.read_csv("simulated_career_trajectory_dataset.csv")
df_4.head(5)

Unnamed: 0,name,current_title,skills,domain,years_experience,education_level,company_tier,university_tier,future_title
0,Jackie Fitzpatrick,Data Analyst,"Python, SQL, Excel",E-commerce,1.3,Master's,3,2,Mid-level Data Analyst
1,Jonathan Solomon,Sales Representative,"Negotiation, Cold Calling, HubSpot",Education,2.5,Bachelor's,3,1,Mid-level Sales Representative
2,James Nichols,Mechanical Engineer,"Thermodynamics, SolidWorks",Fintech,6.5,Master's,3,2,Lead Mechanical Engineer
3,Joseph Smith,Mechanical Engineer,"MATLAB, Thermodynamics",Energy,8.6,Bachelor's,2,2,Lead Mechanical Engineer
4,Tracy Garcia,Software Engineer,"Java, C++, Python",E-commerce,9.0,Master's,3,3,Lead Software Engineer


In [None]:
# Columns every source frame must provide before the merge.
required_columns = [
    "current_title", "skills", "domain", "years_experience",
    "certifications", "tools_used", "soft_skills", "future_title"
]

# Fill missing cols in each dataset if they don't exist.
# The loop variable is named `frame` so it does not overwrite the
# notebook-level `df` created when the simulated data was generated.
for frame in (df_1, df_2, df_3, df_4):
    for col in required_columns:
        if col not in frame.columns:
            frame[col] = ""

# Stack all sources, drop rows without a current or future title, then keep
# one row per (current_title, skills, domain) combination.
df_all = (
    pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)
    .dropna(subset=["current_title", "future_title"])
    .drop_duplicates(subset=["current_title", "skills", "domain"])
)
df_all.to_csv("career_trajectory_dataset.csv", index=False)

In [None]:
# Build a "Key: value | Key: value" feature string per row.
# NOTE: df_all["features"] is reassigned from enrich_features later in the
# notebook, so this version is not the one used for training.
df_all["features"] = (
    "Title: " + df_all["current_title"] + " | " +
    "Domain: " + df_all["domain"] + " | " +
    "Skills: " + df_all["skills"] + " | " +
    "Certifications: " + df_all["certifications"] + " | " +
    "Tools: " + df_all["tools_used"] + " | " +
    "Soft Skills: " + df_all["soft_skills"] + " | " +
    "Experience: " + df_all["years_experience"].astype(str) + " years"
)

In [None]:
def enrich_features(row):
    """Render one record as a short natural-language profile.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            current_title, years_experience, domain, skills, tools_used,
            certifications and soft_skills.

    Returns:
        A single string made of five sentences separated by single spaces.
    """
    sentences = [
        f"{row['current_title']} with {row['years_experience']} years of experience in {row['domain']}.",
        f"Skilled in {row['skills']}.",
        f"Tools used include: {row['tools_used']}.",
        f"Certified in: {row['certifications']}.",
        f"Known for {row['soft_skills']}.",
    ]
    return " ".join(sentences)

# Replace the feature column with the sentence-style profile, built row by row.
df_all["features"] = df_all.apply(enrich_features, axis=1)

In [None]:
# Spot-check five random feature strings (no seed, so the selection changes per run).
df_all["features"].sample(5).values

array(["Sr.Officer / Executive - Internal Audit with 2.0 years of experience in Healthcare. Skilled in ['Machine Learning', 'Text Analytics', 'Software Development', 'Data Analysis', 'Python', 'Java', 'JavaScript', 'Matplotlib'], Having CACC from reputed CA Firm\nInternal Audit and Compliance. Tools used include: Excel. Certified in: . Known for .",
       "Head of Internal Control & Compliance (ICC) - SEVP/DMD with 15.0 years of experience in Healthcare. Skilled in ['advertising', 'architect', 'asset management', 'auditing', 'Budgeting', 'budgets', 'budget', 'business case', 'closing', 'Cognos', 'client', 'customer service', 'Financials', 'Financial', 'financial analysis', 'financial modeling', 'leadership', 'Leadership skills', 'marketing', 'Excel', 'Microsoft Office', 'negotiations', 'office management', 'Oracle Financials', 'PeopleSoft', 'executive presentations', 'processes', 'coding', 'progress', 'Real Estate', 'reporting', 'SAP', 'strategic', 'strategic marketing', 'telecom'], A

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Trim and sample
# Feature text is capped at 1000 characters and at most 500 rows are kept.
# The min() guard avoids the ValueError that sample() raises when fewer than
# 500 rows are available.
df_all["features"] = df_all["features"].str[:1000]
df_all = df_all.sample(n=min(500, len(df_all)), random_state=42)

# Label Encoding
# The encoder is fitted on every sampled row, so labels seen only in the test
# split still have a valid integer code.
le = LabelEncoder()
df_all["future_title_encoded"] = le.fit_transform(df_all["future_title"])

# 80/20 split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df_all["features"], df_all["future_title_encoded"], test_size=0.2, random_state=42
)

# Pipeline with lighter config
# Unigram TF-IDF limited to 1000 terms, feeding a 50-tree random forest.
model = make_pipeline(
    TfidfVectorizer(max_features=1000, ngram_range=(1, 1)),
    RandomForestClassifier(n_estimators=50, random_state=42)
)

model.fit(X_train, y_train)


In [None]:
# Predict on the held-out split.
y_pred = model.predict(X_test)
# Report only the classes that occur in the test split; np.unique returns
# them sorted, which keeps target_names aligned with labels.
used_labels = np.unique(y_test)
target_names = le.inverse_transform(used_labels)
# zero_division=0 reports 0.0 for classes with no predicted samples, the same
# value as the default, but without an UndefinedMetricWarning per class.
print(
    classification_report(
        y_test,
        y_pred,
        labels=used_labels,
        target_names=target_names,
        zero_division=0,
    )
)

                                                                                                   precision    recall  f1-score   support

                                                                           Lead Software Engineer       0.50      1.00      0.67         1
                                                                                Marketing Manager       1.00      1.00      1.00         1
                                                                             Mid-level ACCOUNTANT       0.00      0.00      0.00         1
                                                                               Mid-level ADVOCATE       0.00      0.00      0.00         1
                                                                                Mid-level APPAREL       0.00      0.00      0.00         1
                                                         Mid-level Business Development Executive       1.00      1.00      1.00         3
                          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Mount Google Drive, then persist the fitted pipeline and the label encoder.
# Both files are needed to turn predictions back into title strings.
drive.mount('/content/drive')
joblib.dump(model, '/content/drive/MyDrive/career_trajectory_model_rf.pkl')
joblib.dump(le, '/content/drive/MyDrive/label_encoder.pkl')

Mounted at /content/drive


['/content/drive/MyDrive/label_encoder.pkl']