SIMPLE ML-BASED CAREER RECOMMENDER

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

1. LOAD DATA

In [2]:
df = pd.read_csv("Career_Dataset_10000_Realistic.csv")

# Combine Skills + Interests into a single ";"-separated text feature.
# Missing values are replaced with "" first: str + NaN yields NaN, and a NaN
# document makes the TfidfVectorizer used later in this notebook raise.
# When nothing is missing the result is identical to plain concatenation.
df["Combined_Text"] = df["Skills"].fillna("") + ";" + df["Interests"].fillna("")

# Feature frame and target column used by the train/test split.
X = df[["Age", "Education", "Combined_Text"]]
y = df["Recommended_Career"]

2. TRAIN / TEST SPLIT

In [5]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
print(f"\nTrain size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")


Train size: 8000, Test size: 2000


3. PREPROCESSING PIPELINE

In [7]:
# Column roles, declared once and reused below so the lists and the
# transformer definition cannot drift apart.
text_features = ["Combined_Text"]
numeric_features = ["Age"]
cat_features = ["Education"]

preprocessor = ColumnTransformer(
    transformers=[
        # TfidfVectorizer expects a 1-D sequence of strings, so the text column
        # is selected by its scalar name (a list selector would pass 2-D data).
        ("tfidf", TfidfVectorizer(
            max_features=1500,
            # Every run of non-";" characters is one token, so each skill or
            # interest entry is kept whole (inner whitespace included).
            token_pattern=r"[^;]+",
            lowercase=True
        ), text_features[0]),
        # Categories unseen during fit are encoded as all zeros instead of raising.
        ("ohe", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("scaler", StandardScaler(), numeric_features)
    ],
    # Stacked output stays sparse while its overall density is below this value.
    sparse_threshold=0.3
)

4. MODEL

In [8]:
# Random forest with 250 trees and no depth limit; seeded for repeatable
# results and allowed to use every available core.
model = RandomForestClassifier(
    n_estimators=250,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
)

# Chain preprocessing and the classifier so both are fitted and applied as one unit.
pipeline = Pipeline([("preprocess", preprocessor), ("classifier", model)])

5. TRAIN MODEL

In [9]:
# Fit the preprocessing transformers and the classifier together, using the training split only.
pipeline.fit(X_train, y_train)


6. EVALUATE

In [10]:
# Score the fitted pipeline on the held-out test split.
# NOTE(review): the recorded run reports 0.9995 accuracy; confirm the dataset's
# labels are not a near-deterministic function of the features before treating
# this as real-world performance.
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# A single print call with newline separators emits the same text as three
# consecutive prints.
print(
    f"\nModel Accuracy: {acc:.4f}\n",
    "Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


Model Accuracy: 0.9995

Classification Report:
                       precision    recall  f1-score   support

           Accountant       1.00      1.00      1.00       103
     Business Analyst       1.00      1.00      1.00       101
       Civil Engineer       1.00      1.00      1.00       111
       Content Writer       1.00      1.00      1.00       104
Cybersecurity Analyst       1.00      1.00      1.00        94
         Data Analyst       1.00      1.00      1.00       102
       Data Scientist       1.00      1.00      1.00        99
  Electrical Engineer       1.00      1.00      1.00        97
         Entrepreneur       1.00      1.00      1.00        96
     Graphic Designer       1.00      0.99      1.00       103
           HR Manager       1.00      1.00      1.00        98
  Marketing Executive       1.00      1.00      1.00        96
  Mechanical Engineer       1.00      1.00      1.00       104
   Operations Manager       1.00      1.00      1.00        99
      

7. SAVE PIPELINE

In [12]:
# Persist the whole fitted pipeline (preprocessing + classifier) as one artifact,
# so inference only needs this single file.
joblib.dump(value=pipeline, filename="career_recommender_model.joblib")
print("\nModel saved as career_recommender_model.joblib")


Model saved as career_recommender_model.joblib


8. DEFINE PREDICTION FUNCTION

In [25]:
import joblib
import numpy as np
import pandas as pd

# Reload the persisted pipeline from disk; this rebinds the `pipeline` name
# used by the training cells.
# NOTE(review): joblib artifacts are pickle-based, so loading one can execute
# arbitrary code — only load files from a trusted source.
pipeline = joblib.load("career_recommender_model.joblib")

def _as_token_list(items):
    """Normalize a skills/interests argument to a list of non-empty, stripped strings."""
    if items is None:
        return []
    if isinstance(items, str):
        # A bare string would otherwise be iterated character by character.
        items = [items]
    cleaned = (str(item).strip() for item in items)
    return [token for token in cleaned if token]


def predict_top_3_careers(age, education, skills_list, interests_list, model=None):
    """Return the three most probable careers for a single user profile.

    Parameters
    ----------
    age : int or float
        Value placed in the "Age" column of the one-row input frame.
    education : str
        Value placed in the "Education" column.
    skills_list, interests_list : iterable of str, a single str, or None
        Skill / interest names. Entries are stripped of surrounding whitespace
        and empty entries are dropped before being joined with ";".
    model : object, optional
        Fitted estimator exposing ``predict_proba`` and ``classes_``.
        Defaults to the module-level ``pipeline``.

    Returns
    -------
    list of tuple
        Up to three ``(career, probability)`` pairs, most probable first.
        Classes with equal probability keep their order in ``classes_``.
    """
    estimator = pipeline if model is None else model

    # NOTE(review): stripping assumes the training vocabulary has no padded
    # tokens (the text is split on ";" only) — confirm against the dataset.
    combined_text = ";".join(
        _as_token_list(skills_list) + _as_token_list(interests_list)
    )

    # The estimator expects the same column names it was trained on.
    user_df = pd.DataFrame([{
        "Age": age,
        "Education": education,
        "Combined_Text": combined_text
    }])

    proba = np.asarray(estimator.predict_proba(user_df)[0], dtype=float)
    classes = estimator.classes_

    # Stable sort on the negated probabilities: highest first, with ties
    # resolved deterministically by class order rather than by sort internals.
    top3_idx = np.argsort(-proba, kind="stable")[:3]

    # Return tuples [(career, probability), ...]
    return [(classes[i], float(proba[i])) for i in top3_idx]


In [26]:
# Try the helper on a sample profile and list the ranked suggestions.
results = predict_top_3_careers(
    age=25,
    education="Bachelor's",
    skills_list=["Python", "Data Analysis", "Machine Learning", "Critical Thinking"],
    interests_list=["Technology", "Data Science", "Innovation"],
)

print("\nExample user prediction (Top 3):")
# One line per (career, probability) pair, probability shown with two decimals.
print("\n".join(f"- {career} ({prob:.2f})" for career, prob in results))



Example user prediction (Top 3):
- Data Scientist (0.92)
- Software Developer (0.08)
- Teacher (0.00)
