In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, top_k_accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack
import joblib


In [9]:
# Paths
BASE_DIR = os.getcwd()
DATA_PATH = os.path.join(BASE_DIR, "backend", "Dataset.csv")
ARTIFACTS_DIR = os.path.join(BASE_DIR, "backend", "artifacts")
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Load dataset
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.strip().str.lower()
print("✅ Dataset loaded:", df.shape)
df.head()


✅ Dataset loaded: (1000, 9)


Unnamed: 0,degree,major,cgpa,employed,experience,skills,certifications,industry preference,job role
0,MBA,Mechanical,6.06,yes,7,"SolidWorks, Thermodynamics",Data Science Certificate,Finance,Design Engineer
1,B.Tech,Mechanical,7.21,yes,5,"Thermodynamics, MATLAB, AutoCAD, SolidWorks",,Research,Manufacturing Engineer
2,MBA,Economics,6.66,no,7,"Data Analysis, R, Econometrics, Python, Statis...",PMP,IT,Business Analyst
3,MBA,Finance,6.95,no,9,"Financial Modeling, Excel, Risk Analysis",Six Sigma,Manufacturing,Investment Banker
4,M.Tech,Computer Science,7.15,no,4,"Java, Machine Learning, Python, C++",Data Science Certificate,Manufacturing,Backend Developer


In [10]:
df['certifications'] = df['certifications'].fillna("None")
df['skills'] = df['skills'].fillna("None")
df['degree'] = df['degree'].fillna("Unknown")
df['major'] = df['major'].fillna("Unknown")
df['cgpa'] = df['cgpa'].fillna(df['cgpa'].mean())
df['experience'] = df['experience'].fillna(0)
print("✅ Missing values handled")


✅ Missing values handled


In [11]:
df.info()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   degree               1000 non-null   object 
 1   major                1000 non-null   object 
 2   cgpa                 1000 non-null   float64
 3   employed             1000 non-null   object 
 4   experience           1000 non-null   int64  
 5   skills               1000 non-null   object 
 6   certifications       1000 non-null   object 
 7   industry preference  1000 non-null   object 
 8   job role             1000 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 70.4+ KB


degree                 0
major                  0
cgpa                   0
employed               0
experience             0
skills                 0
certifications         0
industry preference    0
job role               0
dtype: int64

In [12]:
for col in ["degree", "major", "skills", "certifications"]:
    df[col] = df[col].astype(str).str.lower().str.strip()
print("✅ Text normalized")


✅ Text normalized


In [13]:
df["num_skills"] = df["skills"].apply(lambda x: 0 if x=="none" else len([s.strip() for s in x.split(",") if s.strip()]))
df["num_certs"] = df["certifications"].apply(lambda x: 0 if x=="none" else len([c.strip() for c in x.split(",") if c.strip()]))
df["cgpa_x_exp"] = df["cgpa"] * df["experience"]
df["skills_x_certs"] = df["num_skills"] * df["num_certs"]
print("✅ Derived features created")


✅ Derived features created


In [14]:
# Profile text
df["profile_text"] = df["degree"] + " " + df["major"] + " " + df["skills"] + " " + df["certifications"]
print("✅ Profile text created")

# TF-IDF
vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1,3), min_df=1)
X_text = vectorizer.fit_transform(df["profile_text"])

# Numeric features
numeric_features = df[["cgpa", "experience", "num_skills", "num_certs", "cgpa_x_exp", "skills_x_certs"]].values
scaler = StandardScaler()
numeric_features = scaler.fit_transform(numeric_features)

# Combine text + numeric
X = hstack([X_text, numeric_features])
print("✅ Feature matrix shape:", X.shape)


✅ Profile text created
✅ Feature matrix shape: (1000, 767)


In [15]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["job role"])
print("Classes:", label_encoder.classes_)


Classes: ['Accountant' 'Backend Developer' 'Business Analyst' 'Data Analyst'
 'Data Scientist' 'Design Engineer' 'Economist' 'Electrical Engineer'
 'Electronics Engineer' 'Embedded Systems Engineer' 'Financial Analyst'
 'Investment Banker' 'Manufacturing Engineer' 'Mechanical Engineer'
 'Software Engineer']


In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (800, 767) Test: (200, 767)


In [17]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("✅ After SMOTE:", X_train_res.shape)


✅ After SMOTE: (1050, 767)


In [18]:
model = RandomForestClassifier(
    n_estimators=1200,
    max_depth=50,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42
)
model.fit(X_train_res, y_train_res)
print("✅ Model trained")


✅ Model trained


In [19]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

print("\n📊 Evaluation Results:")
print("Top-1 Accuracy:", accuracy_score(y_test, y_pred))
print("Top-3 Accuracy:", top_k_accuracy_score(y_test, y_proba, k=3))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring="accuracy", n_jobs=-1)
print("Cross-validation mean accuracy:", scores.mean())



📊 Evaluation Results:
Top-1 Accuracy: 0.86
Top-3 Accuracy: 1.0

Classification Report:
                            precision    recall  f1-score   support

               Accountant       0.86      0.86      0.86        14
        Backend Developer       1.00      0.86      0.92        14
         Business Analyst       0.67      0.80      0.73        10
             Data Analyst       1.00      1.00      1.00        13
           Data Scientist       1.00      1.00      1.00        10
          Design Engineer       0.79      0.88      0.83        17
                Economist       0.82      0.69      0.75        13
      Electrical Engineer       0.73      0.85      0.79        13
     Electronics Engineer       0.86      0.75      0.80        16
Embedded Systems Engineer       1.00      1.00      1.00        11
        Financial Analyst       0.78      0.78      0.78        18
        Investment Banker       0.86      0.86      0.86        14
   Manufacturing Engineer       0.83   

In [20]:
joblib.dump(model, os.path.join(ARTIFACTS_DIR, "job_role_model.pkl"))
joblib.dump(vectorizer, os.path.join(ARTIFACTS_DIR, "vectorizer.pkl"))
joblib.dump(scaler, os.path.join(ARTIFACTS_DIR, "scaler.pkl"))
joblib.dump(label_encoder, os.path.join(ARTIFACTS_DIR, "label_encoder.pkl"))
print("✅ All artifacts saved in:", ARTIFACTS_DIR)


✅ All artifacts saved in: c:\Users\Administrator\OneDrive\Desktop\edu2-predicting-job-role\backend\artifacts
