# **AI Virtual Career Counsellor**

## Random Forest

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, classification_report
from sklearn.multiclass import OneVsRestClassifier

In [22]:
# Fetch the cleaned career-recommender dataset straight from GitHub
url = (
    "https://raw.githubusercontent.com/SiddardhaShayini/Career-Recommender-Dataset/"
    "refs/heads/main/dataset/cleaned_dataset.csv"
)
df = pd.read_csv(url)

In [23]:
# Features and targets
# Pick the target columns by name instead of by position: `iloc[:, :-2]`
# only works while both targets happen to be the last two CSV columns, and
# would otherwise leak a target into X (or drop a real feature) silently.
TARGET_COLUMNS = ['Courses', 'Career_Options']
X = df.drop(columns=TARGET_COLUMNS)  # every remaining column is a feature
y_courses = df['Courses']  # single-label target (multi-class task)
y_career_options_raw = df['Career_Options']  # raw string target (multi-label task)

In [24]:
# ----- Multi-Class: Courses -----
# Map every course name to an integer class id. The fitted encoder is kept
# so predicted ids can be translated back to course names later on.
le_courses = LabelEncoder()
le_courses.fit(y_courses)
y_courses_encoded = le_courses.transform(y_courses)

In [25]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(
    X,
    y_courses_encoded,
    test_size=0.2,
    random_state=42,
)


In [26]:
# Fit a 100-tree random forest on the encoded course labels (seeded for repeatability)
rf_mc = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_mc.fit(X=X_train_mc, y=y_train_mc)

In [27]:
# Score the course model on the held-out rows
y_pred_mc = rf_mc.predict(X_test_mc)
print("=== Multi-Class (Courses) Evaluation ===")
accuracy_mc = accuracy_score(y_test_mc, y_pred_mc)
print("Accuracy:", accuracy_mc)
# Macro averaging weights every course equally, however rare it is
macro_f1_mc = f1_score(y_test_mc, y_pred_mc, average='macro')
print("Macro F1 Score:", macro_f1_mc)
print()

=== Multi-Class (Courses) Evaluation ===
Accuracy: 0.9943502824858758
Macro F1 Score: 0.9416033369521741



In [28]:
# ----- Multi-Label: Career Options -----
# Each row holds its career options as one ", "-separated string: break it
# into a list of labels per row, then one-hot encode those lists into a
# binary indicator matrix (one column per distinct career label).
y_career_options_split = y_career_options_raw.str.split(pat=', ')
mlb = MultiLabelBinarizer()
y_career_encoded = mlb.fit_transform(y_career_options_split)

In [29]:
# Same 80/20 split and seed as the multi-class task, now with the indicator matrix as target
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(
    X,
    y_career_encoded,
    test_size=0.2,
    random_state=42,
)


In [30]:
# Multi-label model: one independent random forest per career label (one-vs-rest)
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
rf_ml = OneVsRestClassifier(base_forest)
rf_ml.fit(X_train_ml, y_train_ml)



In [31]:
# Predict and evaluate
y_pred_ml = rf_ml.predict(X_test_ml)
print("=== Multi-Label (Career Options) Evaluation ===")
# Hamming loss = fraction of individual label slots predicted incorrectly
print("Hamming Loss:", hamming_loss(y_test_ml, y_pred_ml))
# With the default zero_division="warn", scikit-learn emits an
# UndefinedMetricWarning whenever a label's F-score is undefined (no true and
# no predicted positives in the test split) and scores that label as 0.
# Passing zero_division=0 keeps the same score but makes the choice explicit
# and keeps the warning out of the cell output.
print("Macro F1 Score:", f1_score(y_test_ml, y_pred_ml, average='macro', zero_division=0))

=== Multi-Label (Career Options) Evaluation ===
Hamming Loss: 0.0002711864406779661
Macro F1 Score: 0.8121299519997194


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


###  Model Performance Summary

####  Multi-Class (Courses)

* **Accuracy: 99.4%**
* **Macro F1 Score: 94.2%**

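**Excellent performance.** The model is highly accurate and largely balanced across classes. A macro F1 above 90% means it does well even on the less frequent courses, although the gap between accuracy (99.4%) and macro F1 (94.2%) suggests that a few rare courses are predicted less reliably than the common ones.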

####  Multi-Label (Career Options)

* **Hamming Loss: 0.00027** (very low = good)
* **Macro F1 Score: 81.2%**

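Also **very strong** performance for a multi-label problem. A macro F1 of 81% means the model reliably predicts multiple correct career options for most inputs. Note that the Hamming loss is likely so small partly because each sample carries only a few of the many possible career labels, so macro F1 is the more informative figure here. The warning emitted during evaluation indicates that at least one label had an undefined F-score in the test split and was counted as 0, which pulls the macro average down.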

---

###  Interpretation

* **Shuffling** (done by `train_test_split`, which shuffles rows by default) helps — it avoids any training bias from the original row ordering.
* **Random Forest** is doing a great job out of the box — good choice for this dataset with binary input features.
* If needed, performance could be boosted further with:

  * Hyperparameter tuning
  * Feature selection or dimensionality reduction
  * Using advanced models (like XGBoost for `Courses` or deep learning for `Career_Options`)




In [32]:
import joblib

# Persist the fitted multi-class (Courses) classifier
joblib.dump(value=rf_mc, filename="random_forest_courses_model.pkl")

# Persist the fitted multi-label (Career Options) classifier
joblib.dump(value=rf_ml, filename="random_forest_career_options_model.pkl")

# Persist the fitted encoders as well, so predictions can be decoded back
# into course names and career labels after reloading
joblib.dump(value=le_courses, filename="courses_label_encoder.pkl")
joblib.dump(value=mlb, filename="career_options_mlb.pkl")


['career_options_mlb.pkl']

In [33]:
# Download all saved files to the local machine.
# `google.colab` is only importable inside a Colab runtime, so import it
# defensively: an unconditional import raises ImportError everywhere else
# and stops a Restart & Run All at this cell.
try:
    from google.colab import files
except ImportError:
    files = None
    print("Not running in Google Colab - skipping browser download; "
          "the .pkl files are in the current working directory.")

if files is not None:
    # Same artifacts, in the same order, as written by the save step
    for artifact_path in (
        "random_forest_courses_model.pkl",
        "random_forest_career_options_model.pkl",
        "courses_label_encoder.pkl",
        "career_options_mlb.pkl",
    ):
        files.download(artifact_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
import joblib
import numpy as np

# Restore the persisted estimators and label encoders from disk.
# NOTE: joblib files are pickle-based - only load artifacts you created or trust.
rf_mc = joblib.load(filename="random_forest_courses_model.pkl")
rf_ml = joblib.load(filename="random_forest_career_options_model.pkl")
le_courses = joblib.load(filename="courses_label_encoder.pkl")
mlb = joblib.load(filename="career_options_mlb.pkl")

In [36]:
# Example feature vector: 59 binary flags. The length MUST be 59 and the
# order must match the original training features exactly.
sample_input = [
    0, 1, 0, 0, 0, 1, 0, 0, 1, 0,  # values 1-10
    0, 0, 0, 1, 0, 1, 0, 0, 1, 0,  # values 11-20
    0, 0, 0, 0, 0, 1, 0, 0, 1, 0,  # values 21-30
    0, 0, 0, 0, 1, 0, 0, 0, 0, 1,  # values 31-40
    0, 0, 1, 0, 0, 0, 1, 0, 1, 0,  # values 41-50
    0, 1, 0, 0, 0, 1, 0, 1, 0,     # values 51-59
]

In [37]:
# scikit-learn expects a 2-D input: wrap the flat list as one row (1 sample, 59 features)
sample_input_np = np.reshape(sample_input, (1, -1))

In [38]:
# Predict the encoded course id for the single sample, then decode it to its name
predicted_course_ids = rf_mc.predict(sample_input_np)
predicted_course_index = predicted_course_ids[0]
decoded_courses = le_courses.inverse_transform([predicted_course_index])
predicted_course = decoded_courses[0]



In [39]:
# Predict the binary label-indicator row, then turn it back into career names
predicted_career_binary = rf_ml.predict(sample_input_np)
career_label_sets = mlb.inverse_transform(predicted_career_binary)
predicted_careers = career_label_sets[0]  # labels for the first (only) sample



In [40]:
# Report both predictions for the sample
print(f"Predicted Course: {predicted_course}")
print(f"Suggested Career Options: {predicted_careers}")


Predicted Course: B.Sc- Nursing
Suggested Career Options: ('Event Manager', 'PR Executive', 'Wedding Planner')
