In [3]:
# --- 1. Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [4]:
# --- 2. Load Dataset ---
df = pd.read_csv(r"D:\Career Guidance\Dataset\Vocational_dataset\vocational_courses_with_careers.csv")

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head())

Dataset Shape: (1000, 24)
Columns: ['Course', 'Area of Interest', 'Fine Arts', 'Creativity', 'Communication', 'Empathy', 'English', 'Sociology', 'Memory', 'Computer Science', 'Mathematics', 'Statistics', 'Accountancy', 'Business Studies', 'Economics', 'Psychology', 'Hindi', 'Geography', 'Biology', 'Chemistry', 'Logical Reasoning', 'Critical Thinking', 'Numerical Aptitude', 'Career Options']


Unnamed: 0,Course,Area of Interest,Fine Arts,Creativity,Communication,Empathy,English,Sociology,Memory,Computer Science,...,Economics,Psychology,Hindi,Geography,Biology,Chemistry,Logical Reasoning,Critical Thinking,Numerical Aptitude,Career Options
0,Diploma in Performing Arts (Dance/Theatre),Performing Arts & Entertainment,87,96,94,82,69.0,69,80,53,...,55,60,46,41,47,51,42,52,55,"Professional Dancer / Actor, Drama Instructor,..."
1,Diploma in Computer Applications (DCA),Computer Skills & IT Basics,48,49,67,60,67.0,60,49,89,...,46,45,58,42,54,54,90,66,94,"IT Support Executive, Data Entry / Office Auto..."
2,Diploma in Paramedical Sciences,Healthcare & Allied Services,40,51,73,85,,74,81,49,...,50,57,40,45,82,82,49,66,46,"Lab Technician, Radiology Assistant, Emergency..."
3,Diploma in Journalism & Mass Communication,Media & Communication,52,76,94,43,92.0,83,60,56,...,58,61,55,43,44,46,75,81,51,"Journalist / Reporter, News Anchor / Editor, P..."
4,Diploma in Animation & Multimedia,"Animation, VFX & Multimedia",93,97,63,48,73.0,41,58,83,...,53,50,49,59,55,41,71,98,42,"Animator / VFX Artist, Multimedia Designer, Ga..."


In [5]:
# --- 3. Data Cleaning ---
# Fill missing values (if any) with median for numeric columns
df.fillna(df.median(numeric_only=True), inplace=True)


In [None]:
# --- 4. Feature Preparation ---
features = df.select_dtypes(include=[np.number])   # numerical features (subjects & skills)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

# Encode courses (classification target)
le_course = LabelEncoder()
y_course = le_course.fit_transform(df["Course"])

# Encode career options (multi-label if needed later)
mlb = MultiLabelBinarizer()
y_career = mlb.fit_transform(df["Career Options"].str.split(", "))

In [7]:
# --- 5. Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_course, test_size=0.2, random_state=42)

In [None]:
# --- 6. Supervised Model (Classifier) ---
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("\n--- Classification Report (Course Prediction) ---\n")
print(classification_report(y_test, y_pred, target_names=le_course.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))


--- Classification Report (Course Prediction) ---

                                            precision    recall  f1-score   support

         Diploma in Animation & Multimedia       0.46      0.60      0.52        10
    Diploma in Computer Applications (DCA)       1.00      1.00      1.00        11
 Diploma in Culinary Arts (Hotel Catering)       1.00      1.00      1.00        10
              Diploma in Digital Marketing       1.00      1.00      1.00        15
               Diploma in Event Management       1.00      1.00      1.00        15
              Diploma in Fashion Designing       1.00      1.00      1.00        11
              Diploma in Graphic Designing       0.71      0.59      0.65        17
               Diploma in Hotel Management       1.00      1.00      1.00        17
             Diploma in Interior Designing       0.94      1.00      0.97        15
Diploma in Journalism & Mass Communication       1.00      1.00      1.00        15
           Diploma in P

In [None]:
# --- 7. Hybrid Recommendation System ---
def recommend_courses(user_profile, top_n=5):
    """
    Recommend Top-N courses for a student profile, with career options
    """
    # Scale user input
    user_scaled = scaler.transform([user_profile])
    
    # Cosine similarity with dataset
    sims = cosine_similarity(user_scaled, X_scaled)[0]
    df_temp = df.copy()
    df_temp["similarity"] = sims
    
    # Aggregate similarity per course
    course_scores = df_temp.groupby("Course")["similarity"].mean().reset_index()
    top_courses = course_scores.sort_values("similarity", ascending=False).head(top_n)
    
    # Collect career options
    recommendations = []
    for _, row in top_courses.iterrows():
        course = row["Course"]
        sim = row["similarity"]
        careers = df_temp[df_temp["Course"] == course]["Career Options"].unique()
        recommendations.append({
            "Course": course,
            "Similarity": round(sim, 3),
            "Career Options": ", ".join(careers)
        })
    
    return pd.DataFrame(recommendations)

In [None]:
# --- 8. Test Recommendation System ---
# Example student profile (first row of dataset)
sample_student = features.iloc[0].values
recommendations = recommend_courses(sample_student, top_n=5)

print("\n--- Top 5 Recommended Courses & Careers ---\n")
print(recommendations)


--- Top 5 Recommended Courses & Careers ---

                                       Course  Similarity  \
0  Diploma in Performing Arts (Dance/Theatre)       0.873   
1   Diploma in Culinary Arts (Hotel Catering)       0.830   
2                Diploma in Fashion Designing       0.806   
3                      Diploma in Photography       0.782   
4               Diploma in Interior Designing       0.780   

                                      Career Options  
0  Professional Dancer / Actor, Drama Instructor,...  
1  Chef / Sous Chef, Food & Beverage Manager, Cat...  
2  Fashion Designer, Textile / Apparel Merchandis...  
3  Professional Photographer, Photojournalist, Fa...  
4  Interior Designer, Space Planner, Furniture De...  




In [12]:
# --- 9. Save Necessary Elements for Streamlit ---
with open("vocational_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

with open("vocational_course_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

with open("vocational_label_encoder.pkl", "wb") as f:
    pickle.dump(le_course, f)

with open("vocational_feature_columns.pkl", "wb") as f:
    pickle.dump(features.columns.tolist(), f)

with open("vocational_dataset.pkl", "wb") as f:
    pickle.dump(df, f)

print("✅ All necessary elements saved as pickle files")

✅ All necessary elements saved as pickle files
