In [1]:
# --- 1. Import Libraries ---
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.utils.multiclass import unique_labels

In [2]:
# --- 2. Load Dataset ---
# The CSV location defaults to the path originally used by this notebook, but
# can be overridden with the PCM_DATASET_PATH environment variable so the
# notebook can run on another machine without editing this cell.
DATASET_PATH = os.environ.get(
    "PCM_DATASET_PATH",
    r"D:\Career Guidance\Dataset\pcm_dataset\pcm_courses_with_careers.csv",
)
df = pd.read_csv(DATASET_PATH)

# Fail early (with a readable message) if the columns used by later cells are missing.
required_columns = {"Suggested_Course", "Career Options"}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"Dataset is missing required columns: {sorted(missing_columns)}"

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head())

Dataset Shape: (500, 17)
Columns: ['Physics', 'Chemistry', 'Math', 'CS', 'Statistics', 'English', 'Hindi', 'Regional', 'PE', 'Logical', 'Analytical', 'ProblemSolving', 'Numerical', 'Verbal', 'Interest_Area', 'Suggested_Course', 'Career Options']


Unnamed: 0,Physics,Chemistry,Math,CS,Statistics,English,Hindi,Regional,PE,Logical,Analytical,ProblemSolving,Numerical,Verbal,Interest_Area,Suggested_Course,Career Options
0,98,50,92,92,77,69,48,56,44,87,95,87,100,66,Engineering/CS/IT,CSE-AIML,"AI Engineer, Machine Learning Engineer, Data S..."
1,98,42,85,43,44,62,42,41,83,76,74,96,81,47,Aviation/Defence,NDA,"Army Officer, Navy Officer, Air Force Officer,..."
2,57,53,99,61,82,74,41,55,49,61,100,47,99,44,Science/Research,B.Sc Statistics,"Statistician, Data Analyst, Risk Analyst, Mark..."
3,43,58,99,79,88,71,45,59,50,63,95,43,85,45,Science/Research,B.Sc Statistics,"Statistician, Data Analyst, Risk Analyst, Mark..."
4,95,92,88,47,47,77,54,45,45,60,91,89,84,44,Engineering/CS/IT,Civil,"Civil Engineer, Structural Engineer, Construct..."


In [3]:
# --- 3. Feature Preparation ---
# Every numeric column is used as an input feature; the scaler is fitted on
# the full dataset and then applied to it.
features = df.select_dtypes(include="number")
scaler = MinMaxScaler().fit(features)
X_scaled = scaler.transform(features)

# Classification target: integer-encoded Suggested_Course labels
course_labels = df["Suggested_Course"]
le_course = LabelEncoder().fit(course_labels)
y_course = le_course.transform(course_labels)

# Multi-label encoding of the ", "-separated career options (kept in case it is needed later)
career_lists = df["Career Options"].str.split(", ")
mlb = MultiLabelBinarizer()
y_career = mlb.fit_transform(career_lists)

In [4]:
# --- 4. Supervised Model (For Evaluation) ---
# Hold out 20% of the rows to evaluate course prediction from the scaled
# features. Both the split and the forest use a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_course, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("\n--- Classification Report (Course Prediction) ---\n")
# Report only on the classes present in y_test or y_pred, and look up their
# names through the same index array so `labels` and `target_names` line up.
labels_used = unique_labels(y_test, y_pred)
print(classification_report(
    y_test, y_pred,
    labels=labels_used,
    target_names=le_course.classes_[labels_used]
))
print("Accuracy:", accuracy_score(y_test, y_pred))


--- Classification Report (Course Prediction) ---

                           precision    recall  f1-score   support

                   B.Arch       1.00      1.00      1.00         8
         B.Sc Aeronautics       1.00      1.00      1.00         5
                  B.Sc IT       0.67      0.57      0.62         7
         B.Sc Mathematics       1.00      1.00      1.00         5
             B.Sc Physics       1.00      1.00      1.00         4
          B.Sc Statistics       1.00      1.00      1.00         9
                      BCA       1.00      1.00      1.00         3
                      CSE       0.60      0.43      0.50         7
                 CSE-AIML       0.43      0.60      0.50         5
                   CSE-DS       0.25      0.33      0.29         3
                    Civil       0.43      0.43      0.43         7
Commercial Pilot Training       1.00      1.00      1.00         8
                      ECE       1.00      1.00      1.00         7
         

In [5]:
# --- 5. Hybrid Recommendation System ---
def recommend_courses(user_profile, top_n=5):
    """
    Recommend the Top-N courses for a student profile, with career options.

    The profile is scaled with the notebook-level ``scaler``, compared with
    every row of ``X_scaled`` by cosine similarity, and the similarities are
    averaged per ``Suggested_Course`` of ``df``. The ``top_n`` courses with the
    highest mean similarity are returned.

    Parameters
    ----------
    user_profile : array-like
        Raw (unscaled) feature values for one student, in the same column
        order as the data ``scaler`` was fitted on.
    top_n : int, default 5
        Maximum number of courses to return. Must not be negative.

    Returns
    -------
    pandas.DataFrame
        One row per recommended course with columns ``"Course"``,
        ``"Similarity"`` (mean cosine similarity rounded to 3 decimals) and
        ``"Career Options"`` (the distinct career strings listed for that
        course, joined with ``", "``). The columns are present even when no
        rows are returned.

    Raises
    ------
    ValueError
        If ``top_n`` is negative or the profile does not have one value per
        feature column.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Normalise the input to a single-row 2-D array and validate its width.
    profile = np.asarray(user_profile, dtype=float).reshape(1, -1)
    n_expected = X_scaled.shape[1]
    if profile.shape[1] != n_expected:
        raise ValueError(
            f"user_profile has {profile.shape[1]} values, expected {n_expected}"
        )

    # When the scaler was fitted on a DataFrame it remembers the column names;
    # passing the same names back avoids scikit-learn's feature-name warning.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        profile = pd.DataFrame(profile, columns=feature_names)
    user_scaled = scaler.transform(profile)

    # Cosine similarity of the student against every dataset row.
    sims = cosine_similarity(user_scaled, X_scaled)[0]

    # Only the columns needed below are gathered, instead of copying all of df.
    scored = pd.DataFrame({
        "Suggested_Course": df["Suggested_Course"].to_numpy(),
        "Career Options": df["Career Options"].to_numpy(),
        "similarity": sims,
    })

    # Aggregate similarity per course and keep the best-matching ones.
    course_scores = scored.groupby("Suggested_Course")["similarity"].mean()
    top_courses = course_scores.sort_values(ascending=False).head(top_n)

    # Collect the distinct career strings of each selected course.
    recommendations = []
    for course, sim in top_courses.items():
        is_course = scored["Suggested_Course"] == course
        # dropna/str keep the join from failing on a missing career entry.
        careers = scored.loc[is_course, "Career Options"].dropna().unique()
        recommendations.append({
            "Course": course,
            "Similarity": round(float(sim), 3),
            "Career Options": ", ".join(map(str, careers)),
        })

    return pd.DataFrame(
        recommendations, columns=["Course", "Similarity", "Career Options"]
    )


In [6]:
# --- 6. Test Recommendation System ---
# Example student profile: the raw (unscaled) feature values of the first dataset row
sample_student = features.iloc[0].to_numpy()
recommendations = recommend_courses(sample_student, top_n=5)

print("\n--- Top 5 Recommended Courses & Careers ---\n")
# display() renders the frame as a table, consistent with the dataset preview
# cell; print() wraps and truncates wide frames in plain text.
display(recommendations)


--- Top 5 Recommended Courses & Careers ---

     Course  Similarity                                     Career Options
0  CSE-AIML       0.959  AI Engineer, Machine Learning Engineer, Data S...
1       CSE       0.949  Software Engineer, Full-Stack Developer, Cyber...
2   B.Sc IT       0.910  IT Analyst, Software Developer, System Adminis...
3    CSE-DS       0.901  Data Scientist, Machine Learning Engineer, Big...
4       ECE       0.892  Electronics Engineer, Telecom Engineer, VLSI D...




In [7]:
# --- 7. Test on Multiple Sample Students ---
# The row positions below are fixed (not randomly drawn), so this cell's
# output is reproducible across runs.
for i in [10, 50, 100]:
    print(f"\n=== Test Student #{i} ===")
    recs = recommend_courses(features.iloc[i].to_numpy(), top_n=5)
    # display() renders each frame as a table instead of wrapped plain text.
    display(recs)



=== Test Student #10 ===
         Course  Similarity                                     Career Options
0           EEE       0.929  Electrical Engineer, Electronics Design Engine...
1         Civil       0.928  Civil Engineer, Structural Engineer, Construct...
2    Mechanical       0.917  Mechanical Engineer, Automotive Engineer, Robo...
3      Chemical       0.911  Chemical Engineer, Process Engineer, Petroleum...
4  B.Sc Physics       0.827  Physicist, Research Scientist, Data Analyst, M...

=== Test Student #50 ===
            Course  Similarity  \
0  B.Sc Statistics       0.905   
1              ECE       0.835   
2          B.Sc IT       0.822   
3           CSE-DS       0.821   
4         CSE-AIML       0.797   

                                      Career Options  
0  Statistician, Data Analyst, Risk Analyst, Mark...  
1  Electronics Engineer, Telecom Engineer, VLSI D...  
2  IT Analyst, Software Developer, System Adminis...  
3  Data Scientist, Machine Learning Engineer, Big



In [9]:
# --- 8. Save Necessary Elements for Streamlit ---
# Output file name -> object to pickle. Files are written in this order, each
# relative to the current working directory.
artifacts = {
    "pcm_scaler.pkl": scaler,                               # fitted scaler
    "pcm_course_classifier.pkl": clf,                       # trained classifier
    "pcm_label_encoder.pkl": le_course,                     # course LabelEncoder
    "pcm_feature_columns.pkl": features.columns.tolist(),   # feature column names
    "pcm_dataset.pkl": df,                                  # dataset (needed for similarity + careers)
}

for file_name, artifact in artifacts.items():
    with open(file_name, "wb") as handle:
        pickle.dump(artifact, handle)

print("✅ All necessary elements saved as pickle files")

✅ All necessary elements saved as pickle files
