In [36]:
# --- 1. Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [37]:
# --- 2. Load Dataset ---
df = pd.read_csv(r"D:\Career Guidance\Dataset\pcb_dataset\pcb_courses_with_careers_single_option.csv")

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head())

Dataset Shape: (2000, 20)
Columns: ['Course', 'Physics', 'Chemistry', 'Biology', 'Math', 'English', 'Hindi', 'Psychology', 'CS', 'Logical Reasoning', 'Analytical Thinking', 'Critical Thinking', 'Problem-Solving', 'Communication', 'Creativity', 'Research Skills', 'Numerical Aptitude', 'Empathy', 'Attention to Detail', 'Career Option']


Unnamed: 0,Course,Physics,Chemistry,Biology,Math,English,Hindi,Psychology,CS,Logical Reasoning,Analytical Thinking,Critical Thinking,Problem-Solving,Communication,Creativity,Research Skills,Numerical Aptitude,Empathy,Attention to Detail,Career Option
0,MBBS,64,80,86,55,77,55,59,51,53,45,86,88,67,52,63,57,94,42,Public Health Specialist
1,MBBS,66,88,98,50,74,50,57,58,58,40,83,95,67,43,68,53,91,55,Public Health Specialist
2,MBBS,71,82,80,43,78,49,57,48,49,46,98,85,74,40,68,52,93,56,Medical Researcher
3,MBBS,65,85,83,55,72,47,40,45,45,51,81,98,77,52,66,51,81,45,Medical Educator
4,MBBS,71,81,81,40,71,59,60,55,41,56,82,83,73,59,80,56,94,43,Medical Writer


In [38]:
df = df.sample(2000)

In [41]:
df.head()

Unnamed: 0,Course,Physics,Chemistry,Biology,Math,English,Hindi,Psychology,CS,Logical Reasoning,Analytical Thinking,Critical Thinking,Problem-Solving,Communication,Creativity,Research Skills,Numerical Aptitude,Empathy,Attention to Detail,Career Option
691,BPT (Physiotherapy),70,81,80,45,65,53,45,42,49,43,79,80,86,56,41,40,94,59,Pediatric Physiotherapist
1686,B.Sc. Food Technology,74,96,87,58,62,54,57,77,54,72,48,95,57,48,83,47,52,47,Quality Assurance Manager
1014,B.Sc. Microbiology,76,96,88,49,64,54,49,79,42,73,97,53,42,54,94,47,58,46,Medical Laboratory Scientist
1126,B.Sc. Biochemistry,68,95,98,53,60,57,57,71,45,87,50,78,55,44,89,46,55,43,Research Scientist
571,BNYS,79,87,92,58,62,53,53,53,44,44,78,47,84,50,67,49,88,52,Clinical Practitioner (Naturopathy)


In [39]:
# --- 3. Feature Preparation ---
# Select only numeric columns for similarity + classification
features = df.select_dtypes(include=[np.number])
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

# Encode courses (classification target)
le_course = LabelEncoder()
y_course = le_course.fit_transform(df["Course"])

# Encode career options (multi-label if needed later)
mlb = MultiLabelBinarizer()
y_career = mlb.fit_transform(df["Career Option"].str.split(", "))

In [42]:
# --- 4. Feature Preparation ---
features = df.select_dtypes(include=[np.number])

# Fix: fill missing values with column mean
features = features.fillna(features.mean())

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

# Encode course labels
le_course = LabelEncoder()
y_course = le_course.fit_transform(df["Course"])

# Encode Career Options (multi-label)
mlb = MultiLabelBinarizer()
y_career = mlb.fit_transform(df["Career Option"].str.split(", "))

In [43]:
# --- 5. Supervised Model (for evaluation) ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_course, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("\n--- Classification Report (Course Prediction) ---\n")
print(classification_report(y_test, y_pred, target_names=le_course.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))


--- Classification Report (Course Prediction) ---

                                      precision    recall  f1-score   support

                         B.Optometry       0.75      0.22      0.34        27
                             B.Pharm       1.00      1.00      1.00        23
                   B.Sc. Agriculture       1.00      1.00      1.00        20
                  B.Sc. Biochemistry       1.00      1.00      1.00        14
                 B.Sc. Biotechnology       1.00      1.00      1.00        26
         B.Sc. Environmental Science       1.00      1.00      1.00        22
               B.Sc. Food Technology       1.00      1.00      1.00        24
              B.Sc. Forensic Science       0.94      1.00      0.97        16
                      B.Sc. Forestry       1.00      1.00      1.00        19
                      B.Sc. Genetics       0.58      0.61      0.59        18
                  B.Sc. Microbiology       0.61      0.58      0.59        19
           

In [44]:
# --- 5. Hybrid Recommendation System ---
def recommend_courses(user_profile, top_n=5):
    """
    Recommend Top-N courses for a student profile, with career options
    """
    # Scale user input
    user_scaled = scaler.transform([user_profile])
    
    # Cosine similarity with dataset
    sims = cosine_similarity(user_scaled, X_scaled)[0]
    df_temp = df.copy()
    df_temp["similarity"] = sims
    
    # Aggregate similarity per course
    course_scores = df_temp.groupby("Course")["similarity"].mean().reset_index()
    top_courses = course_scores.sort_values("similarity", ascending=False).head(top_n)
    
    # Collect career options
    recommendations = []
    for _, row in top_courses.iterrows():
        course = row["Course"]
        sim = row["similarity"]
        careers = df_temp[df_temp["Course"] == course]["Career Option"].unique()
        recommendations.append({
            "Course": course,
            "Similarity": round(sim, 3),
            "Career Option": ", ".join(careers)
        })
    
    return pd.DataFrame(recommendations)

# --- 6. Test Recommendation System ---
# Example student profile
sample_student = features.iloc[0].values
recommendations = recommend_courses(sample_student, top_n=5)

print("\n--- Top 5 Recommended Courses & Careers ---\n")
print(recommendations)


--- Top 5 Recommended Courses & Careers ---

                Course  Similarity  \
0          B.Optometry       0.828   
1  BPT (Physiotherapy)       0.826   
2                 MBBS       0.810   
3        B.Sc. Nursing       0.782   
4                 BNYS       0.777   

                                       Career Option  
0  Optometrist, Optical Product Specialist, Optom...  
1  Pediatric Physiotherapist, Physiotherapist, Sp...  
2  Medical Educator, Medical Researcher, Medical ...  
3  Critical Care Nurse, Public Health Nurse, Nurs...  
4  Clinical Practitioner (Naturopathy), Naturopat...  




In [45]:
# --- 7. Test Recommendation ---
sample_student = features.iloc[0].values
recs = recommend_courses(sample_student, top_n=5)
print("\n--- Top 5 Recommended Courses & Careers ---\n")
print(recs)



--- Top 5 Recommended Courses & Careers ---

                Course  Similarity  \
0          B.Optometry       0.828   
1  BPT (Physiotherapy)       0.826   
2                 MBBS       0.810   
3        B.Sc. Nursing       0.782   
4                 BNYS       0.777   

                                       Career Option  
0  Optometrist, Optical Product Specialist, Optom...  
1  Pediatric Physiotherapist, Physiotherapist, Sp...  
2  Medical Educator, Medical Researcher, Medical ...  
3  Critical Care Nurse, Public Health Nurse, Nurs...  
4  Clinical Practitioner (Naturopathy), Naturopat...  




In [46]:
# --- 7. Test Recommendation ---
sample_student = features.iloc[0].values
recs = recommend_courses(sample_student, top_n=5)
print("\n--- Top 5 Recommended Courses & Careers ---\n")
print(recs)


--- Top 5 Recommended Courses & Careers ---

                Course  Similarity  \
0          B.Optometry       0.828   
1  BPT (Physiotherapy)       0.826   
2                 MBBS       0.810   
3        B.Sc. Nursing       0.782   
4                 BNYS       0.777   

                                       Career Option  
0  Optometrist, Optical Product Specialist, Optom...  
1  Pediatric Physiotherapist, Physiotherapist, Sp...  
2  Medical Educator, Medical Researcher, Medical ...  
3  Critical Care Nurse, Public Health Nurse, Nurs...  
4  Clinical Practitioner (Naturopathy), Naturopat...  




In [47]:
# --- 8. Save Necessary Elements as Pickle ---
with open("pcb_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

with open("pcb_course_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

with open("pcb_label_encoder.pkl", "wb") as f:
    pickle.dump(le_course, f)

with open("pcb_feature_columns.pkl", "wb") as f:
    pickle.dump(features.columns.tolist(), f)

with open("pcb_dataset.pkl", "wb") as f:
    pickle.dump(df, f)

print("✅ PCB model, scaler, encoder, dataset saved as pickle files!")

✅ PCB model, scaler, encoder, dataset saved as pickle files!
