In [1]:
# --- 1. Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:

# --- 2. Load Dataset ---
df = pd.read_csv(r"D:\Career Guidance\Dataset\Commerce_dataset\commerce_courses_with_careers.csv")

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head())

Dataset Shape: (1000, 20)
Columns: ['Course', 'Area of Interest', 'Accountancy', 'Business Studies', 'Economics', 'Mathematics', 'Statistics', 'Computer Science', 'English', 'Psychology', 'Sociology', 'Hindi', 'Logical Reasoning', 'Numerical Aptitude', 'Communication', 'Critical Thinking', 'Creativity', 'Empathy', 'Memory', 'Career Options']


Unnamed: 0,Course,Area of Interest,Accountancy,Business Studies,Economics,Mathematics,Statistics,Computer Science,English,Psychology,Sociology,Hindi,Logical Reasoning,Numerical Aptitude,Communication,Critical Thinking,Creativity,Empathy,Memory,Career Options
0,BBA,Management & Administration,63,88,85,58,53,51,74,41,79,43,97,42,88,99,44,54,59,"Business Development Executive, Marketing Mana..."
1,B.Com (E-Commerce),Digital Business,96,81,65,63,47,93,76,54,47,59,96,97,40,52,53,56,51,"E-Commerce Specialist, Digital Marketing Analy..."
2,M.Com,Advanced Commerce & Research,80,64,99,80,88,59,57,60,53,41,59,48,73,97,42,55,100,"Professor / Lecturer, Financial Analyst, Inves..."
3,BMS,Strategic Management,80,95,96,49,51,49,76,50,61,47,86,56,46,83,45,47,95,"Management Trainee, Business Consultant, Opera..."
4,B.Com (Computer Applications),Business & IT,100,89,45,74,58,99,65,44,60,56,93,91,55,63,60,56,43,"Accountant (with IT specialization), Tax Consu..."


In [3]:
df = df.sample(1000)

In [4]:
df.head()

Unnamed: 0,Course,Area of Interest,Accountancy,Business Studies,Economics,Mathematics,Statistics,Computer Science,English,Psychology,Sociology,Hindi,Logical Reasoning,Numerical Aptitude,Communication,Critical Thinking,Creativity,Empathy,Memory,Career Options
774,B.Com (General),Business & Finance,84,93,87,52,52,59,78,53,54,46,84,84,61,67,43,56,54,"Accountant, Tax Consultant, Banking Executive,..."
777,B.Com (Computer Applications),Business & IT,85,85,44,60,57,97,80,42,53,54,80,84,48,71,60,59,48,"Accountant (with IT specialization), Tax Consu..."
331,M.Com,Advanced Commerce & Research,91,60,82,60,96,59,50,44,48,47,54,50,74,93,50,45,83,"Professor / Lecturer, Financial Analyst, Inves..."
380,B.Com (Banking & Insurance),Banking & Insurance Services,97,89,92,74,99,50,55,53,48,47,63,91,62,44,45,40,60,"Bank Officer, Insurance Advisor, Financial Ris..."
876,B.Com (International Business),International Trade,80,100,81,56,41,47,89,57,68,41,94,55,98,69,53,49,54,"International Trade Analyst, Export-Import Man..."


In [5]:
# --- 3. Feature Preparation ---
features = df.select_dtypes(include=[np.number])
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

# Encode courses (classification target)
le_course = LabelEncoder()
y_course = le_course.fit_transform(df["Course"])

# Encode career options (multi-label for future use if needed)
mlb = MultiLabelBinarizer()
y_career = mlb.fit_transform(df["Career Options"].str.split(", "))

In [6]:

# --- 4. Supervised Model (For Evaluation) ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_course, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("\n--- Classification Report (Course Prediction) ---\n")
print(classification_report(y_test, y_pred, target_names=le_course.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))



--- Classification Report (Course Prediction) ---

                                 precision    recall  f1-score   support

    B.Com (Banking & Insurance)       1.00      1.00      1.00        14
  B.Com (Computer Applications)       0.89      1.00      0.94        16
             B.Com (E-Commerce)       1.00      0.86      0.92        14
                B.Com (General)       1.00      1.00      1.00        12
                   B.Com (Hons)       1.00      1.00      1.00         5
 B.Com (International Business)       1.00      1.00      1.00        21
B.Com (Professional Accounting)       1.00      1.00      1.00        12
               B.Com (Taxation)       1.00      1.00      1.00        11
                            BBA       1.00      1.00      1.00        16
                            BBM       1.00      1.00      1.00        10
                            BMS       1.00      1.00      1.00        14
                             CA       1.00      1.00      1.00         

In [7]:
# --- 5. Hybrid Recommendation System ---
def recommend_courses(user_profile, top_n=5):
    """
    Recommend Top-N courses for a student profile, with career options
    """
    # Scale user input
    user_scaled = scaler.transform([user_profile])
    
    # Cosine similarity with dataset
    sims = cosine_similarity(user_scaled, X_scaled)[0]
    df_temp = df.copy()
    df_temp["similarity"] = sims
    
    # Aggregate similarity per course
    course_scores = df_temp.groupby("Course")["similarity"].mean().reset_index()
    top_courses = course_scores.sort_values("similarity", ascending=False).head(top_n)
    
    # Collect career options
    recommendations = []
    for _, row in top_courses.iterrows():
        course = row["Course"]
        sim = row["similarity"]
        careers = df_temp[df_temp["Course"] == course]["Career Options"].unique()
        recommendations.append({
            "Course": course,
            "Similarity": round(sim, 3),
            "Career Options": ", ".join(careers)
        })
    
    return pd.DataFrame(recommendations)


In [8]:
# --- 6. Test Recommendation System ---
# Example student profile
sample_student = features.iloc[0].values
recommendations = recommend_courses(sample_student, top_n=5)

print("\n--- Top 5 Recommended Courses & Careers ---\n")
print(recommendations)


--- Top 5 Recommended Courses & Careers ---

                           Course  Similarity  \
0                 B.Com (General)       0.945   
1                B.Com (Taxation)       0.896   
2              B.Com (E-Commerce)       0.886   
3   B.Com (Computer Applications)       0.866   
4  B.Com (International Business)       0.862   

                                      Career Options  
0  Accountant, Tax Consultant, Banking Executive,...  
1  Tax Consultant, Tax Analyst, Corporate Tax Adv...  
2  E-Commerce Specialist, Digital Marketing Analy...  
3  Accountant (with IT specialization), Tax Consu...  
4  International Trade Analyst, Export-Import Man...  




In [9]:

# --- 7. Test on Multiple Random Students ---
for i in [10, 50, 100]:
    print(f"\n=== Test Student #{i} ===")
    recs = recommend_courses(features.iloc[i].values, top_n=5)
    print(recs)


=== Test Student #10 ===
                           Course  Similarity  \
0                             BBM       0.940   
1                              CS       0.769   
2  B.Com (International Business)       0.748   
3                             BBA       0.746   
4                 B.Com (General)       0.686   

                                      Career Options  
0  Business Manager, Marketing Executive, Financi...  
1  Company Secretary, Corporate Governance Adviso...  
2  International Trade Analyst, Export-Import Man...  
3  Business Development Executive, Marketing Mana...  
4  Accountant, Tax Consultant, Banking Executive,...  

=== Test Student #50 ===
                           Course  Similarity  \
0                             BBM       0.928   
1                              CS       0.820   
2  B.Com (International Business)       0.808   
3                             BBA       0.798   
4                             BMS       0.720   

                            



In [11]:
# --- 8. Save Necessary Elements for Streamlit ---
# Save scaler
with open("commerce_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Save trained classifier
with open("commerce_course_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

# Save LabelEncoder
with open("commerce_label_encoder.pkl", "wb") as f:
    pickle.dump(le_course, f)

# Save feature column names
with open("commerce_feature_columns.pkl", "wb") as f:
    pickle.dump(features.columns.tolist(), f)

# Save dataset (needed for similarity + careers)
with open("commerce_dataset.pkl", "wb") as f:
    pickle.dump(df, f)

print("✅ All necessary elements saved as pickle files")

✅ All necessary elements saved as pickle files
