In [1]:
# --- 1. Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

In [None]:

# --- 2. Load Dataset ---
df = pd.read_csv("arts_courses_with_careers.csv")

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()

Dataset Shape: (1000, 22)
Columns: ['Course', 'Area of Interest', 'History', 'Political Science', 'Sociology', 'Psychology', 'Economics', 'Philosophy', 'Geography', 'English', 'Hindi', 'Sanskrit', 'Fine Arts', 'PE', 'Logical Reasoning', 'Memory', 'Communication', 'Empathy', 'Creativity', 'Critical Thinking', 'Final Score', 'Career Options']


Unnamed: 0,Course,Area of Interest,History,Political Science,Sociology,Psychology,Economics,Philosophy,Geography,English,...,Fine Arts,PE,Logical Reasoning,Memory,Communication,Empathy,Creativity,Critical Thinking,Final Score,Career Options
0,BA History,Historical Research,86,99,74,50,47,60,46,78,...,60,43,47,82,80,41,51,85,66.38,"Historian, Archivist, Museum Curator, History ..."
1,BA History,Historical Research,100,80,71,51,56,49,55,74,...,51,59,42,84,78,46,60,88,68.58,"Historian, Archivist, Museum Curator, History ..."
2,BA History,Historical Research,97,83,73,57,48,60,41,79,...,51,47,54,82,73,56,43,97,68.11,"Historian, Archivist, Museum Curator, History ..."
3,BA History,Historical Research,83,81,65,49,43,57,51,61,...,53,55,54,87,73,47,60,95,67.56,"Historian, Archivist, Museum Curator, History ..."
4,BA History,Historical Research,97,94,80,52,48,54,52,60,...,40,51,47,90,78,56,47,82,66.27,"Historian, Archivist, Museum Curator, History ..."


In [3]:
# --- 3. Feature Preparation ---
# Select numerical features (subjects + aptitudes)
features = df.select_dtypes(include=[np.number])
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

In [4]:
# --- 4. Supervised Model (For Evaluation) ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_course, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

NameError: name 'y_course' is not defined

In [None]:
y_pred = clf.predict(X_test)
print("\n--- Classification Report (Course Prediction) ---\n")
print(classification_report(y_test, y_pred, target_names=le_course.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))


--- Classification Report (Course Prediction) ---

                                    precision    recall  f1-score   support

                   BA Anthropology       1.00      1.00      1.00        13
                    BA Archaeology       1.00      1.00      1.00        11
                      BA Economics       1.00      1.00      1.00        11
                      BA Education       1.00      1.00      1.00         8
                        BA English       1.00      1.00      1.00         9
                      BA Fine Arts       1.00      1.00      1.00        12
                      BA Geography       1.00      1.00      1.00        12
                          BA Hindi       1.00      1.00      1.00         8
                        BA History       1.00      1.00      1.00         6
        BA International Relations       1.00      1.00      1.00        14
BA Journalism & Mass Communication       1.00      1.00      1.00        10
        BA Law (5-Year Integrated) 

In [None]:
# Encode Career Options (multi-label)
mlb = MultiLabelBinarizer()
y_career = mlb.fit_transform(df["Career Options"].str.split(", "))

In [None]:
# --- Step 1: Train Course Classifier ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_course, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le_course.classes_))

                                    precision    recall  f1-score   support

                   BA Anthropology       1.00      1.00      1.00        13
                    BA Archaeology       1.00      1.00      1.00        11
                      BA Economics       1.00      1.00      1.00        11
                      BA Education       1.00      1.00      1.00         8
                        BA English       1.00      1.00      1.00         9
                      BA Fine Arts       1.00      1.00      1.00        12
                      BA Geography       1.00      1.00      1.00        12
                          BA Hindi       1.00      1.00      1.00         8
                        BA History       1.00      1.00      1.00         6
        BA International Relations       1.00      1.00      1.00        14
BA Journalism & Mass Communication       1.00      1.00      1.00        10
        BA Law (5-Year Integrated)       1.00      0.90      0.95        10
           

In [None]:
# --- 5. Hybrid Recommendation System ---
def recommend_courses(user_profile, top_n=5):
    """
    Recommend Top-N courses for a student profile, with career options
    """
    # Scale user input
    user_scaled = scaler.transform([user_profile])
    
    # Step 1: Cosine similarity with all dataset profiles
    sims = cosine_similarity(user_scaled, X_scaled)[0]
    df["similarity"] = sims
    
    # Step 2: Aggregate similarity per course
    course_scores = df.groupby("Course")["similarity"].mean().reset_index()
    top_courses = course_scores.sort_values("similarity", ascending=False).head(top_n)
    
    # Step 3: Collect career options for each course
    recommendations = []
    for _, row in top_courses.iterrows():
        course = row["Course"]
        sim = row["similarity"]
        careers = df[df["Course"] == course]["Career Options"].unique()
        recommendations.append({
            "Course": course,
            "Similarity": round(sim, 3),
            "Career Options": ", ".join(careers)
        })
    
    return pd.DataFrame(recommendations)

In [None]:

# --- 6. Test Recommendation System ---
# Example student profile (replace with real input later)
sample_student = features.iloc[0].values
recommendations = recommend_courses(sample_student, top_n=5)

print("\n--- Top 5 Recommended Courses & Careers ---\n")
print(recommendations)


--- Top 5 Recommended Courses & Careers ---

                       Course  Similarity  \
0                  BA History       0.942   
1        BA Political Science       0.848   
2  BA Law (5-Year Integrated)       0.833   
3    BA Public Administration       0.824   
4  BA International Relations       0.814   

                                      Career Options  
0  Historian, Archivist, Museum Curator, History ...  
1  Policy Analyst, Political Consultant, Civil Se...  
2  Lawyer / Advocate, Corporate Legal Advisor, Ju...  
3  Civil Services Officer, Policy Analyst, Urban ...  
4  Diplomat / Foreign Service Officer, Internatio...  




In [None]:
# --- 7. Test on Multiple Random Students ---
for i in [10, 50, 100]:
    print(f"\n=== Test Student #{i} ===")
    recs = recommend_courses(features.iloc[i].values, top_n=5)
    print(recs)


=== Test Student #10 ===
                       Course  Similarity  \
0                  BA History       0.935   
1              BA Archaeology       0.838   
2             BA Anthropology       0.823   
3        BA Political Science       0.801   
4  BA Law (5-Year Integrated)       0.780   

                                      Career Options  
0  Historian, Archivist, Museum Curator, History ...  
1  Archaeologist, Museum Curator, Heritage Conser...  
2  Anthropologist, Archaeologist, Museum Research...  
3  Policy Analyst, Political Consultant, Civil Se...  
4  Lawyer / Advocate, Corporate Legal Advisor, Ju...  

=== Test Student #50 ===
                       Course  Similarity  \
0        BA Political Science       0.932   
1  BA Law (5-Year Integrated)       0.923   
2    BA Public Administration       0.901   
3  BA International Relations       0.885   
4                  BA History       0.804   

                                      Career Options  
0  Policy Analyst, Po



In [None]:
import pickle

# Save scaler
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Save RandomForest model (optional for classification backup)
with open("course_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

# Save LabelEncoder for courses
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le_course, f)

# Save feature column names
with open("feature_columns.pkl", "wb") as f:
    pickle.dump(features.columns.tolist(), f)

# Save full dataset (needed for similarity-based career mapping)
with open("dataset.pkl", "wb") as f:
    pickle.dump(df, f)

print("✅ All necessary elements saved as pickle files")


✅ All necessary elements saved as pickle files
