In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import numpy as np
import warnings

# Try to import SMOTE, if it fails, create alternative class
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("SMOTE not available. Using alternative oversampling method.")
    from sklearn.utils import resample
    
    class SMOTE:
        def __init__(self, random_state=42):
            self.random_state = random_state
        
        def fit_resample(self, X, y):
            df = pd.concat([pd.DataFrame(X), pd.Series(y, name='target')], axis=1)
            classes = df['target'].value_counts()
            max_count = classes.max()
            
            oversampled = []
            for class_label in classes.index:
                class_df = df[df['target'] == class_label]
                if len(class_df) < max_count:
                    oversampled_class = resample(class_df, replace=True, n_samples=max_count, random_state=self.random_state)
                    oversampled.append(oversampled_class)
                else:
                    oversampled.append(class_df)
            
            result_df = pd.concat(oversampled).sample(frac=1, random_state=self.random_state)
            return result_df.drop('target', axis=1).values, result_df['target'].values

# Try to import XGBoost
try:
    from xgboost import XGBClassifier
except ImportError:
    print("XGBoost not available. Install with: pip install xgboost")

warnings.filterwarnings("ignore")

# Your machine learning code can start here...

SMOTE not available. Using alternative oversampling method.
XGBoost not available. Install with: pip install xgboost


In [43]:
# Load and preprocess the dataset
df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\CareerPath_Recommender-main\Jupiter file & dataset\student-scores.csv")

In [44]:

# Drop unnecessary columns
df.drop(columns=['id', 'first_name', 'last_name', 'email'], axis=1, inplace=True)

In [45]:

# Calculate total and average scores
df["total_score"] = (
    df["math_score"] + df["history_score"] + df["physics_score"] +
    df["chemistry_score"] + df["biology_score"] + df["english_score"] + df["geography_score"]
)
df["average_score"] = df["total_score"] / 7

In [46]:
# Map categorical variables
gender_map = {'male': 0, 'female': 1}
part_time_job_map = {False: 0, True: 1}
extracurricular_activities_map = {False: 0, True: 1}
career_aspiration_map = {
    'Lawyer': 0, 'Doctor': 1, 'Government Officer': 2, 'Artist': 3, 'Unknown': 4,
    'Software Engineer': 5, 'Teacher': 6, 'Business Owner': 7, 'Scientist': 8,
    'Banker': 9, 'Writer': 10, 'Accountant': 11, 'Designer': 12,
    'Construction Engineer': 13, 'Game Developer': 14, 'Stock Investor': 15,
    'Real Estate Developer': 16
}

df['gender'] = df['gender'].map(gender_map)
df['part_time_job'] = df['part_time_job'].map(part_time_job_map)
df['extracurricular_activities'] = df['extracurricular_activities'].map(extracurricular_activities_map)
df['career_aspiration'] = df['career_aspiration'].map(career_aspiration_map)

In [47]:
X = df.drop('career_aspiration', axis=1)
y = df['career_aspiration']
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [48]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [49]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [50]:

try:
    from xgboost import XGBClassifier
    models = {
        "Logistic Regression": LogisticRegression(),
        "Support Vector Classifier": SVC(),
        "Random Forest Classifier": RandomForestClassifier(),
        "K Nearest Neighbors": KNeighborsClassifier(),
        "Decision Tree Classifier": DecisionTreeClassifier(),
        "Gaussian Naive Bayes": GaussianNB(),
        "AdaBoost Classifier": AdaBoostClassifier(),
        "Gradient Boosting Classifier": GradientBoostingClassifier(),
        "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    }
except ImportError:
    print("XGBoost not available. Proceeding without it.")
    models = {
        "Logistic Regression": LogisticRegression(),
        "Support Vector Classifier": SVC(),
        "Random Forest Classifier": RandomForestClassifier(),
        "K Nearest Neighbors": KNeighborsClassifier(),
        "Decision Tree Classifier": DecisionTreeClassifier(),
        "Gaussian Naive Bayes": GaussianNB(),
        "AdaBoost Classifier": AdaBoostClassifier(),
        "Gradient Boosting Classifier": GradientBoostingClassifier()
    }

# Train and evaluate models
for name, model in models.items():
    print("=" * 50)
    print("Model:", name)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

XGBoost not available. Proceeding without it.
Model: Logistic Regression
Accuracy: 0.41456582633053224
Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.40      0.43        75
           1       0.40      0.63      0.49        57
           2       0.41      0.26      0.32        70
           3       0.49      0.57      0.53        56
           4       0.25      0.15      0.19        67
           5       0.28      0.18      0.21        74
           6       0.49      0.66      0.57        62
           7       0.76      0.73      0.75        71
           8       0.51      0.56      0.53        57
           9       0.18      0.09      0.12        68
          10       0.44      0.52      0.47        64
          11       0.26      0.40      0.32        48
          12       0.18      0.13      0.15        68
          13       0.36      0.58      0.45        60
          14       0.57      0.89      0.70        61
        

In [51]:
# Random Forest Model - detailed evaluation
print("\n=== Random Forest Classifier Evaluation ===")
rf_model = RandomForestClassifier()
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


=== Random Forest Classifier Evaluation ===
Accuracy: 0.9299719887955182
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89        75
           1       0.90      1.00      0.95        57
           2       0.96      1.00      0.98        70
           3       0.97      1.00      0.98        56
           4       0.97      0.87      0.91        67
           5       0.70      0.50      0.58        74
           6       0.95      1.00      0.98        62
           7       0.96      0.96      0.96        71
           8       0.97      1.00      0.98        57
           9       0.84      0.85      0.85        68
          10       0.98      1.00      0.99        64
          11       0.96      0.94      0.95        48
          12       0.99      0.99      0.99        68
          13       0.90      1.00      0.94        60
          14       0.97      1.00      0.98        61
          15       0.94      0.97    

In [52]:
# Save the model and scaler
pickle.dump(scaler, open("scaler.pkl", 'wb'))
pickle.dump(rf_model, open("model.pkl", 'wb'))

In [53]:
# Recommendation system
class_names = [
    'Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown', 'Software Engineer',
    'Teacher', 'Business Owner', 'Scientist', 'Banker', 'Writer', 'Accountant', 'Designer',
    'Construction Engineer', 'Game Developer', 'Stock Investor', 'Real Estate Developer'
]

def Recommendations(gender, part_time_job, absence_days, extracurricular_activities,
                    weekly_self_study_hours, math_score, history_score, physics_score,
                    chemistry_score, biology_score, english_score, geography_score,
                    total_score, average_score):
    # Encode input
    gender_encoded = 1 if gender.lower() == 'female' else 0
    part_time_job_encoded = 1 if part_time_job else 0
    extracurricular_activities_encoded = 1 if extracurricular_activities else 0

    # Feature array
    feature_array = np.array([[gender_encoded, part_time_job_encoded, absence_days,
                               extracurricular_activities_encoded, weekly_self_study_hours,
                               math_score, history_score, physics_score, chemistry_score,
                               biology_score, english_score, geography_score, total_score, average_score]])

    # Scale and predict
    scaled_features = scaler.transform(feature_array)
    probabilities = rf_model.predict_proba(scaled_features)

    # Get top predictions
    top_classes_idx = np.argsort(-probabilities[0])[:5]
    return [(class_names[idx], probabilities[0][idx]) for idx in top_classes_idx]
