# In [None]:
# ============================================
# CAREER RECOMMENDATION SYSTEM – FINAL VERSION
# (LightGBM instead of Random Forest)
# ============================================

import warnings

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# --------------------------------------------
# 1. LOAD DATA
# --------------------------------------------
# The CSV path is relative to the current working directory.
df = pd.read_csv(
    "student_topicwise_performance_career_dataset.csv"
)

# --------------------------------------------
# 2. STUDENT ID (SAFE HANDLING)
# --------------------------------------------
# When the file carries no identifier column, synthesise a 1-based running
# number and place it as the first column.
has_student_id = "Student_ID" in df.columns
if not has_student_id:
    df.insert(loc=0, column="Student_ID", value=range(1, len(df) + 1))

# --------------------------------------------
# 3. ORDINAL ENCODING (SKILL LEVELS)
# --------------------------------------------
# Skill ratings are ordered categories, so they are encoded as integers that
# keep the Poor < Average < Good < Excellent ordering.
ordinal_map = {
    "Poor": 0,
    "Average": 1,
    "Good": 2,
    "Excellent": 3
}

# Every column other than the identifier and the target is treated as a skill.
skill_cols = df.columns.drop(
    ["Student_ID", "Predicted_Career_Position"]
)

for col in skill_cols:
    # A column that is already numeric has no labels to translate; mapping it
    # through ordinal_map would replace every value with NaN.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    encoded = df[col].map(ordinal_map)

    # Series.map yields NaN for any label missing from ordinal_map. Report
    # those labels instead of letting them vanish silently; values that were
    # already missing in the input are not reported.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col]
    if not unmapped.empty:
        warnings.warn(
            f"Column {col!r} has values outside ordinal_map that were "
            f"encoded as NaN: {sorted(unmapped.astype(str).unique())}"
        )

    df[col] = encoded

# --------------------------------------------
# 4. FEATURE ENGINEERING (DOMAIN SCORES)
# --------------------------------------------
# Each domain score is the row-wise mean of the encoded skill columns that
# belong to that domain.
domain_groups = {
    "Programming_Score": ["Python", "Java", "C++"],
    "Data_Score": ["SQL", "Machine_Learning", "Data_Analysis"],
    "Cloud_Score": ["AWS", "DevOps", "Linux"],
    "Web_Score": ["HTML_CSS_JS", "React", "Backend_Development"],
    "Security_Score": ["Cyber_Security", "Networking"]
}

for new_col, cols in domain_groups.items():
    # Only average the columns that actually exist in this dataset.
    valid_cols = [c for c in cols if c in df.columns]

    # With no matching columns the mean over an empty selection is NaN for
    # every row, which would add a useless all-NaN feature; skip the domain.
    if not valid_cols:
        continue

    df[new_col] = df[valid_cols].mean(axis=1)

# --------------------------------------------
# 5. LABEL ENCODE TARGET
# --------------------------------------------
# `le` is kept at module level so predictions can later be decoded back to
# career names with le.inverse_transform.
le = LabelEncoder()
encoded_target = le.fit_transform(df["Predicted_Career_Position"])
df["Predicted_Career_Position"] = encoded_target

# --------------------------------------------
# 6. FEATURE / TARGET SPLIT
# --------------------------------------------
# Features are everything except the identifier and the target itself.
X = df.drop(columns=["Student_ID", "Predicted_Career_Position"])
y = df["Predicted_Career_Position"]

# --------------------------------------------
# 7. TRAIN–TEST SPLIT
# --------------------------------------------
# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ============================================
# 8. LIGHTGBM – GRID SEARCH
# ============================================
# Candidate hyper-parameters: 2 values for each of 4 parameters, i.e. 16
# combinations, each scored with 3-fold cross-validation on accuracy.
lgbm_params = dict(
    n_estimators=[300, 500],
    max_depth=[8, 12],
    learning_rate=[0.05, 0.1],
    num_leaves=[31, 63],
)

lgbm = LGBMClassifier(objective="multiclass", random_state=42)

lgbm_grid = GridSearchCV(
    estimator=lgbm,
    param_grid=lgbm_params,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)

# fit() returns the search object itself, so the winning estimator can be
# read straight off the call.
best_lgbm = lgbm_grid.fit(X_train, y_train).best_estimator_

print("\nBest LightGBM Params:", lgbm_grid.best_params_)
print("LightGBM CV Accuracy:", lgbm_grid.best_score_)

# ============================================
# 9. XGBOOST – GRID SEARCH
# ============================================
# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost
# and current releases ignore it while warning about an unused parameter on
# every fit. The target is already integer-encoded by LabelEncoder above, so
# no internal label encoding is needed.
xgb = XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    random_state=42
)

# Candidate hyper-parameters: 2 values for each of 4 parameters, i.e. 16
# combinations.
xgb_params = {
    "n_estimators": [200, 300],
    "max_depth": [4, 6],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8, 1.0]
}

# 3-fold cross-validated search scored on accuracy, using all CPU cores.
xgb_grid = GridSearchCV(
    xgb,
    xgb_params,
    cv=3,
    scoring="accuracy",
    n_jobs=-1
)

xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_

print("\nBest XGBoost Params:", xgb_grid.best_params_)
print("XGBoost CV Accuracy:", xgb_grid.best_score_)

# ============================================
# 10. TEST SET EVALUATION
# ============================================
# Score both tuned models on the held-out test split.
lgbm_test_preds = best_lgbm.predict(X_test)
lgbm_test_acc = accuracy_score(y_test, lgbm_test_preds)

xgb_test_preds = best_xgb.predict(X_test)
xgb_test_acc = accuracy_score(y_test, xgb_test_preds)

print("\nTEST ACCURACY")
print("LightGBM:", lgbm_test_acc)
print("XGBoost:", xgb_test_acc)

# ============================================
# 11. TOP-3 ACCURACY (BEST METRIC)
# ============================================
# A test row counts as a hit when its true career is among the three classes
# to which the tuned XGBoost model assigns the highest probability.
probs = best_xgb.predict_proba(X_test)

# argsort orders ascending, so the last three positions of each row are the
# three most probable columns of `probs`.
top_3_cols = np.argsort(probs, axis=1)[:, -3:]

# Translate column positions into class labels through classes_ rather than
# assuming that column i always corresponds to label i.
top_3_preds = np.asarray(best_xgb.classes_)[top_3_cols]

# Compare every row's true label against its three candidates in one
# vectorised step instead of a Python-level loop.
true_labels = y_test.to_numpy()[:, None]
top_3_accuracy = np.mean((top_3_preds == true_labels).any(axis=1))

print("\nTop-3 Accuracy (XGBoost):", top_3_accuracy)

# ============================================
# 12. SAMPLE PREDICTION (DECODE RESULT)
# ============================================
# Predict for one test row and turn the encoded class back into the original
# career name using the fitted LabelEncoder.
sample_index = 0

# List-style indexing keeps the selection a one-row DataFrame rather than
# collapsing it to a Series.
sample_row = X_test.iloc[[sample_index]]

pred_encoded = best_xgb.predict(sample_row)
pred_label = le.inverse_transform(pred_encoded)

print("\nSample Career Recommendation:", pred_label[0])
