# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import ipywidgets as widgets
from IPython.display import display, clear_output

# Load the dataset
df = pd.read_csv('engineering_data_improved.csv')
print(f"Data loaded successfully. Shape: {df.shape}")

# Define feature lists based on data generation logic
numerical_features = [
    '10th_percent', '12th_percent', 'jee_rank', 'experience', 'experience_field',
    'num_projects', 'expertise_level', 'num_internships', 'soft_skill_rating',
    'aptitude_rating', 'dsa_level', 'num_hackathons', 'competitive_coding_solved',
    'num_repos', 'github_activities', 'linkedin_posts', 'num_certifications', 'cgpa'
]

# The salary model additionally receives 'college_tier' as a categorical input.
categorical_features_college = ['gender', 'engineering_field', 'domain', 'referral']
categorical_features_salary = ['gender', 'engineering_field', 'college_tier', 'domain', 'referral']

# Define college tier hierarchy and encode the target.
# A tier's position in this list becomes its integer code
# (0 = 'Private College' ... 4 = 'Top IIT').
college_hierarchy = ['Private College', 'State College', 'NIT', 'Mid IIT', 'Top IIT']
college_encoder = OrdinalEncoder(categories=[college_hierarchy])
# fit_transform returns a 2-D float result; flatten it and cast to int so the
# column holds plain integer class codes usable as classifier labels and as
# indices into college_hierarchy.
df['college_tier_encoded'] = (
    np.asarray(college_encoder.fit_transform(df[['college_tier']])).ravel().astype(int)
)

# Split features and targets
# For college tier: exclude 'college_tier', its encoded copy and 'salary' from
# the features. Dropping 'college_tier_encoded' keeps the target out of the
# feature frame regardless of how the downstream transformer selects columns.
X_college = df.drop(['college_tier', 'college_tier_encoded', 'salary'], axis=1)
y_college = df['college_tier_encoded']

# For salary: exclude 'salary' and the helper column 'college_tier_encoded';
# the raw 'college_tier' stays in as a feature.
X_salary = df.drop(['salary', 'college_tier_encoded'], axis=1)
y_salary = df['salary']

# Train-test split (80% training, 20% testing) with consistent random_state
X_train_college, X_test_college, y_train_college, y_test_college = train_test_split(
    X_college, y_college, test_size=0.2, random_state=42
)
X_train_salary, X_test_salary, y_train_salary, y_test_salary = train_test_split(
    X_salary, y_salary, test_size=0.2, random_state=42
)
print("Data preprocessed and split into training and test sets.")

# Side-by-side count plots of the encoded college tier in the training split
# (left panel) and the test split (right panel).
plt.figure(figsize=(12, 5))
_distribution_panels = (
    (1, y_train_college, "Training Set - College Tier Distribution"),
    (2, y_test_college, "Test Set - College Tier Distribution"),
)
for _panel_index, _tier_codes, _panel_title in _distribution_panels:
    plt.subplot(1, 2, _panel_index)
    sns.countplot(x=_tier_codes.astype(int), palette="magma")
    plt.title(_panel_title)
    plt.xlabel("Encoded College Tier")
plt.show()

# Preprocessors: standardise the numerical columns and one-hot encode the
# categorical ones. Categories unseen during fitting are ignored at transform
# time (handle_unknown='ignore').
def _make_preprocessor(categorical_columns):
    """Return a fresh ColumnTransformer for the given categorical columns."""
    scaling_step = ('num', StandardScaler(), numerical_features)
    encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    return ColumnTransformer([scaling_step, encoding_step])


preprocessor_college = _make_preprocessor(categorical_features_college)

preprocessor_salary = _make_preprocessor(categorical_features_salary)

# Pipeline for College Admission Prediction (Classification):
# preprocessing followed by a multi-class XGBoost classifier.
college_pipeline = Pipeline([
    ('preprocessor', preprocessor_college),
    ('model', XGBClassifier(
        objective='multi:softmax',
        eval_metric='mlogloss',
        # One class per entry in the tier hierarchy, so the count stays in
        # sync if the hierarchy list is ever edited.
        num_class=len(college_hierarchy),
        random_state=42
    ))
])

# Pipeline for Salary Prediction (Regression):
# preprocessing followed by an XGBoost regressor with squared-error loss.
salary_pipeline = Pipeline([
    ('preprocessor', preprocessor_salary),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42))
])

# Hyperparameter search spaces. Both models share every dimension except the
# candidate number of boosting rounds; the 'model__' prefix routes each
# parameter to the pipeline step named 'model'.
def _xgb_search_space(tree_counts):
    """Return a grid with the given n_estimators candidates plus the shared dimensions."""
    return {
        'model__n_estimators': list(tree_counts),
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__max_depth': [3, 5, 7],
        'model__subsample': [0.8, 1.0],
        'model__colsample_bytree': [0.8, 1.0],
    }


college_params = _xgb_search_space([200, 500, 800])

salary_params = _xgb_search_space([500, 800, 1200])

print("Pipelines and hyperparameter grids set up.")

# Exhaustive 5-fold grid search for the college-tier classifier, selecting the
# configuration with the best cross-validated accuracy.
print("Training College Prediction Model...")
college_grid = GridSearchCV(
    estimator=college_pipeline,
    param_grid=college_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
college_grid.fit(X_train_college, y_train_college)
best_college_model = college_grid.best_estimator_
print("Best parameters for College Prediction:", college_grid.best_params_)

# Exhaustive 5-fold grid search for the salary regressor, selecting the
# configuration with the best cross-validated R².
print("\nTraining Salary Prediction Model...")
salary_grid = GridSearchCV(
    estimator=salary_pipeline,
    param_grid=salary_params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
salary_grid.fit(X_train_salary, y_train_salary)
best_salary_model = salary_grid.best_estimator_
print("Best parameters for Salary Prediction:", salary_grid.best_params_)

# Evaluation function
def _evaluate_college_model():
    """Print test-set classification metrics and plot the confusion matrix."""
    # Explicit label list: keeps the report and the confusion matrix aligned
    # with college_hierarchy even when a tier is absent from the test split
    # (without it, classification_report raises on the name/class mismatch).
    tier_codes = list(range(len(college_hierarchy)))

    college_preds = best_college_model.predict(X_test_college)
    acc = accuracy_score(y_test_college, college_preds)
    # Tiers are ordinal codes, so the mean absolute difference measures how
    # many tiers off the predictions are on average.
    mae_class = mean_absolute_error(y_test_college, college_preds)
    print("\nCollege Prediction Metrics:")
    print("Best Parameters:", college_grid.best_params_)
    print(f"Accuracy: {acc:.2%}")
    print(f"Mean Absolute Error (Ordinal): {mae_class:.2f}")
    print("\nClassification Report:")
    print(classification_report(
        y_test_college,
        college_preds,
        labels=tier_codes,
        target_names=college_hierarchy,
        zero_division=0,  # report 0 for tiers with no samples instead of warning
    ))

    # Confusion Matrix Plot (rows = actual tier, columns = predicted tier)
    plt.figure(figsize=(10, 8))
    cm = confusion_matrix(y_test_college, college_preds, labels=tier_codes)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=college_hierarchy, yticklabels=college_hierarchy)
    plt.title('College Tier Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=45)
    plt.show()


def _evaluate_salary_model():
    """Print test-set regression metrics and plot actual vs predicted salary."""
    salary_preds = best_salary_model.predict(X_test_salary)
    mae = mean_absolute_error(y_test_salary, salary_preds)
    rmse = np.sqrt(mean_squared_error(y_test_salary, salary_preds))
    r2 = r2_score(y_test_salary, salary_preds)

    print("\nSalary Prediction Metrics:")
    print("Best Parameters:", salary_grid.best_params_)
    print(f"MAE: ₹{mae:,.0f}")
    print(f"RMSE: ₹{rmse:,.0f}")
    print(f"R² Score: {r2:.2%}")

    # Scatter Plot: Actual vs Predicted Salary; the dashed red diagonal marks
    # where predictions would equal the actual values.
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test_salary, salary_preds, alpha=0.3)
    salary_range = [y_test_salary.min(), y_test_salary.max()]
    plt.plot(salary_range, salary_range, 'r--')
    plt.xlabel('Actual Salary')
    plt.ylabel('Predicted Salary')
    plt.title('Actual vs Predicted Salary')
    plt.show()


def _plot_feature_importances():
    """Plot sorted feature importances of both fitted models side by side."""
    fig, ax = plt.subplots(1, 2, figsize=(18, 6))
    panels = (
        (best_college_model, 'College Prediction Feature Importance'),
        (best_salary_model, 'Salary Prediction Feature Importance'),
    )
    for axis, (fitted_pipeline, title) in zip(ax, panels):
        importances = fitted_pipeline.named_steps['model'].feature_importances_
        # Names of the transformed columns, in the order the model received them.
        feature_names = fitted_pipeline.named_steps['preprocessor'].get_feature_names_out()
        pd.Series(importances, index=feature_names).sort_values().plot.barh(ax=axis)
        axis.set_title(title)

    plt.tight_layout()
    plt.show()


def evaluate_models():
    """Evaluate both tuned models on their held-out test sets.

    Prints accuracy, ordinal MAE and a classification report for the college
    model and MAE, RMSE and R² for the salary model, then shows the confusion
    matrix, the actual-vs-predicted scatter plot and the feature-importance
    charts. Reads the module-level fitted models, grid-search objects and
    test splits; returns None.
    """
    _evaluate_college_model()
    _evaluate_salary_model()
    _plot_feature_importances()

# Run evaluation: prints the test-set metrics for both models and shows the plots
evaluate_models()

# Interactive prediction interface
def predict_admission_interactive():
    """Display an ipywidgets form that predicts college tier and salary.

    One input widget is created per model feature, plus a college-tier
    selector that is fed only to the salary model. Clicking "Predict" builds a
    single-row DataFrame for each model from the current widget values and
    prints both predictions into the form's output area.
    """

    def pick(choices, default, label):
        # Dropdown with the given options, initial value and caption.
        return widgets.Dropdown(options=choices, value=default, description=label)

    def counter(upper, default, label):
        # Dropdown over the integers 0..upper inclusive.
        return pick(list(range(0, upper + 1)), default, label)

    def rating(label):
        # Dropdown over a 1-5 scale, starting at 3.
        return pick([1, 2, 3, 4, 5], 3, label)

    # Feature name -> input widget. Insertion order is the column order of the
    # DataFrame handed to the models.
    controls = {
        '10th_percent': widgets.FloatText(value=80.0, description='10th %:'),
        '12th_percent': widgets.FloatText(value=80.0, description='12th %:'),
        'jee_rank': widgets.IntText(value=1000, description='JEE Rank:'),
        'experience': widgets.IntText(value=2, description='Work Exp (yrs):'),
        'experience_field': counter(20, 2, 'Field Exp:'),
        'num_projects': counter(20, 1, 'Projects:'),
        'expertise_level': rating('Expertise:'),
        'num_internships': counter(10, 0, 'Internships:'),
        'soft_skill_rating': rating('Soft Skills:'),
        'aptitude_rating': rating('Aptitude:'),
        'dsa_level': rating('DSA Level:'),
        'num_hackathons': counter(10, 0, 'Hackathons:'),
        'competitive_coding_solved': pick(list(range(0, 201, 10)), 50, 'Coding Qs:'),
        'num_repos': counter(50, 5, 'Repos:'),
        'github_activities': counter(50, 10, 'GitHub Acts:'),
        'linkedin_posts': counter(50, 3, 'LinkedIn Posts:'),
        'num_certifications': counter(20, 0, 'Certifications:'),
        'cgpa': pick([round(step, 1) for step in np.arange(1, 10.1, 0.5)], 7.0, 'CGPA:'),
        'gender': pick(['Male', 'Female', 'Other'], 'Male', 'Gender:'),
        'engineering_field': pick(
            ['Computer Science', 'Mechanical', 'Electrical', 'Civil', 'Electronics'],
            'Computer Science',
            'Eng Field:',
        ),
        'domain': pick(
            ['Full Stack', 'Machine Learning', 'Android Development', 'Other'],
            'Full Stack',
            'Domain:',
        ),
        'referral': pick(['Yes', 'No'], 'No', 'Referral:'),
    }
    # Extra input used only by the salary model.
    tier_control = pick(college_hierarchy, 'NIT', 'College Tier:')

    predict_button = widgets.Button(description="Predict")
    result_area = widgets.Output()

    def on_submit(_button):
        with result_area:
            clear_output()
            # Snapshot the current widget values, keyed by feature name.
            college_record = {feature: control.value for feature, control in controls.items()}
            salary_record = {**college_record, 'college_tier': tier_control.value}

            college_frame = pd.DataFrame([college_record])
            salary_frame = pd.DataFrame([salary_record])

            # The classifier returns an ordinal code; map it back to the tier name.
            tier_code = best_college_model.predict(college_frame)[0]
            tier_name = college_hierarchy[int(tier_code)]

            salary_estimate = best_salary_model.predict(salary_frame)[0]

            print("\nPrediction Results:")
            print(f"Predicted College Tier: {tier_name}")
            print(f"Estimated Annual Salary: ₹{salary_estimate:,.2f}")

    predict_button.on_click(on_submit)

    # Each inner list is one horizontal row of the form.
    feature_rows = [
        ['10th_percent', '12th_percent', 'jee_rank'],
        ['experience', 'experience_field'],
        ['num_projects', 'expertise_level', 'num_internships'],
        ['soft_skill_rating', 'aptitude_rating', 'dsa_level'],
        ['num_hackathons', 'competitive_coding_solved'],
        ['num_repos', 'github_activities', 'linkedin_posts'],
        ['num_certifications', 'cgpa'],
        ['gender', 'engineering_field', 'domain'],
    ]
    form_children = [
        widgets.HBox([controls[feature] for feature in row]) for row in feature_rows
    ]
    form_children.append(widgets.HBox([controls['referral'], tier_control]))
    form_children.extend([predict_button, result_area])

    display(widgets.VBox(form_children))

# Run the interactive prediction interface (builds the input form and displays it)
predict_admission_interactive()

# Performance Metrics Check Block: headline test-set score for each model.
college_preds = best_college_model.predict(X_test_college)
college_accuracy = accuracy_score(y_test_college, college_preds)
print(f"College Prediction Accuracy: {college_accuracy:.2%}")

salary_preds = best_salary_model.predict(X_test_salary)
salary_r2 = r2_score(y_test_salary, salary_preds)
print(f"Salary Prediction R² Score: {salary_r2:.2%}")

# Save the models: each fitted pipeline (preprocessing + estimator) is
# persisted to its own file with joblib.
for _fitted_pipeline, _artifact_path in (
    (best_college_model, 'college_predictor.pkl'),
    (best_salary_model, 'salary_predictor.pkl'),
):
    joblib.dump(_fitted_pipeline, _artifact_path)
print("\nModels saved as 'college_predictor.pkl' and 'salary_predictor.pkl'.")