In [1]:
# Libraries for data handling, visualization, modeling, and widgets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display, clear_output

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, mean_absolute_error, r2_score,
                             confusion_matrix, classification_report,
                             mean_squared_error)
from xgboost import XGBClassifier, XGBRegressor
import joblib


In [2]:
# Load CSV and check shape
df = pd.read_csv('engineering_data_IMPRV.csv')

# Select features and targets
numerical_features = [...]
categorical_features = ['gender', 'domain', 'referral']
target_col_college = 'college_tier'
target_col_salary = 'salary'

# Keep only the required columns
required_columns = numerical_features + categorical_features + [target_col_college, target_col_salary]
df = df[required_columns]
print("✅ Data loaded successfully. Shape:", df.shape)


KeyError: '[Ellipsis] not in index'

In [None]:
# Distribution plots for numerical
df[numerical_features].hist(figsize=(18, 12), bins=20)
plt.suptitle("Numerical Features Distribution")
plt.show()

# Count plots for categorical
for col in categorical_features + [target_col_college]:
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=col, palette="viridis")
    plt.title(f"Count Plot for {col}")
    plt.xticks(rotation=45)
    plt.show()

# Correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(df[numerical_features].corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix for Numerical Features")
plt.show()


In [None]:
# Encode target
college_hierarchy = ['Private College', 'State College', 'NIT', 'Mid IIT', 'Top IIT']
college_encoder = OrdinalEncoder(categories=[college_hierarchy])
df['college_tier_encoded'] = college_encoder.fit_transform(df[[target_col_college]])

# Define input and output
X = df[numerical_features + categorical_features]
y_college = df['college_tier_encoded']
y_salary = df[target_col_salary]

# Split
X_train, X_test, y_college_train, y_college_test, y_salary_train, y_salary_test = train_test_split(
    X, y_college, y_salary, test_size=0.2, random_state=42
)


In [None]:
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.countplot(x=y_college_train.astype(int), palette="magma")
plt.title("Training Set - College Tier Distribution")
plt.xlabel("Encoded College Tier")
plt.subplot(1, 2, 2)
sns.countplot(x=y_college_test.astype(int), palette="magma")
plt.title("Test Set - College Tier Distribution")
plt.xlabel("Encoded College Tier")
plt.show()


In [None]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

college_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(objective='multi:softmax', eval_metric='mlogloss', use_label_encoder=False))
])

salary_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42))
])


In [None]:
# Set hyperparameters
college_params = {
    'model__n_estimators': [500, 1000],
    'model__learning_rate': [0.05, 0.1],
    'model__max_depth': [5, 7]
}
salary_params = {
    'model__n_estimators': [800, 1200],
    'model__learning_rate': [0.05, 0.1],
    'model__max_depth': [3, 5]
}

# Train models
print("Training College Prediction Model...")
college_grid = GridSearchCV(college_pipeline, college_params, cv=5, scoring='accuracy', n_jobs=-1)
college_grid.fit(X_train, y_college_train)
best_college_model = college_grid.best_estimator_

print("\nTraining Salary Prediction Model...")
salary_grid = GridSearchCV(salary_pipeline, salary_params, cv=5, scoring='r2', n_jobs=-1)
salary_grid.fit(X_train, y_salary_train)
best_salary_model = salary_grid.best_estimator_


In [None]:
def evaluate_models():
    college_preds = best_college_model.predict(X_test)
    acc = accuracy_score(y_college_test, college_preds)
    print("\nCollege Prediction Accuracy:", f"{acc:.2%}")
    print(classification_report(y_college_test, college_preds, target_names=college_hierarchy))

    plt.figure(figsize=(10, 8))
    sns.heatmap(confusion_matrix(y_college_test, college_preds), annot=True, fmt='d', cmap='Blues',
                xticklabels=college_hierarchy, yticklabels=college_hierarchy)
    plt.title('College Tier Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    salary_preds = best_salary_model.predict(X_test)
    print("\nSalary Prediction Metrics:")
    print("MAE: ₹{:,.0f}".format(mean_absolute_error(y_salary_test, salary_preds)))
    print("RMSE: ₹{:,.0f}".format(np.sqrt(mean_squared_error(y_salary_test, salary_preds))))
    print("R² Score: {:.2%}".format(r2_score(y_salary_test, salary_preds)))

    plt.figure(figsize=(10, 6))
    plt.scatter(y_salary_test, salary_preds, alpha=0.3)
    plt.plot([y_salary_test.min(), y_salary_test.max()],
             [y_salary_test.min(), y_salary_test.max()], 'r--')
    plt.xlabel('Actual Salary')
    plt.ylabel('Predicted Salary')
    plt.title('Actual vs Predicted Salary')
    plt.show()

    # Feature importance
    fig, ax = plt.subplots(1, 2, figsize=(18, 6))
    features = best_college_model.named_steps['preprocessor'].get_feature_names_out()
    pd.Series(best_college_model.named_steps['model'].feature_importances_, index=features).sort_values().plot.barh(ax=ax[0])
    ax[0].set_title('College Prediction Feature Importance')

    pd.Series(best_salary_model.named_steps['model'].feature_importances_, index=features).sort_values().plot.barh(ax=ax[1])
    ax[1].set_title('Salary Prediction Feature Importance')
    plt.tight_layout()
    plt.show()

# Call evaluation
evaluate_models()


In [None]:
def predict_admission_interactive():
    # widgets defined...
    # on_submit logic...
    # display UI

predict_admission_interactive()


In [None]:
def predict_admission_interactive():
    # widgets defined...
    # on_submit logic...
    # display UI

predict_admission_interactive()
