In [6]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor

# Load model and relevant files
GB_MODEL_PATH = '../models/gb_model_best.joblib'
ML_READY_DATA_PATH = '../data/processed/ml_data6.csv'

# Load the processed data
df = pd.read_csv(ML_READY_DATA_PATH)

# Load the GradientBoosting model
gb_model = joblib.load(GB_MODEL_PATH)

# Get feature columns for the model (making sure to exclude 'universityRankingNum' and 'uniqueID')
feature_cols = ['ieltsMarks', 'toefl_ibt', 'minimumGPA', 'tuitionFeeUSD'] + \
    [col for col in df.columns if col.startswith('country_') or col.startswith('courseLevelSimplified_')]

# Check the number of features the model was trained on
print("Number of features in the model:", len(gb_model.feature_importances_))
print("Features used in the model during training:", feature_cols)

# Check the actual feature names the model expects
try:
    print("Model's expected feature names:", gb_model.feature_names_in_)
except AttributeError:
    print("Model does not have attribute 'feature_names_in_'. Likely, this model was not fitted with feature names.")

# Ensure that the number of features in feature_cols matches the number of model's expected features
if len(gb_model.feature_importances_) != len(feature_cols):
    print("Warning: Mismatch between the model's expected features and the feature list provided.")
    # Trim or adjust the feature list to match the model's actual number of features
    feature_cols = feature_cols[:len(gb_model.feature_importances_)]
    print(f"Adjusted feature list: {feature_cols}")

# Visualize feature importance
def plot_feature_importance(model, feature_names):
    # Get feature importances
    feature_importance = model.feature_importances_
    
    # Sort feature importance
    sorted_idx = np.argsort(feature_importance)[::-1]
    
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(np.array(feature_names)[sorted_idx], feature_importance[sorted_idx], align='center')
    plt.xlabel("Feature Importance")
    plt.ylabel("Feature")
    plt.title("Feature Importance - Gradient Boosting Model")
    plt.show()

# Call this function to visualize feature importance
plot_feature_importance(gb_model, feature_cols)

# Now let's also check the feature importance (sort them)
sorted_feature_importance = sorted(zip(gb_model.feature_importances_, feature_cols), reverse=True)
print("\nFeature Importance (sorted):")
for importance, feature in sorted_feature_importance:
    print(f"{feature}: {importance}")

Number of features in the model: 15
Features used in the model during training: ['ieltsMarks', 'toefl_ibt', 'minimumGPA', 'tuitionFeeUSD', 'country_AUS', 'country_CAD', 'country_NZ', 'country_UK', 'country_US', 'courseLevelSimplified_Postgraduate', 'courseLevelSimplified_Undergraduate']
Model does not have attribute 'feature_names_in_'. Likely, this model was not fitted with feature names.
Adjusted feature list: ['ieltsMarks', 'toefl_ibt', 'minimumGPA', 'tuitionFeeUSD', 'country_AUS', 'country_CAD', 'country_NZ', 'country_UK', 'country_US', 'courseLevelSimplified_Postgraduate', 'courseLevelSimplified_Undergraduate']


IndexError: index 11 is out of bounds for axis 0 with size 11

<Figure size 1000x600 with 0 Axes>