In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import plotly.express as px

# Load the cleaned train/test CSVs (paths are relative to the working directory).
train_data = pd.read_csv('data_final/train_df_cleaned_full.csv')
# NOTE(review): test_data is loaded but not used anywhere in this cell.
test_data = pd.read_csv('data_final/test_df_cleaned_full.csv')

# Split the training frame into features (X) and the regression target (y).
# NOTE(review): the importance output lists columns such as `dropoff_ts` among
# the features; if those are only known once a trip has ended they would leak
# the target -- confirm before trusting the importances.
X = train_data.drop('trip_duration', axis=1)
y = train_data['trip_duration']

# Label-encode every object-dtype column.
# One encoder is fitted PER COLUMN and kept in `label_encoders`, so each
# column's string -> integer mapping can later be inverted or re-applied
# (e.g. to test_data). Reusing a single encoder would overwrite the mapping
# on every iteration and keep only the last column's classes.
label_encoder = LabelEncoder()  # stays defined even if there are no object columns
label_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

# Hold out 20% of the rows for validation; seeded for a reproducible split.
# NOTE(review): X_val / y_val are not used in this cell.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Configure the gradient-boosted regressor. `random_state` matches the split
# seed so that re-running the notebook yields the same model.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)

# Fit on the training portion only.
xgb_model.fit(X_train, y_train)

# Per-feature importance scores, aligned positionally with the training columns.
importance = xgb_model.feature_importances_
features = X_train.columns

In [2]:

# Build a (Feature, Importance) table from the `features` / `importance`
# values produced by the model-training cell, ordered most important first.
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
)

# Rescale the scores so they sum to 1 for plotting.
# Guard against an all-zero importance vector: dividing by a zero total would
# fill the column with NaN and produce an empty chart.
# NOTE(review): the output rows recorded in this notebook show identical values
# in both columns, so the scores may already sum to 1 -- confirm.
total_importance = importance_df['Importance'].sum()
if total_importance > 0:
    importance_df['Importance_normalized'] = importance_df['Importance'] / total_importance
else:
    importance_df['Importance_normalized'] = 0.0

# Horizontal bar chart of normalized importance; raw score shown on hover.
fig = px.bar(
    importance_df,
    x='Importance_normalized',
    y='Feature',
    orientation='h',
    title='Feature Importance',
    labels={'Importance_normalized': 'Normalized Importance', 'Feature': 'Features'},
    hover_data=['Importance'],
)

# Layout tweaks: explicit axis titles, a reversed y-axis so the first (most
# important) row of the sorted frame is drawn at the top, and a light template.
fig.update_layout(
    xaxis_title='Normalized Importance',
    yaxis_title='Features',
    yaxis=dict(autorange='reversed'),
    template='plotly_white'
)

# Save the interactive chart as a standalone HTML file (nothing is rendered
# inline; open the file in a browser to view it).
fig.write_html('Normalized Importance.html')

Unnamed: 0,Feature,Importance,Importance_normalized
7,is_weekend,0.022831,0.022831
32,rain_wind_combined,0.001613,0.001613
17,ata,0.002753,0.002753
14,dropoff_lng,0.007477,0.007477
30,rain_last_6_hours,0.000164,0.000164
10,pickup_hour_cat,0.02046,0.02046
12,dropoff_ts,0.010113,0.010113
25,surge_multiplier,0.005087,0.005087
4,pick_lat,0.007399,0.007399
2,pickup_day_of_the_week,0.007474,0.007474
