# Feature Importance Analysis

Analyze which features are most important in our models to guide further feature engineering.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
# Plot styling applied to every seaborn/matplotlib figure drawn below.
sns.set_style('whitegrid')

# Single seed constant; NumPy's global RNG is seeded from it here.
SEED = 42
np.random.seed(SEED)

print("Loading data...")
# Read the train and test splits in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/code/data/train.csv', '/home/code/data/test.csv')
)

# Report (rows, columns) for each split.
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} shape: {split_df.shape}")

In [None]:
# Feature engineering function (same as used in experiments)
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with engineered feature columns added.

    The input frame is not modified. It must contain the numeric columns
    ``Age``, ``Height``, ``Weight``, ``Duration``, ``Heart_Rate`` and
    ``Body_Temp``; a ``Sex`` column is optional.

    Added/changed columns:
      * ``Sex``: the short codes ``'M'``/``'F'`` are expanded to
        ``'male'``/``'female'``; every other value (including labels that
        are already spelled out, and missing values) is kept as-is.
      * ``<col>_log1p``: ``log(1 + x)`` of each of the six numeric columns.
      * ``Weight_Duration``, ``Duration_Heart_Rate``, ``Height_Weight``:
        pairwise products.
      * ``Weight_Height``: ``Weight / (Height + 1e-6)``.
      * ``BMI``: ``Weight / ((Height / 100) ** 2 + 1e-6)``.

    Raises:
        KeyError: if one of the required numeric columns is missing.
    """
    df_new = df.copy()

    # Normalise Sex labels. A plain ``Series.map(dict)`` would turn every
    # value that is not a key of the dict into NaN, wiping the column when
    # the data already uses 'male'/'female'. Looking each value up with a
    # fallback to itself only rewrites the short codes.
    if 'Sex' in df_new.columns:
        sex_labels = {'M': 'male', 'F': 'female'}
        df_new['Sex'] = df_new['Sex'].map(lambda value: sex_labels.get(value, value))

    # Original numerical features
    num_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

    # Log1p transformations
    for col in num_features:
        df_new[f'{col}_log1p'] = np.log1p(df_new[col])

    # Product features (from winning solutions)
    df_new['Weight_Duration'] = df_new['Weight'] * df_new['Duration']
    df_new['Duration_Heart_Rate'] = df_new['Duration'] * df_new['Heart_Rate']
    df_new['Height_Weight'] = df_new['Height'] * df_new['Weight']

    # Ratio features (the small epsilon guards against division by zero)
    df_new['Weight_Height'] = df_new['Weight'] / (df_new['Height'] + 1e-6)

    # BMI feature (Body Mass Index approximation); Height is divided by 100
    # before squaring, again with an epsilon in the denominator
    df_new['BMI'] = df_new['Weight'] / ((df_new['Height'] / 100) ** 2 + 1e-6)

    return df_new

# Run the same feature engineering over both splits (train first, then test)
train_feat, test_feat = create_features(train_df), create_features(test_df)

# Model inputs: every engineered column except the row id and the target
non_feature_cols = {'id', 'Calories'}
feature_cols = [col for col in train_feat.columns if col not in non_feature_cols]
print(f"All features before filtering: {feature_cols}")

# 'Sex' is the only column handled as categorical (when present);
# everything else in feature_cols is treated as numerical
cat_features = [col for col in ['Sex'] if col in feature_cols]
num_features = [col for col in feature_cols if col not in cat_features]

print(f"Numerical features: {num_features}")
print(f"Categorical features: {cat_features}")

# Design matrix and target used for model fitting
y = train_feat['Calories']
X = train_feat.loc[:, feature_cols]

print(f"Total features: {len(feature_cols)}")

In [None]:
# Train a CatBoost model on the full training data. The model is fitted
# without a validation set because it is only used to rank features.
print("Training CatBoost model for feature importance...")

model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    random_seed=SEED,
    verbose=False
)

# Create pool (carries the features, the target and the categorical columns)
train_pool = Pool(X, label=y, cat_features=cat_features)

# Train model
model.fit(train_pool, verbose=False)

print("Model trained successfully!")

# CatBoost only tracks a best iteration when fit() is given an eval_set;
# without one the attribute is None, so report the number of trees instead
# of printing a misleading "Best iteration: None".
best_iteration = model.best_iteration_
if best_iteration is None:
    print(f"Trees built: {model.tree_count_} (no validation set, so no best iteration)")
else:
    print(f"Best iteration: {best_iteration}")

In [None]:
# Ask the fitted model for one importance score per input column,
# then rank the columns from most to least important
feature_importance = model.get_feature_importance(train_pool)
importance_df = pd.DataFrame({'feature': feature_cols, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 10 most important features:")
print(importance_df.head(10))

# Horizontal bar chart of the 15 highest-ranked features
top_importance = importance_df.head(15)
plt.figure(figsize=(12, 8))
sns.barplot(data=top_importance, x='importance', y='feature', palette='viridis')
plt.title('Top 15 Feature Importance (CatBoost)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
# Analyze correlation between features and target.
# Only numeric columns are correlated: feature_cols also contains the string
# column 'Sex', and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 (older versions silently dropped such columns).
numeric_df = train_feat[feature_cols + ['Calories']].select_dtypes(include=[np.number])

# Drop the target's correlation with itself (always 1.0) by name, so neither
# the printout nor the plot depends on it happening to sort first.
correlation_df = (
    numeric_df.corr()['Calories']
    .drop('Calories')
    .sort_values(ascending=False)
)

print("Correlation with target (Calories):")
print(correlation_df.head(15))

# Plot the 15 features with the highest (signed) correlation
top_corr = correlation_df.head(15)
plt.figure(figsize=(12, 8))
sns.barplot(x=top_corr.values, y=top_corr.index, palette='coolwarm')
plt.title('Top 15 Features Correlation with Target')
plt.xlabel('Correlation')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
# Visual check of how the strongest features relate to each other and to the target
print("Analyzing key feature patterns...")

# The eight highest-ranked features from the importance table
top_features = importance_df['feature'].head(8).tolist()
print(f"Top features by importance: {top_features}")

# Pairwise scatter matrix (KDE on the diagonal) of those features plus the target
pairplot_columns = top_features + ['Calories']
scatter_style = {'alpha': 0.6, 's': 10}
sns.pairplot(
    train_feat[pairplot_columns],
    diag_kind='kde',
    plot_kws=scatter_style,
    height=2.5,
)
plt.suptitle('Pairplot of Top Features vs Target', y=1.02)
plt.show()

In [None]:
# Residual diagnostics: where does the fitted model miss on the data it was trained on?
print("Analyzing residuals...")

# Predictions for the training pool and the signed error (actual - predicted)
preds = model.predict(train_pool)
residuals = y - preds

# Engineered training frame extended with the predictions and residuals
analysis_df = train_feat.assign(predictions=preds, residuals=residuals)

print(f"Mean residual: {residuals.mean():.4f}")
print(f"Std residual: {residuals.std():.4f}")

# Figure 1: residuals against predicted values, with a zero reference line
plt.figure(figsize=(10, 6))
plt.scatter(preds, residuals, alpha=0.6, s=10)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Calories')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title('Residuals vs Predictions')
plt.tight_layout()
plt.show()

# Figure 2: a 2x3 grid, one panel per top feature (first six), filled row by row
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Residuals vs Top Features', fontsize=16)

for panel, feature in zip(axes.ravel(), top_features[:6]):
    panel.scatter(analysis_df[feature], residuals, alpha=0.6, s=10)
    panel.axhline(y=0, color='r', linestyle='--')
    panel.set_xlabel(feature)
    panel.set_ylabel('Residuals')
    panel.set_title(f'Residuals vs {feature}')

plt.tight_layout()
plt.show()

In [None]:
# Summarise the ranking, list candidate features to try next, and persist the ranking
print("Identifying potential new features...")

# Ten highest-ranked features, numbered from 1
print("Current top features by importance:")
top_ten = importance_df.head(10)
for rank, (feat, imp) in enumerate(zip(top_ten['feature'], top_ten['importance']), start=1):
    print(f"{rank:2d}. {feat:20s} - Importance: {imp:.2f}")

# Hand-written list of feature ideas, emitted as one block of text
suggestion_lines = [
    "\nSuggested new features based on analysis:",
    "1. Interaction terms between top features:",
    "   - Duration × Heart_Rate (already have)",
    "   - Weight × Heart_Rate",
    "   - Age × Duration",
    "   - BMI × Duration",
    "\n2. Polynomial features:",
    "   - Duration² (non-linear effect)",
    "   - Heart_Rate²",
    "\n3. Ratio features:",
    "   - Heart_Rate / Age",
    "   - Body_Temp / Heart_Rate",
    "\n4. Combined metrics:",
    "   - (Weight × Duration) / Height",
    "   - Duration × (Heart_Rate - Resting_Heart_Rate)",
]
print("\n".join(suggestion_lines))

# Write the full importance table (without the index) for later reference
importance_path = '/home/code/exploration/feature_importance_catboost.csv'
importance_df.to_csv(importance_path, index=False)
print(f"\nFeature importance saved to: {importance_path}")