# Enhanced Urban Heat Island (UHI) Index Prediction Model

Improvements:
1. Advanced Feature Engineering
2. Weighted Ensemble of an XGBoost Model and a Neural Network
3. Feature Importance Analysis
4. Cross-Validation

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Concatenate
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load and Prepare Data

In [None]:
# Read the two pickled feature tables and the labelled training CSV.
# NOTE: pickle.load can execute arbitrary code while deserialising, so these
# files must come from a trusted source.
with open('train_features (1).pkl', 'rb') as train_fh:
    train_features = pickle.load(train_fh)

with open('submit_features.pkl', 'rb') as submit_fh:
    test_df = pickle.load(submit_fh)

# The CSV supplies the 'UHI Index' target column used later on.
train_df = pd.read_csv('Training_data_uhi_index_UHI2025-v2.csv')

# Quick sanity check of what was loaded.
print("Training features shape:", train_features.shape)
print("Test data shape:", test_df.shape)

## 2. Advanced Feature Engineering

In [None]:
def engineer_features(df, *, s2_max=None):
    """Return a copy of ``df`` with derived model features appended.

    The input frame is never modified. The following columns are added:

    * ``temp_range``: ``temp_max - temp_min``
    * ``temp_to_humidity``: ``temp_mean / humidity_mean``
    * ``lst_to_temp``: ``lst / temp_mean``
    * ``lst_anomaly``: ``lst - temp_mean``
    * ``building_volume``: ``building_density * avg_building_height``
      (only when both source columns exist)
    * ``building_efficiency``: ``building_coverage / building_density``
      (only when both source columns exist)
    * ``temp_humidity_interaction``: ``temp_mean * humidity_mean``
    * ``s2_features_norm``: ``s2_features`` divided by a reference maximum

    Args:
        df: DataFrame with ``temp_max``, ``temp_min``, ``temp_mean``,
            ``humidity_mean``, ``lst`` and ``s2_features`` columns. The
            building columns are optional.
        s2_max: Reference maximum for normalising ``s2_features``. When
            ``None`` (the default) the maximum of ``df`` itself is used.
            Pass the training-set maximum when transforming held-out data so
            that both frames are expressed on the same scale.

    Returns:
        A new DataFrame containing the original and the engineered columns.

    Raises:
        KeyError: If one of the required (non-building) columns is missing.
    """
    # Work on a copy so the caller's frame stays untouched.
    df = df.copy()

    # NOTE: the ratios below follow pandas division rules, so a zero
    # denominator produces inf/NaN rather than raising.

    # Temperature-related features
    df['temp_range'] = df['temp_max'] - df['temp_min']
    df['temp_to_humidity'] = df['temp_mean'] / df['humidity_mean']

    # LST-related features
    df['lst_to_temp'] = df['lst'] / df['temp_mean']
    df['lst_anomaly'] = df['lst'] - df['temp_mean']

    # Building-related features: each one is created only when every column
    # it reads is present, so a frame carrying just part of the building
    # data no longer fails with a KeyError.
    available = set(df.columns)
    if {'building_density', 'avg_building_height'} <= available:
        df['building_volume'] = df['building_density'] * df['avg_building_height']
    if {'building_coverage', 'building_density'} <= available:
        df['building_efficiency'] = df['building_coverage'] / df['building_density']

    # Interaction features
    df['temp_humidity_interaction'] = df['temp_mean'] * df['humidity_mean']

    # Normalise s2_features by the supplied reference, falling back to this
    # frame's own maximum to keep the original default behaviour.
    reference_max = df['s2_features'].max() if s2_max is None else s2_max
    df['s2_features_norm'] = df['s2_features'] / reference_max

    return df

# Apply feature engineering
train_features_eng = engineer_features(train_features)
test_features_eng = engineer_features(test_df)

# By default each frame's s2_features is normalised by that frame's own
# maximum, which would put train and test on different scales. Re-derive the
# test column from the training maximum so both share one reference.
test_features_eng['s2_features_norm'] = (
    test_df['s2_features'] / train_features['s2_features'].max()
)

# Define final feature set
features = [
    'lst', 's2_features', 'temp_mean', 'temp_max', 'temp_min',
    'temp_std', 'humidity_mean', 'humidity_max', 'humidity_min',
    'temp_range', 'temp_to_humidity', 'lst_to_temp', 'lst_anomaly',
    'temp_humidity_interaction', 's2_features_norm'
]

# Features and labels come from two separate files and are paired purely by
# row position, so a length mismatch would silently misalign them.
if len(train_features_eng) != len(train_df):
    raise ValueError(
        f"Row count mismatch: {len(train_features_eng)} feature rows vs "
        f"{len(train_df)} label rows"
    )

# Prepare features and target
X = train_features_eng[features]
y = train_df['UHI Index']
X_test = test_features_eng[features]

# Scale features: fit on the training data only, then apply the same
# transformation to the test data.
scaler = RobustScaler()  # More robust to outliers than StandardScaler
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

print("Final feature set shape:", X_scaled.shape)

## 3. Create Enhanced Neural Network and XGBoost Model

In [None]:
def create_enhanced_nn(input_dim):
    """Build and compile a two-branch regression network.

    Both branches read the same input vector. The deep branch stacks
    Dense(128) -> BatchNorm -> Dropout(0.3) -> Dense(64) -> BatchNorm; the
    shallow branch is Dense(32) -> BatchNorm. Their outputs are concatenated
    and fed to a single linear unit.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 0.001), MSE loss
        and an MAE metric.
    """
    inputs = Input(shape=(input_dim,))

    # Deep branch for complex patterns: apply the layers one after another.
    deep_layers = (
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
    )
    deep = inputs
    for layer in deep_layers:
        deep = layer(deep)

    # Shallow branch for direct relationships.
    shallow = Dense(32, activation='relu')(inputs)
    shallow = BatchNormalization()(shallow)

    # Join the branches and map them to one regression output.
    combined = Concatenate()([deep, shallow])
    prediction = Dense(1, activation='linear')(combined)

    network = Model(inputs=inputs, outputs=prediction)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return network

# Gradient-boosted regressor used as the second ensemble member.
# The hyper-parameters are collected in one mapping so they are easy to scan.
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,         # fraction of rows sampled per tree
    'colsample_bytree': 0.8,  # fraction of columns sampled per tree
    'random_state': 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

## 4. Cross-Validation Training

In [None]:
# Number of folds; also used to average the per-fold test predictions.
N_SPLITS = 5

# Ensemble weights, shared by the per-fold validation score and the final
# blend so the two can never drift apart.
NN_WEIGHT = 0.6
XGB_WEIGHT = 0.4

# Initialize K-Fold
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# XGBoost 2.0 removed the early_stopping_rounds argument from fit(); it must
# be set on the estimator instead (supported since XGBoost 1.6).
xgb_model.set_params(early_stopping_rounds=50)

# Accumulators for the test-set predictions of each model
nn_predictions = np.zeros((X_test.shape[0],))
xgb_predictions = np.zeros((X_test.shape[0],))
val_scores = []

# Train models with cross-validation
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_scaled)):
    print(f"\nFold {fold + 1}")

    # Split data (X_scaled is a NumPy array, y a pandas Series, hence .iloc)
    X_train_fold = X_scaled[train_idx]
    y_train_fold = y.iloc[train_idx]
    X_val_fold = X_scaled[val_idx]
    y_val_fold = y.iloc[val_idx]

    # Train Neural Network: a fresh model per fold, stopped once the
    # validation loss has not improved for 10 epochs.
    nn_model = create_enhanced_nn(X_scaled.shape[1])
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )

    nn_model.fit(
        X_train_fold, y_train_fold,
        validation_data=(X_val_fold, y_val_fold),
        epochs=100,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=0
    )

    # Train XGBoost; eval_set drives the early stopping configured above.
    xgb_model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=0
    )

    # Accumulate this fold's test-set predictions
    nn_predictions += nn_model.predict(X_test_scaled, verbose=0).flatten()
    xgb_predictions += xgb_model.predict(X_test_scaled)

    # Validate fold with the same blend that is used for the final output.
    # NOTE: both models early-stop on this same validation fold, so the
    # reported score is an optimistic estimate.
    nn_val_pred = nn_model.predict(X_val_fold, verbose=0).flatten()
    xgb_val_pred = xgb_model.predict(X_val_fold)
    val_pred = NN_WEIGHT * nn_val_pred + XGB_WEIGHT * xgb_val_pred
    val_score = r2_score(y_val_fold, val_pred)
    val_scores.append(val_score)
    print(f"Fold {fold + 1} R² Score: {val_score:.4f}")

# Average predictions over the folds
nn_predictions /= N_SPLITS
xgb_predictions /= N_SPLITS

# Ensemble predictions (weighted average)
final_predictions = NN_WEIGHT * nn_predictions + XGB_WEIGHT * xgb_predictions

print(f"\nAverage Cross-Validation R² Score: {np.mean(val_scores):.4f} ± {np.std(val_scores):.4f}")

## 5. Feature Importance Analysis

In [None]:
# Pair every feature name with its XGBoost importance score and rank them,
# most important first. xgb_model is re-fitted on each cross-validation
# fold, so the scores reflect the most recent fit.
feature_importance = pd.DataFrame(
    {'feature': features, 'importance': xgb_model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance')
plt.show()

## 6. Save Predictions

In [None]:
# Destination of the finished submission
output_file = 'UHI_prediction_v2.csv'

# Fill the submission template with the ensemble predictions
submit_df = pd.read_csv('Submission_template_UHI2025-v2.csv').assign(
    **{'UHI Index': final_predictions}
)

# Write the file without the DataFrame index column
submit_df.to_csv(output_file, index=False)
print(f"\nPredictions saved to {output_file}")

# Summary statistics of the predicted values as a final sanity check
print("\nPrediction Statistics:")
print(submit_df['UHI Index'].describe())