In [5]:
# Import required libraries
import pandas as pd
import numpy as np
import rasterio
import tensorflow as tf

# RandomFroestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xarray as xr
from sklearn.preprocessing import StandardScaler

print("Numpy version:", np.__version__)
print("Pandas version:", pd.__version__)
print("TensorFlow version:", tf.__version__)

Numpy version: 1.23.5
Pandas version: 2.0.3
TensorFlow version: 2.12.0


In [3]:
# update on 2/15 current features check
import pickle
with open('feature_columns.pkl', 'rb') as f:
    feature_columns = pickle.load(f)
feature_columns
['lst',
 's2_features',
 'temp_mean',
# 'temp_max',
# 'temp_min',
 'temp_std',
 'humidity_mean',
#  'humidity_max',
#  'humidity_min',
 'wind_speed_mean',
 'solar_flux_mean',
# 'solar_flux_max',
 'wind_direction_mean',
 'building_density',
 'avg_building_height',
 'building_coverage',
 'building_coverage_ratio',
# 'building_density_squared',
 'height_to_coverage_ratio',
 'temp_building_interaction',
# 'temp_coverage_interaction',
 'temp_std_coverage_ratio',
 'lst_building_interaction',
 'lst_coverage_interaction',
 'wind_building_interaction',
# 'wind_speed_coverage_ratio',
# 'surface_exposure_index',
 'temp_wind_building',
 'solar_building_exposure']

['lst',
 's2_features',
 'temp_mean',
 'temp_max',
 'temp_min',
 'temp_std',
 'humidity_mean',
 'humidity_max',
 'humidity_min',
 'wind_speed_mean',
 'solar_flux_mean',
 'solar_flux_max',
 'wind_direction_mean',
 'building_density',
 'avg_building_height',
 'building_coverage',
 'building_coverage_ratio',
 'building_density_squared',
 'height_to_coverage_ratio',
 'temp_building_interaction',
 'temp_coverage_interaction',
 'temp_std_coverage_ratio',
 'lst_building_interaction',
 'lst_coverage_interaction',
 'wind_building_interaction',
 'wind_speed_coverage_ratio',
 'surface_exposure_index',
 'temp_wind_building',
 'solar_building_exposure']

Data Prep

In [6]:
# pull all dataset and see how does it look
train_df = pd.read_csv('Training_data_uhi_index_UHI2025-v2.csv')
submit_df = pd.read_csv('Submission_template_UHI2025-v2.csv')
features = pd.read_pickle('X.pkl')
target = pd.read_pickle('Y.pkl')
print("Train",train_df.columns, len(train_df))
print("Test",submit_df.columns, len(submit_df))
print("original",features.columns, len(features))

Train Index(['Longitude', 'Latitude', 'datetime', 'UHI Index'], dtype='object') 11229
Test Index(['Longitude', 'Latitude', 'UHI Index'], dtype='object') 1040
original Index(['lst', 's2_features', 'location', 'temp_mean', 'temp_max', 'temp_min',
       'temp_std', 'humidity_mean', 'humidity_max', 'humidity_min',
       'wind_speed_mean', 'solar_flux_mean', 'solar_flux_max',
       'wind_direction_mean', 'building_density', 'avg_building_height',
       'building_coverage'],
      dtype='object') 11229


In [13]:
  # Load the satellite data
def load_geotiff(file_path):
    with rasterio.open(file_path) as src:
        return src.read(1), src.transform

# Function to get pixel values at given coordinates
def get_pixel_values(lat, lon, data, transform):
    row, col = rasterio.transform.rowcol(transform, lon, lat)
    try:
        return data[row, col]
    except IndexError:
        return np.nan

def create_feature_matrix(train_data, weather_features):
    # Combine all features
    features = pd.DataFrame()

    # Add original features (excluding lat/lon)
    original_features = train_data.drop(['Longitude', 'Latitude', 'UHI Index'], axis=1)

    features = pd.concat([features, original_features], axis=1)

    # Add weather features
    features = features.merge(weather_features, on='location', how='left')


    # Add satellite features
    features['lst'] = train_data['lst']
    features['s2_features'] = train_data['s2_features']

    return features

def add_location_column(df):
    """Add location column based on coordinates"""
    # Define Manhattan and Bronx boundaries (approximate)
    manhattan_bounds = {
        'lat_min': 40.75,
        'lat_max': 40.88,
        'lon_min': -74.01,
        'lon_max': -73.86
    }

    bronx_bounds = {
        'lat_min': 40.785,
        'lat_max': 40.92,
        'lon_min': -73.93,
        'lon_max': -73.765
    }

    def get_location(row):
        lat = row['Latitude']
        lon = row['Longitude']

        if (manhattan_bounds['lat_min'] <= lat <= manhattan_bounds['lat_max'] and
            manhattan_bounds['lon_min'] <= lon <= manhattan_bounds['lon_max']):
            return 'Manhattan'
        elif (bronx_bounds['lat_min'] <= lat <= bronx_bounds['lat_max'] and
              bronx_bounds['lon_min'] <= lon <= bronx_bounds['lon_max']):
            return 'Bronx'
        else:
            return 'Unknown'

    df['location'] = df.apply(get_location, axis=1)
    return df

In [8]:
X = features[['lst', 's2_features']]
       #'temp_mean', 'temp_max', 'temp_min',
       #'temp_std', 'humidity_mean', 'humidity_max', 'humidity_min']]
       #'wind_speed_mean', 'solar_flux_mean', 'solar_flux_max',
       #'wind_direction_mean']]
# Prepare features and target
y = target

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

print(len(X), len(y))

11229 11229


Model - Neural Networks

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

def create_deep_model(input_dim):
    """Create a deep neural network model"""
    model = Sequential([
        # Input layer
        Dense(256, activation='relu', input_dim=input_dim),
        BatchNormalization(),
        Dropout(0.3),

        # Hidden layers
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        # Output layer
        Dense(1, activation='linear')
    ])

    return model

def train_model(X_train, X_test, y_train, y_test):
    """Train the neural network model"""
    # Set random seeds for reproducibility
    tf.random.set_seed(42)
    np.random.seed(42)

    # Create model
    model = create_deep_model(X_train.shape[1])

    # Compile model
    optimizer = Adam(learning_rate=0.001)
    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=['mae', 'mse', tf.keras.metrics.RootMeanSquaredError()]
    )

    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=20,
        restore_best_weights=True
    )

    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=5,
        min_lr=1e-6
    )

    # Convert data to NumPy arrays and ensure correct data types
    X_train = np.asarray(X_train).astype('float32')
    X_test = np.asarray(X_test).astype('float32')
    y_train = np.asarray(y_train).astype('float32')
    y_test = np.asarray(y_test).astype('float32')

    # Train model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=500,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    return model, history

def evaluate_model(model, X_test, y_test):
    """Evaluate model performance"""
    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    mse = np.mean((y_test - y_pred.flatten()) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_test - y_pred.flatten()))
    r2 = 1 - (np.sum((y_test - y_pred.flatten()) ** 2) /
              np.sum((y_test - np.mean(y_test)) ** 2))

    print("\nModel Performance:")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R2 Score: {r2:.4f}")
    print(f"Accuracy: {(1 - rmse/np.mean(y_test))*100:.2f}%")

    return mse, rmse, mae, r2

In [10]:
 # Train model
print("\nTraining model...")
model, history = train_model(X_train, X_val, y_train, y_val)

# Evaluate model
print("\nEvaluating model...")
metrics = evaluate_model(model, X_val, y_val)

# Save model
print("\nSaving model...")
model.save('nn_deep_model_v2.h5')


Training model...
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500

Evaluating model...

Model Performance:
MSE: 0.0002
RMSE: 0.0154
MAE: 0.0124
R2 Score: 0.1013
Accuracy: 98.46%

Saving model...


Test Data Prep and Predictions

In [11]:
# Load Landsat LST data
lst_data, lst_transform = load_geotiff('Landsat_LST.tiff')
# Load Sentinel-2 data (contains NDVI, NDWI, EVI)
s2_data, s2_transform = load_geotiff('S2_sample.tiff')
# Extract features for submission points
submit_df['lst'] = submit_df.apply(lambda x: get_pixel_values(x['Latitude'], x['Longitude'], lst_data, lst_transform), axis=1)
submit_df['s2_features'] = submit_df.apply(lambda x: get_pixel_values(x['Latitude'], x['Longitude'], s2_data, s2_transform), axis=1)

In [12]:
submission_features = create_feature_matrix(
    add_location_column(submit_df),
    weather_features= pd.read_pickle("weather_features.pkl")
)

submission_features.columns

Index(['lst', 's2_features', 'location', 'temp_mean', 'temp_max', 'temp_min',
       'temp_std', 'humidity_mean', 'humidity_max', 'humidity_min',
       'wind_speed_mean', 'solar_flux_mean', 'solar_flux_max',
       'wind_direction_mean'],
      dtype='object')

In [14]:
submission_features.to_pickle('submit_features.pkl')
# Scale features
X_submission = submission_features[['lst', 's2_features']]
       #temp_mean', 'temp_max', 'temp_min',
       #'temp_std', 'humidity_mean', 'humidity_max', 'humidity_min']]
       #'wind_speed_mean', 'solar_flux_mean', 'solar_flux_max',
       #'wind_direction_mean']]
X_submission_scaled = scaler.transform(X_submission)

# Make predictions
submit_df['UHI Index'] = model.predict(X_submission_scaled)

# Save predictions
submit_df[['Latitude','Longitude', 'UHI Index']].to_csv('UHI_predictions_v9.csv', index=False)

