In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd
from pycaret.regression import *
import rasterio
import rasterio.transform
# `shapely.geometry` exports no `buffer` (importing it from there raises ImportError);
# the function lives at `shapely.buffer` (Shapely >= 2.0). The notebook itself only
# calls the `.buffer()` geometry method, so the name is kept purely for compatibility.
from shapely import buffer
from shapely.geometry import Point

In [None]:
# Load the satellite data
def load_geotiff(file_path, band=1):
    """Read one band of a raster file together with its georeferencing transform.

    Args:
        file_path: Path of the raster, handed to ``rasterio.open``.
        band: Index of the band passed to ``src.read``. Defaults to 1, the band
            this helper always read before the parameter existed, so existing
            single-argument calls behave as before.

    Returns:
        A ``(pixels, transform)`` tuple: the array returned by ``src.read(band)``
        and the dataset's ``transform`` attribute.
    """
    # The context manager closes the dataset as soon as the band has been read
    with rasterio.open(file_path) as src:
        return src.read(band), src.transform

# Rasters: each call yields the pixel array of the band read by load_geotiff plus the
# transform used further down to turn lon/lat into pixel indices.
# NOTE(review): the Sentinel-2 file is said to contain NDVI, NDWI and EVI, yet only a
# single band is read here - confirm which index that band holds.
(lst_data, lst_transform), (s2_data, s2_transform) = (
    load_geotiff('Landsat_LST.tiff'),
    load_geotiff('S2_sample.tiff'),
)

# Training table; its 'Latitude' / 'Longitude' columns drive the raster sampling below
train_df = pd.read_csv('Training_data_uhi_index_UHI2025-v2.csv')

# Function to get pixel values at given coordinates
def get_pixel_values(lat, lon, data, transform):
    """Sample a raster at one coordinate.

    Args:
        lat: Y coordinate (latitude) of the point.
        lon: X coordinate (longitude) of the point.
        data: Pixel array indexed as ``data[row, col]``.
        transform: Georeferencing transform handed to ``rasterio.transform.rowcol``.

    Returns:
        The pixel value at the point, or ``np.nan`` when the point lies outside
        the raster on any side.
    """
    row, col = rasterio.transform.rowcol(transform, lon, lat)
    n_rows, n_cols = data.shape[:2]

    # Check the bounds explicitly: a negative index does not raise IndexError in
    # NumPy, it wraps around, so a point north/west of the raster would silently
    # pick up a pixel from the opposite edge.
    if 0 <= row < n_rows and 0 <= col < n_cols:
        return data[row, col]
    return np.nan

# Extract features for each training point.
# A plain comprehension over the coordinate pairs also works for an empty frame,
# where a row-wise DataFrame.apply does not return a Series.
train_df['lst'] = [
    get_pixel_values(lat, lon, lst_data, lst_transform)
    for lat, lon in zip(train_df['Latitude'], train_df['Longitude'])
]
train_df['s2_features'] = [
    get_pixel_values(lat, lon, s2_data, s2_transform)
    for lat, lon in zip(train_df['Latitude'], train_df['Longitude'])
]

# Drop rows with missing values, then renumber the rows 0..n-1. Later cells build
# feature frames with a fresh positional index; keeping the gapped index here would
# misalign any index-based assignment or target Series taken from train_df.
train_df = train_df.dropna().reset_index(drop=True)

In [6]:
# Load datasets
def load_all_data():
    """Gather the notebook's inputs in a single call.

    Returns:
        A 4-tuple ``(train_data, manhattan_weather, bronx_weather, building_data)``:
        a copy of the module-level ``train_df``, the 'Manhattan' and 'Bronx' sheets
        of the weather workbook, and the building-footprint layer.
    """
    # Work on a copy so the shared training frame is not handed out directly
    training_copy = train_df.copy()

    # Asking for a list of sheet names yields a dict keyed by sheet name
    sheets = pd.read_excel('NY_Mesonet_Weather.xlsx', sheet_name=['Manhattan', 'Bronx'])
    manhattan_sheet, bronx_sheet = sheets['Manhattan'], sheets['Bronx']

    footprints = gpd.read_file('Building_Footprint.kml')

    return training_copy, manhattan_sheet, bronx_sheet, footprints

In [7]:
# Preprocess weather data
def process_weather_data(manhattan_weather, bronx_weather):
    """Summarise the two station logs into one feature row per location.

    Args:
        manhattan_weather: Observations for the Manhattan station.
        bronx_weather: Observations for the Bronx station.
            Both frames must provide the columns 'Air Temp at Surface [degC]',
            'Relative Humidity [percent]', 'Avg Wind Speed [m/s]',
            'Solar Flux [W/m^2]' and 'Wind Direction [degrees]'.
            Neither input frame is modified.

    Returns:
        DataFrame with one row per location ('Bronx', 'Manhattan') and the columns
        location, temp_mean, temp_max, temp_min, temp_std, humidity_mean,
        humidity_max, humidity_min, wind_speed_mean, solar_flux_mean,
        solar_flux_max, wind_direction_mean.

    Raises:
        Exception: Any error raised while aggregating (for example ``KeyError``
            for a missing column) is re-raised after the head of the combined
            frame has been printed for debugging.
    """
    def circular_mean_degrees(angles):
        """Mean compass direction, obtained by averaging unit vectors.

        An arithmetic mean is wrong for angles: 350 and 10 degrees would average
        to 180 (south) although both readings point north.
        """
        radians = np.deg2rad(angles.dropna().to_numpy(dtype=float))
        if radians.size == 0:
            return np.nan
        mean_angle = np.arctan2(np.sin(radians).mean(), np.cos(radians).mean())
        return float(np.rad2deg(mean_angle) % 360.0)

    # First, let's print the data structure
    print("Manhattan columns:", manhattan_weather.columns)
    print("Bronx columns:", bronx_weather.columns)

    # assign() returns new frames, so the caller's data is left untouched
    weather_combined = pd.concat(
        [
            manhattan_weather.assign(location='Manhattan'),
            bronx_weather.assign(location='Bronx'),
        ],
        ignore_index=True,
    )

    # Verify the combined data
    print("\nCombined data columns:", weather_combined.columns)
    print("Combined data shape:", weather_combined.shape)

    try:
        # Named aggregation ties every output name to its source column and
        # statistic, instead of relying on a positional rename afterwards
        weather_features = weather_combined.groupby('location').agg(
            temp_mean=('Air Temp at Surface [degC]', 'mean'),
            temp_max=('Air Temp at Surface [degC]', 'max'),
            temp_min=('Air Temp at Surface [degC]', 'min'),
            temp_std=('Air Temp at Surface [degC]', 'std'),
            humidity_mean=('Relative Humidity [percent]', 'mean'),
            humidity_max=('Relative Humidity [percent]', 'max'),
            humidity_min=('Relative Humidity [percent]', 'min'),
            wind_speed_mean=('Avg Wind Speed [m/s]', 'mean'),
            solar_flux_mean=('Solar Flux [W/m^2]', 'mean'),
            solar_flux_max=('Solar Flux [W/m^2]', 'max'),
            wind_direction_mean=('Wind Direction [degrees]', circular_mean_degrees),
        ).reset_index()

        return weather_features

    except Exception as e:
        print(f"Error during processing: {str(e)}")
        print("\nFirst few rows of combined data:")
        print(weather_combined.head())
        raise

In [None]:
def add_location_column(df):
    """Label every row 'Manhattan', 'Bronx' or 'Unknown' from its coordinates.

    The two boroughs are approximated by axis-aligned bounding boxes. The boxes
    overlap slightly; rows inside both are labelled 'Manhattan' because that box
    is tested first. Rows outside both boxes, or with missing coordinates, get
    'Unknown'.

    Args:
        df: Frame with numeric 'Latitude' and 'Longitude' columns. The 'location'
            column is written into this frame in place.

    Returns:
        The same ``df`` object, now carrying the 'location' column.
    """
    # Define Manhattan and Bronx boundaries (approximate)
    manhattan_bounds = {
        'lat_min': 40.7,
        'lat_max': 40.88,
        'lon_min': -74.02,
        'lon_max': -73.91
    }

    bronx_bounds = {
        'lat_min': 40.785,
        'lat_max': 40.92,
        'lon_min': -73.93,
        'lon_max': -73.765
    }

    lat = df['Latitude'].to_numpy(dtype=float)
    lon = df['Longitude'].to_numpy(dtype=float)

    def inside(bounds):
        """Boolean mask of the rows lying within one bounding box (edges included)."""
        return (
            (lat >= bounds['lat_min']) & (lat <= bounds['lat_max'])
            & (lon >= bounds['lon_min']) & (lon <= bounds['lon_max'])
        )

    # np.select takes the first matching condition, which reproduces the
    # if / elif precedence; it also copes with an empty frame, unlike a
    # row-wise DataFrame.apply.
    df['location'] = np.select(
        [inside(manhattan_bounds), inside(bronx_bounds)],
        ['Manhattan', 'Bronx'],
        default='Unknown',
    )
    return df

In [8]:
# Building Data
def calculate_density(building_data, train_data, radius=500):
    """Count building footprints per km² around every point.

    Args:
        building_data: GeoDataFrame of building footprints. When it carries no
            CRS its coordinates are assumed to be lon/lat degrees (EPSG:4326).
        train_data: Frame with 'Longitude' and 'Latitude' columns in degrees.
        radius: Radius of the circular neighbourhood, in metres.

    Returns:
        List with one density per row of ``train_data`` (same order): the number
        of footprints intersecting the neighbourhood divided by the area of a
        circle of ``radius`` metres, in km².
    """
    if len(train_data) == 0:
        return []

    footprints = building_data if building_data.crs is not None else building_data.set_crs(epsg=4326)
    sample_points = gpd.GeoSeries(
        gpd.points_from_xy(train_data['Longitude'], train_data['Latitude']),
        crs='EPSG:4326',
    )

    # Buffer in a metric CRS. A buffer of radius/111000 degrees is only `radius`
    # metres north-south; east-west it shrinks with cos(latitude), so the zone
    # would be an ellipse while the density below divides by a true circle.
    metric_crs = sample_points.estimate_utm_crs()
    footprints_m = footprints.geometry.to_crs(metric_crs)
    points_m = sample_points.to_crs(metric_crs)

    zone_area_km2 = np.pi * (radius / 1000) ** 2  # buildings per km²
    # The spatial index avoids testing every footprint against every point
    footprint_index = footprints_m.sindex

    densities = []
    for zone in points_m.buffer(radius):
        hits = footprint_index.query(zone, predicate='intersects')
        densities.append(len(hits) / zone_area_km2)

    return densities

def calculate_height(building_data, train_data, radius=500):
    """Average building height around every point.

    The height is read from the first column of ``building_data`` whose name
    matches one of the known spellings; when none exists every building is
    given a default height of 10. ``building_data`` is not modified.

    Args:
        building_data: GeoDataFrame of building footprints. When it carries no
            CRS its coordinates are assumed to be lon/lat degrees (EPSG:4326).
        train_data: Frame with 'Longitude' and 'Latitude' columns in degrees.
        radius: Radius of the circular neighbourhood, in metres.

    Returns:
        List with one value per row of ``train_data`` (same order): the mean
        height of the footprints intersecting the neighbourhood, or 0 when
        there are none.
    """
    # Print available columns
    print("Available columns in building data:", building_data.columns.tolist())

    # Try different possible height column names
    possible_height_columns = ['HEIGHT', 'height', 'Height', 'building_height',
                             'BUILDING_HEIGHT', 'HeightRoof', 'HEIGHT_ROOF']

    height_col = next(
        (col for col in possible_height_columns if col in building_data.columns),
        None,
    )

    if height_col is None:
        print("Warning: No height column found. Using default height of 10 meters.")
        # Keep the fallback in a local Series instead of writing a new column
        # into the caller's GeoDataFrame
        heights = pd.Series(10, index=building_data.index)
    else:
        heights = building_data[height_col]

    if len(train_data) == 0:
        return []

    footprints = building_data if building_data.crs is not None else building_data.set_crs(epsg=4326)
    sample_points = gpd.GeoSeries(
        gpd.points_from_xy(train_data['Longitude'], train_data['Latitude']),
        crs='EPSG:4326',
    )

    # Buffer in a metric CRS so the neighbourhood really is a circle of `radius`
    # metres; a buffer expressed in degrees is squeezed east-west.
    metric_crs = sample_points.estimate_utm_crs()
    footprints_m = footprints.geometry.to_crs(metric_crs)
    points_m = sample_points.to_crs(metric_crs)
    footprint_index = footprints_m.sindex

    # Calculate average height for each point
    avg_heights = []
    for zone in points_m.buffer(radius):
        # Positions returned by the index match the row order of building_data
        hits = footprint_index.query(zone, predicate='intersects')
        avg_heights.append(heights.iloc[hits].mean() if len(hits) > 0 else 0)

    return avg_heights

def calculate_coverage(building_data, train_data, radius=500):
    """Share of each point's neighbourhood that is covered by building footprints.

    Args:
        building_data: GeoDataFrame of building footprints. When it carries no
            CRS its coordinates are assumed to be lon/lat degrees (EPSG:4326).
        train_data: Frame with 'Longitude' and 'Latitude' columns in degrees.
        radius: Radius of the circular neighbourhood, in metres.

    Returns:
        List with one ratio per row of ``train_data`` (same order): footprint
        area lying inside the neighbourhood divided by the neighbourhood's
        area, or 0 when no footprint intersects it.
    """
    if len(train_data) == 0:
        return []

    footprints = building_data if building_data.crs is not None else building_data.set_crs(epsg=4326)
    sample_points = gpd.GeoSeries(
        gpd.points_from_xy(train_data['Longitude'], train_data['Latitude']),
        crs='EPSG:4326',
    )

    # Areas are only meaningful in a metric CRS; in lon/lat they are squared degrees
    metric_crs = sample_points.estimate_utm_crs()
    footprints_m = footprints.geometry.to_crs(metric_crs)
    points_m = sample_points.to_crs(metric_crs)

    # Clipping can fail on invalid polygons, so repair those first with the
    # zero-width-buffer trick (on a copy, never on the caller's geometries)
    invalid = ~footprints_m.is_valid
    if invalid.any():
        footprints_m = footprints_m.copy()
        footprints_m[invalid] = footprints_m[invalid].buffer(0)

    footprint_index = footprints_m.sindex

    # Calculate coverage for each point
    coverage_ratios = []
    for zone in points_m.buffer(radius):
        hits = footprint_index.query(zone, predicate='intersects')

        if len(hits) > 0:
            # Count only the part of each footprint inside the zone; summing whole
            # footprints that merely touch the edge can push the ratio above 1
            built_area = footprints_m.iloc[hits].intersection(zone).area.sum()
            coverage = built_area / zone.area
        else:
            coverage = 0

        coverage_ratios.append(coverage)

    return coverage_ratios

def engineer_building_features(building_data, train_data):
    """Run every building-feature calculator and collect the results.

    Args:
        building_data: Building footprints, forwarded to each calculator.
        train_data: Points to describe, forwarded to each calculator.

    Returns:
        DataFrame with the columns 'building_density', 'avg_building_height'
        and 'building_coverage', one row per value returned by the calculators.
    """
    # (progress message, output column, calculator) in execution order
    steps = (
        ("Calculating building density...", 'building_density', calculate_density),
        ("Calculating average building height...", 'avg_building_height', calculate_height),
        ("Calculating building coverage...", 'building_coverage', calculate_coverage),
    )

    building_features = pd.DataFrame()
    for message, column, calculator in steps:
        print(message)
        building_features[column] = calculator(building_data, train_data)

    return building_features

In [9]:
def create_feature_matrix(train_data, weather_features, building_features,
                          lst_data=None, s2_data=None):
    """Assemble the model input from point, weather and building features.

    Args:
        train_data: One row per point. Must contain 'location', 'lst' and
            's2_features'; 'Longitude', 'Latitude' and 'UHI Index' are dropped
            when present.
        weather_features: One row per location, joined on 'location'.
        building_features: One row per point, in the same order as
            ``train_data`` (matched by position, not by index).
        lst_data: Unused. Accepted so that call sites passing the raster arrays
            keep working; the sampled values are read from ``train_data['lst']``.
        s2_data: Unused, see ``lst_data``.

    Returns:
        DataFrame indexed 0..n-1 holding the remaining ``train_data`` columns,
        then the weather columns, then the building columns.

    Raises:
        KeyError: A required column is missing from ``train_data``.
        ValueError: ``building_features`` has a different number of rows than
            ``train_data``, or ``weather_features`` lists a location twice.
    """
    required = ['location', 'lst', 's2_features']
    missing = [col for col in required if col not in train_data.columns]
    if missing:
        raise KeyError(f"train_data is missing required column(s): {missing}")

    if len(building_features) != len(train_data):
        raise ValueError(
            f"building_features has {len(building_features)} rows, "
            f"train_data has {len(train_data)}"
        )

    # Add original features (excluding lat/lon and the target). Everything below
    # is combined by position, so start from a clean 0..n-1 index: merge() always
    # returns one, and aligning it with train_data's own index (gapped after a
    # dropna) would shift values or turn them into NaN.
    point_features = (
        train_data
        .drop(columns=['Longitude', 'Latitude', 'UHI Index'], errors='ignore')
        .reset_index(drop=True)
    )

    # Add weather features; a left join keeps the row order of the points
    features = point_features.merge(weather_features, on='location', how='left')
    if len(features) != len(point_features):
        raise ValueError("weather_features must contain at most one row per location")

    # Add building features. The satellite columns 'lst' and 's2_features' are
    # already part of point_features, so they need no separate assignment.
    features = pd.concat([features, building_features.reset_index(drop=True)], axis=1)

    return features

In [10]:
# Modeling
def train_enhanced_model(features, target):
    """Train, tune and stack three tree ensembles with PyCaret.

    Args:
        features: Feature frame handed to ``setup`` as ``data``.
        target: Either a column reference (str or int), passed through
            unchanged, or a one-dimensional sequence of target values with one
            entry per row of ``features``, matched by position.

    Returns:
        The object returned by ``tune_model`` for the stacked ensemble.

    Raises:
        ValueError: A sequence target is not one-dimensional or its length
            differs from the number of rows in ``features``.
    """
    if not isinstance(target, (str, int, np.integer)):
        target_values = np.asarray(target)
        if target_values.ndim != 1 or len(target_values) != len(features):
            raise ValueError(
                f"target must hold one value per feature row "
                f"(got shape {target_values.shape} for {len(features)} rows)"
            )
        # Re-index by position: a Series cut from a filtered frame carries that
        # frame's gapped index, which does not match the 0..n-1 index of features.
        target_name = getattr(target, 'name', None)
        target = pd.Series(
            target_values,
            index=features.index,
            name='target' if target_name is None else target_name,
        )

    # Setup with enhanced parameters
    setup(
        data=features,
        target=target,
        session_id=123,
        transformation=True,
        polynomial_features=False,
        feature_selection=True,
        remove_multicollinearity=True,
        fold=10
    )

    # Create the base models
    rf = create_model('rf')
    xgb = create_model('xgboost')
    lgb = create_model('lightgbm')

    # Tune each model
    tuned_models = [tune_model(base, optimize='R2') for base in (rf, xgb, lgb)]

    # Create stacked model; the untuned random forest serves as meta-model
    stacked = stack_models(tuned_models, meta_model=rf, optimize='R2')

    # Final tuning of stacked model
    return tune_model(stacked, optimize='R2')

In [11]:
# Prediction
def predict_uhi(model, submission_data, weather_features, building_features):
    """Predict the UHI index for every row of the submission frame.

    The submission frame only needs 'Latitude' and 'Longitude'. The columns
    'lst', 's2_features' and 'location' are derived here when absent, using the
    module-level rasters (``lst_data``/``lst_transform``,
    ``s2_data``/``s2_transform``) and ``add_location_column``.
    ``submission_data`` itself is not modified.

    Args:
        model: Trained model handed to ``predict_model``.
        submission_data: Points to predict.
        weather_features: Per-location weather features.
        building_features: Building features, one row per submission point in
            the same order.

    Returns:
        Series of predictions carrying the index of ``submission_data``.
    """
    prepared = submission_data.copy()
    coordinates = list(zip(prepared['Latitude'], prepared['Longitude']))

    # Sample the rasters exactly like the training cell does
    if 'lst' not in prepared.columns:
        prepared['lst'] = [
            get_pixel_values(lat, lon, lst_data, lst_transform) for lat, lon in coordinates
        ]
    if 's2_features' not in prepared.columns:
        prepared['s2_features'] = [
            get_pixel_values(lat, lon, s2_data, s2_transform) for lat, lon in coordinates
        ]
    if 'location' not in prepared.columns:
        prepared = add_location_column(prepared)

    # Prepare submission features (the feature builder takes the three frames only)
    submission_features = create_feature_matrix(
        prepared,
        weather_features,
        building_features
    )

    # Make predictions
    predictions = predict_model(model, data=submission_features)

    # PyCaret 3 names the output column 'prediction_label'; PyCaret 2 used 'Label'
    label_column = 'prediction_label' if 'prediction_label' in predictions.columns else 'Label'

    # Hand the values back on the caller's index so that assigning them to
    # submission_data lines up row by row
    return pd.Series(
        predictions[label_column].to_numpy(),
        index=submission_data.index,
        name=label_column,
    )

Time to Run!

In [None]:
print("Loading data...")
# Read both station sheets in one call (the result is a dict keyed by sheet name),
# then the building-footprint layer
weather_data = pd.read_excel('NY_Mesonet_Weather.xlsx', sheet_name=['Manhattan', 'Bronx'])
building_data = gpd.read_file('Building_Footprint.kml')

manhattan_weather, bronx_weather = weather_data['Manhattan'], weather_data['Bronx']

print("\nData loaded successfully")
print(
    f"Manhattan shape: {manhattan_weather.shape}",
    f"Bronx shape: {bronx_weather.shape}",
    f"Building data shape: {building_data.shape}",
    sep="\n",
)

# Weather: one aggregated feature row per location
weather_features = process_weather_data(manhattan_weather, bronx_weather)
print("\nWeather features created successfully", weather_features.head(), sep="\n")

# Buildings: one feature row per training point
building_features = engineer_building_features(building_data, train_df)
print("\nBuilding features created successfully", building_features.head(), sep="\n")

In [None]:
# Create feature matrix.
# Only the three frames are passed: the sampled raster values are already columns
# of train_df ('lst', 's2_features'), so the raster arrays are not needed here.
features = create_feature_matrix(
  add_location_column(train_df),
  weather_features,
  building_features
)

In [None]:
# Train model
print("\nTraining model...")
model = train_enhanced_model(features, train_df['UHI Index'])
print("Model training completed")

# Grab the score grid right away: pull() returns the most recently displayed
# table, so after further PyCaret calls it may no longer be the training result
training_metrics = pull()

# Make predictions on submission data
print("\nMaking predictions...")
submission_data = pd.read_csv('Submission_template_UHI2025-v2.csv')
submission_building_features = engineer_building_features(building_data, submission_data)
predictions = predict_uhi(
  model,
  submission_data,
  weather_features,
  submission_building_features
)

# Save predictions (assigned by position, independent of the index they carry)
output_path = 'submission_predictions.csv'
submission_data['UHI Index'] = np.asarray(predictions)
submission_data[['Longitude','Latitude','UHI Index']].to_csv(output_path, index=False)
# Report the file that was actually written
print(f"\nPredictions saved to '{output_path}'")

# Print model performance metrics
print("\nModel Performance Metrics:")
print(training_metrics)

# Plot feature importance.
# NOTE(review): PyCaret rejects the 'feature' plot for estimators exposing neither
# coef_ nor feature_importances_, which presumably includes the stacked model -
# confirm. The guard keeps a missing plot from failing the cell once the
# predictions are already saved.
try:
    plot_model(model, plot='feature')
except (TypeError, ValueError) as plot_error:
    print(f"Feature importance plot not available for this model: {plot_error}")