# **Combine building density with training and validation data**

In [1]:
import pandas as pd
from shapely.geometry import Point, Polygon
from shapely.wkt import loads
import geopandas as gpd

In [2]:
# Build a GeoDataFrame of grid cells from a CSV file
def load_grid_data(csv_file):
    """Read a grid CSV and return it as a GeoDataFrame in EPSG:4326.

    The CSV must contain a 'geometry_wkt' column of WKT strings. Each string
    is parsed into a shapely geometry and stored in a 'geometry' column, which
    is used as the GeoDataFrame's geometry. All other CSV columns are kept.

    Parameters
    ----------
    csv_file : path or file-like accepted by ``pd.read_csv``.

    Returns
    -------
    geopandas.GeoDataFrame
    """
    raw_table = pd.read_csv(csv_file)

    # Parse every WKT string into a shapely geometry object.
    parsed_shapes = raw_table['geometry_wkt'].apply(loads)

    return gpd.GeoDataFrame(
        raw_table.assign(geometry=parsed_shapes),
        geometry='geometry',
        crs="EPSG:4326",
    )

In [3]:
# Look up the grid density at a single longitude/latitude
# NOTE(review): the next cell defines get_density_for_point again with the same
# logic; once both cells have run, the later definition is the one in effect.
def get_density_for_point(grid_gdf, point_lon, point_lat):
    """Return the 'density' of the grid cell that contains the given point.

    Parameters
    ----------
    grid_gdf : GeoDataFrame with a geometry column and a 'density' column.
    point_lon, point_lat : coordinates of the point (x = longitude, y = latitude).

    Returns
    -------
    The 'density' value of the first cell whose geometry contains the point,
    or None when no cell contains it.
    """
    location = Point(point_lon, point_lat)

    # Keep only the grid cells whose geometry contains the point.
    hits = grid_gdf[grid_gdf.geometry.contains(location)]

    # Guard clause: the point lies outside every grid cell.
    if hits.empty:
        return None

    # If several cells match, the first one wins.
    return hits['density'].iloc[0]

In [4]:
# Look up the grid density at a single longitude/latitude
# NOTE(review): this redefines the same-named function from the previous cell
# with the same logic; this later definition is the one that stays in effect.
def get_density_for_point(grid_gdf, point_lon, point_lat):
    """Return the 'density' of the grid cell that contains the given point.

    The point is built as Point(point_lon, point_lat). If no cell geometry in
    ``grid_gdf`` contains it, None is returned; otherwise the 'density' value
    of the first containing cell is returned.
    """
    # Boolean mask over grid cells: True where the cell contains the point.
    inside_mask = grid_gdf.geometry.contains(Point(point_lon, point_lat))

    if inside_mask.any():
        # At least one cell matched; report the density of the first match.
        return grid_gdf[inside_mask]['density'].iloc[0]

    # Point is outside the grid
    return None

In [5]:
# Add density feature to a DataFrame
def add_density_to_dataframe(points_df, grid_gdf):
    """Add a 'density' column to ``points_df`` and return the same DataFrame.

    For every row, the 'Longitude' and 'Latitude' values are passed to
    ``get_density_for_point`` together with ``grid_gdf``; the results are
    stored, in row order, in a 'density' column.

    Parameters
    ----------
    points_df : DataFrame with 'Longitude' and 'Latitude' columns. It is
        modified in place (the 'density' column is added or overwritten).
    grid_gdf : grid GeoDataFrame forwarded to ``get_density_for_point``.

    Returns
    -------
    The same ``points_df`` object, now carrying the 'density' column.

    Raises
    ------
    KeyError
        If 'Longitude' or 'Latitude' is missing from ``points_df``.
    """
    # Iterate over the two coordinate columns directly instead of using
    # DataFrame.apply(axis=1): this avoids building a Series per row, and it
    # also works for an empty frame, where apply(axis=1) hands back a
    # DataFrame that cannot be assigned to a single column.
    densities = [
        get_density_for_point(grid_gdf, lon, lat)
        for lon, lat in zip(points_df['Longitude'], points_df['Latitude'])
    ]

    # Positional assignment: one value per row, in the frame's row order.
    points_df['density'] = densities
    return points_df

In [6]:
# Load the building-density grid from the Kaggle input CSV.
# load_grid_data parses the 'geometry_wkt' column and returns a GeoDataFrame
# tagged as EPSG:4326.
grid_csv = '/kaggle/input/grid-density/grid_density.csv'  
grid_gdf = load_grid_data(grid_csv)

In [7]:
# Load the training points (Longitude, Latitude, datetime, UHI Index) from CSV.
# No date parsing is requested here, so 'datetime' is left as read from the file.
ground_df = pd.read_csv("/kaggle/input/uncorrupted-training-data/Training_data_uhi_index_2025-02-18.csv")
# Preview the first few rows to inspect the data.
ground_df.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index
0,-73.909167,40.813107,24-07-2021 15:53,1.030289
1,-73.909187,40.813045,24-07-2021 15:53,1.030289
2,-73.909215,40.812978,24-07-2021 15:53,1.023798
3,-73.909242,40.812908,24-07-2021 15:53,1.023798
4,-73.909257,40.812845,24-07-2021 15:53,1.021634


In [8]:
# Look up the grid density for every training point.
# NOTE: add_density_to_dataframe writes the 'density' column into the frame it
# is given and returns that same frame, so final_df and ground_df refer to the
# same object after this cell.
final_df = add_density_to_dataframe(ground_df, grid_gdf)

# Display the resulting DataFrame
print(final_df)

       Longitude   Latitude          datetime  UHI Index  density
0     -73.909167  40.813107  24-07-2021 15:53   1.030289        8
1     -73.909187  40.813045  24-07-2021 15:53   1.030289        8
2     -73.909215  40.812978  24-07-2021 15:53   1.023798        8
3     -73.909242  40.812908  24-07-2021 15:53   1.023798        8
4     -73.909257  40.812845  24-07-2021 15:53   1.021634        8
...          ...        ...               ...        ...      ...
11224 -73.957050  40.790333  24-07-2021 15:57   0.972470        0
11225 -73.957063  40.790308  24-07-2021 15:57   0.972470        0
11226 -73.957093  40.790270  24-07-2021 15:57   0.981124        0
11227 -73.957112  40.790253  24-07-2021 15:59   0.981245        0
11228 -73.957128  40.790237  24-07-2021 15:59   0.983408        0

[11229 rows x 5 columns]


In [9]:
# Save the training points with their density to a CSV file
# (relative path; index=False leaves the row index out of the file).
output_csv = "uncorrupted_training_data_with_density.csv"
final_df.to_csv(output_csv, index=False)

# Confirm which file was written.
print(f"The DataFrame has been saved to {output_csv}")

The DataFrame has been saved to uncorrupted_training_data_with_density.csv


In [10]:
# Load the validation points (submission template) from CSV.
# The 'UHI Index' column is empty in the rows previewed below
# (presumably to be filled in with predictions later).
valid_df = pd.read_csv("/kaggle/input/validation-data/Submission_template_UHI2025-v2.csv")
# Preview the first few rows to inspect the data.
valid_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Longitude,Latitude,UHI Index
0,-73.971665,40.788763,
1,-73.971928,40.788875,
2,-73.96708,40.78908,
3,-73.97255,40.789082,
4,-73.969697,40.787953,


In [11]:
# Look up the grid density for every validation point.
# NOTE: add_density_to_dataframe writes the 'density' column into the frame it
# is given and returns that same frame, so validation_df and valid_df refer to
# the same object after this cell.
validation_df = add_density_to_dataframe(valid_df, grid_gdf)

# Display the resulting DataFrame
print(validation_df)

      Longitude   Latitude  UHI Index  density
0    -73.971665  40.788763        NaN       12
1    -73.971928  40.788875        NaN       12
2    -73.967080  40.789080        NaN        4
3    -73.972550  40.789082        NaN       12
4    -73.969697  40.787953        NaN        9
...         ...        ...        ...      ...
1035 -73.919388  40.813803        NaN       11
1036 -73.931033  40.833178        NaN       15
1037 -73.934647  40.854542        NaN       12
1038 -73.917223  40.815413        NaN        8
1039 -73.911645  40.804402        NaN        7

[1040 rows x 4 columns]


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [12]:
# Save the validation points with their density to a CSV file
# (relative path; index=False leaves the row index out of the file).
# Note: output_csv is reassigned here; it previously held the training file name.
output_csv = "validation_data_with_density.csv"
validation_df.to_csv(output_csv, index=False)

# Confirm which file was written.
print(f"The DataFrame has been saved to {output_csv}")

The DataFrame has been saved to validation_data_with_density.csv
