### Install modules in terminal

In [6]:
!pip install matplotlib seaborn numpy pandas xarray rioxarray geopandas rasterio pillow pyproj scikit-learn pystac-client planetary-computer tqdm stackstac odc.stac




[notice] A new release of pip available: 22.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip





### Import modules

In [7]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd

# Multi-dimensional arrays and datasets
import xarray as xr

# Geospatial raster data handling
import rioxarray as rxr

# Geospatial data analysis
import geopandas as gpd

# Geospatial operations
import rasterio
from rasterio import windows  
from rasterio import features  
from rasterio import warp
from rasterio.warp import transform_bounds 
from rasterio.windows import from_bounds 

# Image Processing
from PIL import Image

# Coordinate transformations
from pyproj import Proj, Transformer, CRS

# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Machine Learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Planetary Computer Tools
import pystac_client
import planetary_computer as pc
from pystac.extensions.eo import EOExtension as eo

# Others
import os
from tqdm import tqdm

# Import common GIS tools
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import rioxarray as rio
import rasterio
from matplotlib.cm import RdYlGn,jet,RdBu

# Import Planetary Computer tools
import stackstac
import pystac_client
import planetary_computer 
from odc.stac import stac_load

import config
import importlib
importlib.reload(config)

<module 'config' from 'c:\\Users\\winne\\Documents\\GitHub\\UHI\\config.py'>

#### Read dataset

###### This is just for display

In [8]:
# Read the bands from the GeoTIFF file
with rasterio.open(config.sentinel2_tiff) as src1:
    for i in range(src1.count): 
        band = src1.read(i+1)
        print(f"Band {i+1} shape: {band.shape}")

with rasterio.open(config.landsat_1_tiff) as src2:
    for i in range(src2.count):
        band = src2.read(i+1)
        print(f"Band {i+1} shape: {band.shape}")

with rasterio.open(config.landsat_2_tiff) as src3:
    for i in range(src3.count):
        band = src3.read(i+1)
        print(f"Band {i+1} shape: {band.shape}")

    
# # Plot the bands in a 2x3 grid
# fig, axes = plt.subplots(4, 3, figsize=(10, 10))

# # Flatten the axes for easier indexing
# axes = axes.flatten()

# # Plot the first band (B01)
# im1 = axes[0].imshow(band1, cmap='viridis')
# axes[0].set_title('Band [B01]')
# fig.colorbar(im1, ax=axes[0])

# # Plot the second band (B04)
# im2 = axes[1].imshow(band2, cmap='viridis')
# axes[1].set_title('Band [B04]')
# fig.colorbar(im2, ax=axes[1])

# # Plot the third band (B06)
# im3 = axes[2].imshow(band3, cmap='viridis')                 
# axes[2].set_title('Band [B06]')
# fig.colorbar(im3, ax=axes[2])

# # Plot the fourth band (B08)
# im4 = axes[3].imshow(band4, cmap='viridis')
# axes[3].set_title('Band [B08]')
# fig.colorbar(im4, ax=axes[3])

# # Plot the fifth band (NIR08)
# im5 = axes[4].imshow(band5, cmap='viridis')
# axes[4].set_title('Band [NIR08]')
# fig.colorbar(im5, ax=axes[4])

# # Plot the sixth band (Red)
# im6 = axes[5].imshow(band6, cmap='viridis')
# axes[5].set_title('Band [Red]')
# fig.colorbar(im6, ax=axes[5])

# # Plot the fifth band (LWIR11)
# im7 = axes[6].imshow(band7, cmap='viridis')
# axes[6].set_title('Band [LWIR11]')
# fig.colorbar(im7, ax=axes[6])

# # Plot the sixth band (B03)
# im8 = axes[7].imshow(band8, cmap='viridis')
# axes[7].set_title('Band [B03]')
# fig.colorbar(im8, ax=axes[7])

# # Plot the fifth band (B11)
# im9 = axes[8].imshow(band9, cmap='viridis')
# axes[8].set_title('Band [B11]')
# fig.colorbar(im9, ax=axes[8])

# plt.tight_layout()
# plt.show()

Band 1 shape: (1122, 1281)
Band 2 shape: (1122, 1281)
Band 3 shape: (1122, 1281)
Band 4 shape: (1122, 1281)
Band 5 shape: (1122, 1281)
Band 6 shape: (1122, 1281)
Band 7 shape: (1122, 1281)
Band 8 shape: (1122, 1281)
Band 9 shape: (1122, 1281)
Band 10 shape: (1122, 1281)
Band 11 shape: (1122, 1281)
Band 1 shape: (1122, 1281)
Band 2 shape: (1122, 1281)
Band 3 shape: (1122, 1281)
Band 4 shape: (1122, 1281)
Band 5 shape: (1122, 1281)
Band 6 shape: (1122, 1281)
Band 7 shape: (1122, 1281)
Band 1 shape: (1122, 1281)


#### Map data from datasets with desired coordinates

In [9]:
# Extracts satellite band values from a GeoTIFF based on coordinates from a csv file and returns them in a DataFrame.

def map_satellite_data(tiff_path, csv_path, bands):

    # Load the GeoTIFF data
    data = rxr.open_rasterio(tiff_path)
    data.close()
    tiff_crs = data.rio.crs

    # Read the Excel file using pandas
    df = pd.read_csv(csv_path)
    latitudes = df['Latitude'].values
    longitudes = df['Longitude'].values

    # 3. Convert lat/long to the GeoTIFF's CRS
    # Create a Proj object for EPSG:4326 (WGS84 - lat/long) and the GeoTIFF's CRS
    proj_wgs84 = Proj(init='epsg:4326')  # EPSG:4326 is the common lat/long CRS
    proj_tiff = Proj(tiff_crs)
    
    # Create a transformer object
    transformer = Transformer.from_proj(proj_wgs84, proj_tiff)

    with rasterio.open(tiff_path) as src:
        num_lists = src.count

    print(f"Number of bands of {tiff_path}: {num_lists}")

    bands_values = [[] for _ in range(num_lists)]


# Iterate over the latitudes and longitudes, and extract the corresponding band values
    for lat, lon in tqdm(zip(latitudes, longitudes), total=len(latitudes), desc="Mapping values for " + tiff_path):
    # Assuming the correct dimensions are 'y' and 'x' (replace these with actual names from data.coords)
        for index, band in enumerate(bands, start=1):
            band_value = data.sel(x=lon, y=lat, band=index, method="nearest").values
            bands_values[index-1].append(band_value)

    # Create a DataFrame with the band values
    # Create a DataFrame to store the band values
    df = pd.DataFrame()
    for index, band in enumerate(bands):
        df[band.upper()] = bands_values[index]
        
    return df



In [10]:
# Concatenates the data from the three satellite images into a single DataFrame.
def getConcatenatedData (csv):
    sentinel2_data = map_satellite_data(config.sentinel2_tiff, csv, config.sentinel2_bands)
    sentinel2_data.shape
    sentinel2_data.head()

    landsat_1_data = map_satellite_data(config.landsat_1_tiff, csv, config.landsat_1_bands)
    landsat_1_data.shape
    landsat_1_data.head()

    landsat_2_data = map_satellite_data(config.landsat_2_tiff, csv, config.landsat_2_bands)
    landsat_2_data.shape
    landsat_2_data.head()
    
    final_data = pd.concat([sentinel2_data, landsat_1_data, landsat_2_data], axis=1)
    final_data.head()

    return final_data


In [11]:
final_data = getConcatenatedData(config.training_data)
final_data.shape
final_data.head()

Number of bands of sentinel2.tiff: 11


Mapping values for sentinel2.tiff: 100%|██████████| 11229/11229 [01:49<00:00, 102.73it/s]


Number of bands of landsat_1.tiff: 7


Mapping values for landsat_1.tiff: 100%|██████████| 11229/11229 [01:08<00:00, 162.94it/s]


Number of bands of landsat_2.tiff: 1


Mapping values for landsat_2.tiff: 100%|██████████| 11229/11229 [00:09<00:00, 1187.34it/s]


Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,BLUE,GREEN,RED,NIR08,SWIR16,SWIR22,COASTAL,LWIR11
0,841.5,1053.0,1155.0,1206.0,1481.5,1660.5,1721.0,1832.0,1709.0,1792.0,1495.5,0.0805274999999999,0.1012624999999999,0.1067624999999999,0.203755,0.1597549999999999,0.137755,0.0688949999999999,36.606316640000045
1,841.5,646.0,823.0,777.0,1130.5,1883.0,2117.5,2241.0,2251.0,1548.0,1135.0,0.0805274999999999,0.1012624999999999,0.1067624999999999,0.203755,0.1597549999999999,0.137755,0.0688949999999999,36.606316640000045
2,841.5,625.0,766.0,741.5,1130.5,1883.0,2117.5,2200.0,2251.0,1548.0,1135.0,0.0693349999999999,0.0802525,0.0818749999999999,0.180655,0.1398725,0.101455,0.066585,36.24058850000006
3,841.5,659.5,763.0,708.5,1077.5,1783.0,2042.0,2161.0,2186.0,1617.5,1207.5,0.0693349999999999,0.0802525,0.0818749999999999,0.180655,0.1398725,0.101455,0.066585,36.24058850000006
4,841.5,659.5,763.0,708.5,1077.5,1783.0,2042.0,2161.0,2186.0,1617.5,1207.5,0.0693349999999999,0.0802525,0.0818749999999999,0.180655,0.1398725,0.101455,0.066585,36.24058850000006


In [12]:
final_data.shape

(11229, 19)

#### Process data before pass into model

In [13]:
def getAverages (lst1, lst2):
    return [(a + b) / 2 for a, b in zip(lst1, lst2)]

In [14]:
def calcIndices(data):
    # Calculate NDVI (Normalized Difference Vegetation Index) and handle division by zero by replacing infinities with NaN.
    data['NDVI_S2'] = (data['B08'] - data['B04']) / (data['B08'] + data['B04'])
    data['NDVI_S2'] = data['NDVI_S2'].replace([np.inf, -np.inf], np.nan) 

    data['NDVI_LST'] = (data['NIR08'] - data['RED']) / (data['NIR08'] + data['RED'])
    data['NDVI_LST'] = data['NDVI_LST'].replace([np.inf, -np.inf], np.nan)

    data['NDVI_S2_LST'] = getAverages((data['B08'] - data['B04']) / (data['B08'] + data['B04']), ((data['NIR08'] - data['RED']) / (data['NIR08'] + data['RED'])))
    data['NDVI_S2_LST'] = data['NDVI_S2_LST'].replace([np.inf, -np.inf], np.nan)
    data['NDWI_S2'] = (data['B03'] - data['B08']) / (data['B03'] + data['B08'])
    data['NDWI_S2'] = data['NDWI_S2'].replace([np.inf, -np.inf], np.nan)
    data['NDBI_S2'] = (data['B11'] - data['B08']) / (data['B11'] + data['B08'])
    data['NDBI_S2'] = data['NDBI_S2'].replace([np.inf, -np.inf], np.nan)


    data['MSI_LST'] = (data['SWIR16'] / data['NIR08'])
    data['MSI_LST'] = data['MSI_LST'].replace([np.inf, -np.inf], np.nan)

    return data


In [15]:
data_with_indices = calcIndices(final_data)

In [16]:
# Combine two datasets vertically (along columns) using pandas concat function.
def combine_two_datasets(dataset1,dataset2):
    data = pd.concat([dataset1,dataset2], axis=1)
    return data

In [17]:
# Combining ground data and final data into a single dataset.
uhi_data = combine_two_datasets(config.training_df,final_data)
uhi_data.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,B01,B02,B03,B04,B05,B06,...,SWIR16,SWIR22,COASTAL,LWIR11,NDVI_S2,NDVI_LST,NDVI_S2_LST,NDWI_S2,NDBI_S2,MSI_LST
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,841.5,1053.0,1155.0,1206.0,1481.5,1660.5,...,0.1597549999999999,0.137755,0.0688949999999999,36.606316640000045,0.206057,0.312358,0.259207,-0.226649,-0.011038,0.784054
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,841.5,646.0,823.0,777.0,1130.5,1883.0,...,0.1597549999999999,0.137755,0.0688949999999999,36.606316640000045,0.485089,0.312358,0.398724,-0.462794,-0.182898,0.784054
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,841.5,625.0,766.0,741.5,1130.5,1883.0,...,0.1398725,0.101455,0.066585,36.24058850000006,0.495835,0.376262,0.436049,-0.483479,-0.173959,0.774252
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,841.5,659.5,763.0,708.5,1077.5,1783.0,...,0.1398725,0.101455,0.066585,36.24058850000006,0.506186,0.376262,0.441224,-0.478112,-0.14384,0.774252
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,841.5,659.5,763.0,708.5,1077.5,1783.0,...,0.1398725,0.101455,0.066585,36.24058850000006,0.506186,0.376262,0.441224,-0.478112,-0.14384,0.774252


In [18]:
# Remove duplicate rows from the DataFrame based on specified columns and keep the first occurrence
columns_to_check = ['B01','B04','B06','B08','LWIR11']
for col in columns_to_check:
    # Check if the value is a numpy array and has more than one dimension
    uhi_data[col] = uhi_data[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)

# Now remove duplicates
uhi_data = uhi_data.drop_duplicates(subset=columns_to_check, keep='first')
uhi_data.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,B01,B02,B03,B04,B05,B06,...,SWIR16,SWIR22,COASTAL,LWIR11,NDVI_S2,NDVI_LST,NDVI_S2_LST,NDWI_S2,NDBI_S2,MSI_LST
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,841.5,1053.0,1155.0,1206.0,1481.5,1660.5,...,0.1597549999999999,0.137755,0.0688949999999999,36.606317,0.206057,0.312358,0.259207,-0.226649,-0.011038,0.784054
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,841.5,646.0,823.0,777.0,1130.5,1883.0,...,0.1597549999999999,0.137755,0.0688949999999999,36.606317,0.485089,0.312358,0.398724,-0.462794,-0.182898,0.784054
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,841.5,625.0,766.0,741.5,1130.5,1883.0,...,0.1398725,0.101455,0.066585,36.240589,0.495835,0.376262,0.436049,-0.483479,-0.173959,0.774252
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,841.5,659.5,763.0,708.5,1077.5,1783.0,...,0.1398725,0.101455,0.066585,36.240589,0.506186,0.376262,0.441224,-0.478112,-0.14384,0.774252
5,-73.90928,40.812777,24-07-2021 15:53,1.021634,841.5,551.5,768.5,659.0,1077.5,1783.0,...,0.1596725,0.1134999999999999,0.0486824999999999,36.062851,0.579048,0.574442,0.576745,-0.52569,-0.20895,0.620244


In [19]:
# Resetting the index of the dataset
uhi_data=uhi_data.reset_index(drop=True)

#### Model Building

In [20]:
uhi_data = uhi_data[['B01','B02','B03','B04','B05','B06','B07','B08','B8A','B11','B12','NIR08','RED','GREEN','BLUE','SWIR16','SWIR22','COASTAL','LWIR11','NDVI_S2_LST','NDWI_S2','NDBI_S2','MSI_LST','UHI Index']]
uhi_data.head()

Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,...,BLUE,SWIR16,SWIR22,COASTAL,LWIR11,NDVI_S2_LST,NDWI_S2,NDBI_S2,MSI_LST,UHI Index
0,841.5,1053.0,1155.0,1206.0,1481.5,1660.5,1721.0,1832.0,1709.0,1792.0,...,0.0805274999999999,0.1597549999999999,0.137755,0.0688949999999999,36.606317,0.259207,-0.226649,-0.011038,0.784054,1.030289
1,841.5,646.0,823.0,777.0,1130.5,1883.0,2117.5,2241.0,2251.0,1548.0,...,0.0805274999999999,0.1597549999999999,0.137755,0.0688949999999999,36.606317,0.398724,-0.462794,-0.182898,0.784054,1.030289
2,841.5,625.0,766.0,741.5,1130.5,1883.0,2117.5,2200.0,2251.0,1548.0,...,0.0693349999999999,0.1398725,0.101455,0.066585,36.240589,0.436049,-0.483479,-0.173959,0.774252,1.023798
3,841.5,659.5,763.0,708.5,1077.5,1783.0,2042.0,2161.0,2186.0,1617.5,...,0.0693349999999999,0.1398725,0.101455,0.066585,36.240589,0.441224,-0.478112,-0.14384,0.774252,1.023798
4,841.5,551.5,768.5,659.0,1077.5,1783.0,2042.0,2472.0,2186.0,1617.5,...,0.0553099999999999,0.1596725,0.1134999999999999,0.0486824999999999,36.062851,0.576745,-0.52569,-0.20895,0.620244,1.021634


In [21]:
X = uhi_data.drop(columns=['UHI Index']).values
y = uhi_data ['UHI Index'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=123)

In [22]:
# Scale the training and test data using standardscaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [23]:
# Train the Random Forest model on the training data
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train,y_train)

In [24]:
# Make predictions on the training data
insample_predictions = model.predict(X_train)

In [25]:
# calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.9378857425329744

In [26]:
# Make predictions on the test data
outsample_predictions = model.predict(X_test)

In [27]:
# calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.5761503875153074

In [28]:
#Reading the coordinates for the submission
test_file = pd.read_csv(config.submission_template)
test_file.head()

Unnamed: 0,Longitude,Latitude,UHI Index
0,-73.971665,40.788763,
1,-73.971928,40.788875,
2,-73.96708,40.78908,
3,-73.97255,40.789082,
4,-73.969697,40.787953,


In [29]:
val_data = getConcatenatedData(config.submission_template)

Number of bands of sentinel2.tiff: 11


Mapping values for sentinel2.tiff:   0%|          | 0/1040 [00:00<?, ?it/s]

Mapping values for sentinel2.tiff: 100%|██████████| 1040/1040 [00:10<00:00, 100.59it/s]


Number of bands of landsat_1.tiff: 7


Mapping values for landsat_1.tiff: 100%|██████████| 1040/1040 [00:06<00:00, 163.56it/s]


Number of bands of landsat_2.tiff: 1


Mapping values for landsat_2.tiff: 100%|██████████| 1040/1040 [00:00<00:00, 1221.04it/s]


In [30]:
val_data.head()

Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,BLUE,GREEN,RED,NIR08,SWIR16,SWIR22,COASTAL,LWIR11
0,805.5,524.0,723.0,517.5,995.0,2207.5,2541.5,2821.0,2697.5,1654.0,1031.0,0.0882,0.109155,0.113115,0.1687199999999999,0.1603049999999999,0.1183949999999999,0.079125,32.614069280000024
1,1214.5,728.0,849.5,733.5,1288.0,2617.0,2961.5,3073.0,3023.5,2116.5,1481.0,0.0882,0.109155,0.113115,0.1687199999999999,0.1603049999999999,0.1183949999999999,0.079125,32.614069280000024
2,1061.0,1110.0,1185.0,1340.0,1329.5,1369.5,1430.0,1626.0,1476.5,1730.5,1665.0,0.0262699999999999,0.0456574999999999,0.0244274999999999,0.4292275,0.1656125,0.0707375,0.0265449999999999,26.923065980000047
3,1191.0,1205.0,1458.0,1699.0,1918.5,1958.5,2023.0,1920.0,2056.0,2393.5,2227.5,0.1383875,0.1662449999999999,0.1855224999999999,0.2467925,0.242145,0.23076,0.128295,35.25961676000003
4,1394.0,1837.0,1949.0,1869.0,1769.5,2288.5,2512.5,2741.0,2632.0,2409.0,2004.5,0.047995,0.07027,0.0631749999999999,0.3162025,0.142705,0.0856974999999999,0.0442275,27.312720259999992


In [31]:
val_data = calcIndices(val_data)

In [32]:
val_data.head()

Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,...,SWIR16,SWIR22,COASTAL,LWIR11,NDVI_S2,NDVI_LST,NDVI_S2_LST,NDWI_S2,NDBI_S2,MSI_LST
0,805.5,524.0,723.0,517.5,995.0,2207.5,2541.5,2821.0,2697.5,1654.0,...,0.1603049999999999,0.1183949999999999,0.079125,32.614069280000024,0.689981,0.197296,0.443638,-0.591986,-0.260782,0.950124
1,1214.5,728.0,849.5,733.5,1288.0,2617.0,2961.5,3073.0,3023.5,2116.5,...,0.1603049999999999,0.1183949999999999,0.079125,32.614069280000024,0.614607,0.197296,0.405951,-0.566858,-0.184314,0.950124
2,1061.0,1110.0,1185.0,1340.0,1329.5,1369.5,1430.0,1626.0,1476.5,1730.5,...,0.1656125,0.0707375,0.0265449999999999,26.923065980000047,0.096426,0.892308,0.494367,-0.156884,0.031134,0.385839
3,1191.0,1205.0,1458.0,1699.0,1918.5,1958.5,2023.0,1920.0,2056.0,2393.5,...,0.242145,0.23076,0.128295,35.25961676000003,0.061067,0.141725,0.101396,-0.136767,0.109772,0.981168
4,1394.0,1837.0,1949.0,1869.0,1769.5,2288.5,2512.5,2741.0,2632.0,2409.0,...,0.142705,0.0856974999999999,0.0442275,27.312720259999992,0.189154,0.666954,0.428054,-0.16887,-0.064466,0.451309


In [33]:
submission_val_data=val_data.loc[:,['B01','B02','B03','B04','B05','B06','B07','B08','B8A','B11','B12','NIR08','RED','GREEN','BLUE','SWIR16','SWIR22','COASTAL','LWIR11','NDVI_S2_LST','NDWI_S2','NDBI_S2','MSI_LST']]
submission_val_data.head()

Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,...,GREEN,BLUE,SWIR16,SWIR22,COASTAL,LWIR11,NDVI_S2_LST,NDWI_S2,NDBI_S2,MSI_LST
0,805.5,524.0,723.0,517.5,995.0,2207.5,2541.5,2821.0,2697.5,1654.0,...,0.109155,0.0882,0.1603049999999999,0.1183949999999999,0.079125,32.614069280000024,0.443638,-0.591986,-0.260782,0.950124
1,1214.5,728.0,849.5,733.5,1288.0,2617.0,2961.5,3073.0,3023.5,2116.5,...,0.109155,0.0882,0.1603049999999999,0.1183949999999999,0.079125,32.614069280000024,0.405951,-0.566858,-0.184314,0.950124
2,1061.0,1110.0,1185.0,1340.0,1329.5,1369.5,1430.0,1626.0,1476.5,1730.5,...,0.0456574999999999,0.0262699999999999,0.1656125,0.0707375,0.0265449999999999,26.923065980000047,0.494367,-0.156884,0.031134,0.385839
3,1191.0,1205.0,1458.0,1699.0,1918.5,1958.5,2023.0,1920.0,2056.0,2393.5,...,0.1662449999999999,0.1383875,0.242145,0.23076,0.128295,35.25961676000003,0.101396,-0.136767,0.109772,0.981168
4,1394.0,1837.0,1949.0,1869.0,1769.5,2288.5,2512.5,2741.0,2632.0,2409.0,...,0.07027,0.047995,0.142705,0.0856974999999999,0.0442275,27.312720259999992,0.428054,-0.16887,-0.064466,0.451309


In [34]:
submission_val_data = submission_val_data.values
transformed_submission_data = sc.transform(submission_val_data)

In [35]:
final_predictions = model.predict(transformed_submission_data)
final_prediction_series = pd.Series(final_predictions)

In [36]:
submission_df = pd.DataFrame({'Longitude':test_file['Longitude'].values, 'Latitude':test_file['Latitude'].values, 'UHI Index':final_prediction_series.values})

In [37]:
#Displaying the submission dataframe
print(submission_df)

      Longitude   Latitude  UHI Index
0    -73.971665  40.788763   0.981763
1    -73.971928  40.788875   0.977879
2    -73.967080  40.789080   0.979470
3    -73.972550  40.789082   0.971036
4    -73.969697  40.787953   0.987416
...         ...        ...        ...
1035 -73.919388  40.813803   1.024824
1036 -73.931033  40.833178   1.016706
1037 -73.934647  40.854542   0.990229
1038 -73.917223  40.815413   1.008778
1039 -73.911645  40.804402   1.028554

[1040 rows x 3 columns]


In [38]:
submission_df.to_csv("submission.csv",index = False)