In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/buliding-density/validation_data_with_density.csv
/kaggle/input/buliding-density/training_data_with_density.csv
/kaggle/input/satellite-data/validation_data_with_satellite.csv
/kaggle/input/satellite-data/training_data_with_satellite.csv


In [29]:
# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Machine Learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor
import h2o
from h2o.automl import H2OAutoML

# Combine Datasets

In [30]:
train_density = pd.read_csv("/kaggle/input/buliding-density/training_data_with_density.csv")
train_density.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,density
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,8
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,8
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,8
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,8
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,8


In [31]:
train_satellite = pd.read_csv("/kaggle/input/satellite-data/training_data_with_satellite.csv")
train_satellite.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI,LST
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,825.0,990.0,1144.0,1172.0,1435.0,1688.0,1688.0,1830.0,1688.0,1788.0,1540.0,0.219187,-0.011609,-0.230666,38.393941
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,825.0,990.0,1144.0,1172.0,1435.0,1688.0,1688.0,1830.0,1688.0,1788.0,1540.0,0.219187,-0.011609,-0.230666,38.393941
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,825.0,622.0,785.0,744.0,1114.0,2006.0,2138.0,2318.0,2353.0,1566.0,1170.0,0.514043,-0.193615,-0.494038,37.785534
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,825.0,619.0,750.0,738.0,1114.0,2006.0,2138.0,2302.0,2353.0,1566.0,1170.0,0.514474,-0.190279,-0.508519,37.785534
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,825.0,556.0,758.0,660.0,1056.0,1891.0,2141.0,2280.0,2259.0,1658.0,1240.0,0.55102,-0.157948,-0.500987,37.358281


In [32]:
train_satellite = train_satellite.drop(["Longitude","Latitude","datetime","UHI Index"], axis=1)
train_concat = pd.concat([train_density, train_satellite], axis = 1)
train_concat.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,density,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI,LST
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,8,825.0,990.0,1144.0,1172.0,1435.0,1688.0,1688.0,1830.0,1688.0,1788.0,1540.0,0.219187,-0.011609,-0.230666,38.393941
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,8,825.0,990.0,1144.0,1172.0,1435.0,1688.0,1688.0,1830.0,1688.0,1788.0,1540.0,0.219187,-0.011609,-0.230666,38.393941
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,8,825.0,622.0,785.0,744.0,1114.0,2006.0,2138.0,2318.0,2353.0,1566.0,1170.0,0.514043,-0.193615,-0.494038,37.785534
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,8,825.0,619.0,750.0,738.0,1114.0,2006.0,2138.0,2302.0,2353.0,1566.0,1170.0,0.514474,-0.190279,-0.508519,37.785534
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,8,825.0,556.0,758.0,660.0,1056.0,1891.0,2141.0,2280.0,2259.0,1658.0,1240.0,0.55102,-0.157948,-0.500987,37.358281


In [33]:
validation_density = pd.read_csv("/kaggle/input/buliding-density/validation_data_with_density.csv")
validation_density.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Longitude,Latitude,UHI Index,density
0,-73.971665,40.788763,,12
1,-73.971928,40.788875,,12
2,-73.96708,40.78908,,4
3,-73.97255,40.789082,,12
4,-73.969697,40.787953,,9


In [34]:
validation_satellite = pd.read_csv("/kaggle/input/satellite-data/validation_data_with_satellite.csv")
validation_satellite.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Longitude,Latitude,UHI Index,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI,LST
0,-73.971665,40.788763,,794.0,511.0,568.0,527.0,1112.0,2323.0,2649.0,2360.0,2784.0,1741.0,1218.0,0.634915,-0.150939,-0.612022,36.20299
1,-73.971928,40.788875,,1221.0,494.0,661.0,497.0,1315.0,2652.0,2964.0,3662.0,3102.0,2124.0,1541.0,0.761,-0.265814,-0.694194,36.20299
2,-73.96708,40.78908,,1049.0,841.0,968.0,1056.0,1028.0,1418.0,1610.0,1402.0,1636.0,1634.0,1198.0,0.140765,0.076416,-0.183122,36.069687
3,-73.97255,40.789082,,1189.0,903.0,984.0,1108.0,1971.0,1978.0,1970.0,1478.0,2049.0,2303.0,2219.0,0.143078,0.218196,-0.20065,36.886594
4,-73.969697,40.787953,,1018.0,716.0,1046.0,917.0,1286.0,2382.0,2778.0,3038.0,2912.0,2102.0,1705.0,0.536283,-0.182101,-0.487757,34.500816


In [35]:
validation_satellite = validation_satellite.drop(["Longitude","Latitude","UHI Index"], axis=1)
validation_concat = pd.concat([validation_density, validation_satellite], axis=1)
validation_concat.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Longitude,Latitude,UHI Index,density,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI,LST
0,-73.971665,40.788763,,12,794.0,511.0,568.0,527.0,1112.0,2323.0,2649.0,2360.0,2784.0,1741.0,1218.0,0.634915,-0.150939,-0.612022,36.20299
1,-73.971928,40.788875,,12,1221.0,494.0,661.0,497.0,1315.0,2652.0,2964.0,3662.0,3102.0,2124.0,1541.0,0.761,-0.265814,-0.694194,36.20299
2,-73.96708,40.78908,,4,1049.0,841.0,968.0,1056.0,1028.0,1418.0,1610.0,1402.0,1636.0,1634.0,1198.0,0.140765,0.076416,-0.183122,36.069687
3,-73.97255,40.789082,,12,1189.0,903.0,984.0,1108.0,1971.0,1978.0,1970.0,1478.0,2049.0,2303.0,2219.0,0.143078,0.218196,-0.20065,36.886594
4,-73.969697,40.787953,,9,1018.0,716.0,1046.0,917.0,1286.0,2382.0,2778.0,3038.0,2912.0,2102.0,1705.0,0.536283,-0.182101,-0.487757,34.500816


# Select features 

In [36]:
features = ['B01','B06','NDVI','NDBI','NDWI','LST','density']
#features = ['B01','B02','B03','B04','B05','B06','B07','B08','B8A','B11','B12','NDVI','NDBI','NDWI','LST','density']
train_df = train_concat[features + ["UHI Index"]]
train_df

Unnamed: 0,B01,B06,NDVI,NDBI,NDWI,LST,density,UHI Index
0,825.0,1688.0,0.219187,-0.011609,-0.230666,38.393941,8,1.030289
1,825.0,1688.0,0.219187,-0.011609,-0.230666,38.393941,8,1.030289
2,825.0,2006.0,0.514043,-0.193615,-0.494038,37.785534,8,1.023798
3,825.0,2006.0,0.514474,-0.190279,-0.508519,37.785534,8,1.023798
4,825.0,1891.0,0.551020,-0.157948,-0.500987,37.358281,8,1.021634
...,...,...,...,...,...,...,...,...
11224,432.0,2545.0,0.727473,-0.241216,-0.628594,30.293234,0,0.972470
11225,432.0,2545.0,0.766208,-0.303371,-0.659820,30.293234,0,0.972470
11226,432.0,2545.0,0.766208,-0.303371,-0.659820,30.440209,0,0.981124
11227,432.0,2545.0,0.766208,-0.303371,-0.659820,30.440209,0,0.981245


# Remove duplicates from training data

In [37]:
# Remove duplicate rows from the DataFrame based on specified columns and keep the first occurrence
for col in features:
    # Check if the value is a numpy array and has more than one dimension
    train_df[col] = train_df[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)

# Now remove duplicates
uhi_data = train_df.drop_duplicates(subset=features, keep='first')
uhi_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[col] = train_df[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)


Unnamed: 0,B01,B06,NDVI,NDBI,NDWI,LST,density,UHI Index
0,825.0,1688.0,0.219187,-0.011609,-0.230666,38.393941,8,1.030289
2,825.0,2006.0,0.514043,-0.193615,-0.494038,37.785534,8,1.023798
3,825.0,2006.0,0.514474,-0.190279,-0.508519,37.785534,8,1.023798
4,825.0,1891.0,0.55102,-0.157948,-0.500987,37.358281,8,1.021634
6,825.0,1891.0,0.612442,-0.219765,-0.553026,37.358281,8,1.015143


In [38]:
uhi_data.shape

(9054, 8)

In [39]:
# Resetting the index of the dataset
uhi_data=uhi_data.reset_index(drop=True)

In [40]:
uhi_data.isna().sum()

B01          0
B06          0
NDVI         0
NDBI         0
NDWI         0
LST          0
density      0
UHI Index    0
dtype: int64

# Model Building

In [41]:
# Split the data into features (X) and target (y), and then into training and testing sets
X = uhi_data.drop(columns=['UHI Index']).values
y = uhi_data ['UHI Index'].values
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

**Feature Scaling**

In [42]:
# Scale the training and test data using standardscaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_valid = sc.transform(X_valid)

**Model Training**

In [43]:
# Initialize H2O
h2o.init()

# Convert to H2OFrame
train_h2o = h2o.H2OFrame(pd.concat([pd.DataFrame(X_train), pd.Series(y_train, name="UHI Index")], axis=1))
test_h2o = h2o.H2OFrame(pd.concat([pd.DataFrame(X_valid), pd.Series(y_valid, name="UHI Index")], axis=1))

# Train AutoML with tree-based models
aml = H2OAutoML(max_models=100, seed=42)
aml.train(y="UHI Index", training_frame=train_h2o)

# Evaluate
perf = aml.leader.model_performance(test_h2o)
print("H2O AutoML MSE:", perf.mse())

# Leaderboard
print(aml.leaderboard)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,2 mins 37 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,3 months and 1 day
H2O_cluster_name:,H2O_from_python_unknownUser_ia4968
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.498 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
H2O AutoML MSE: 0.0001145697632586859
model_id                                 rmse          mse         mae       rmsle    mean_residual_deviance
XGBoost_1_AutoML_2_20250203_210234  0.0120961  0.000146316  0.00933792  0.00604708               0.000146316
[1 row x 6 columns]



**Model Evaluation**

In [44]:
# Convert validation data to H2OFrame
X_valid_h2o = h2o.H2OFrame(pd.DataFrame(X_valid))

# Make predictions using the leader model
y_pred_h2o = aml.leader.predict(X_valid_h2o)

# Convert predictions to a Pandas DataFrame (optional)
y_pred = y_pred_h2o.as_data_frame()["predict"].tolist()

# Convert true values to list (if not already done)
Y_valid = y_valid.tolist()

# Calculate R² score
print("R² Score:", r2_score(Y_valid, y_pred))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
R² Score: 0.5572675063153549





# Submission

In [45]:
validation_concat.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Longitude,Latitude,UHI Index,density,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,NDVI,NDBI,NDWI,LST
0,-73.971665,40.788763,,12,794.0,511.0,568.0,527.0,1112.0,2323.0,2649.0,2360.0,2784.0,1741.0,1218.0,0.634915,-0.150939,-0.612022,36.20299
1,-73.971928,40.788875,,12,1221.0,494.0,661.0,497.0,1315.0,2652.0,2964.0,3662.0,3102.0,2124.0,1541.0,0.761,-0.265814,-0.694194,36.20299
2,-73.96708,40.78908,,4,1049.0,841.0,968.0,1056.0,1028.0,1418.0,1610.0,1402.0,1636.0,1634.0,1198.0,0.140765,0.076416,-0.183122,36.069687
3,-73.97255,40.789082,,12,1189.0,903.0,984.0,1108.0,1971.0,1978.0,1970.0,1478.0,2049.0,2303.0,2219.0,0.143078,0.218196,-0.20065,36.886594
4,-73.969697,40.787953,,9,1018.0,716.0,1046.0,917.0,1286.0,2382.0,2778.0,3038.0,2912.0,2102.0,1705.0,0.536283,-0.182101,-0.487757,34.500816


In [46]:
submission_val_data = validation_concat[features]
submission_val_data.head()

Unnamed: 0,B01,B06,NDVI,NDBI,NDWI,LST,density
0,794.0,2323.0,0.634915,-0.150939,-0.612022,36.20299,12
1,1221.0,2652.0,0.761,-0.265814,-0.694194,36.20299,12
2,1049.0,1418.0,0.140765,0.076416,-0.183122,36.069687,4
3,1189.0,1978.0,0.143078,0.218196,-0.20065,36.886594,12
4,1018.0,2382.0,0.536283,-0.182101,-0.487757,34.500816,9


In [47]:
# Feature Scaling 
submission_val_data = submission_val_data.values
transformed_submission_data = sc.transform(submission_val_data)

In [48]:
# Convert transformed_submission_data to H2OFrame
transformed_submission_h2o = h2o.H2OFrame(pd.DataFrame(transformed_submission_data))

# Make predictions using the leader model
final_predictions_h2o = aml.leader.predict(transformed_submission_h2o)

# Convert the predictions to a Pandas Series
final_prediction_series = pd.Series(final_predictions_h2o.as_data_frame()["predict"])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%





In [49]:
#Combining the results into dataframe
submission_df = pd.DataFrame({'Longitude':validation_concat['Longitude'].values, 'Latitude':validation_concat['Latitude'].values, 'UHI Index':final_prediction_series.values})

In [50]:
#Displaying the sample submission dataframe
submission_df.head()

Unnamed: 0,Longitude,Latitude,UHI Index
0,-73.971665,40.788763,0.965341
1,-73.971928,40.788875,0.964271
2,-73.96708,40.78908,0.970355
3,-73.97255,40.789082,0.976976
4,-73.969697,40.787953,0.964062


In [51]:
#Dumping the predictions into a csv file.
submission_df.to_csv("submission.csv",index = False)