In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install AutoGluon into the running session (no version is pinned here).
!pip install autogluon

In [None]:
# Pin scikit-learn to 1.4.0; this runs after the AutoGluon install above.
# NOTE(review): presumably required for compatibility with the installed AutoGluon — confirm.
!pip install scikit-learn==1.4.0

In [None]:
# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Machine Learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor
from autogluon.tabular import TabularPredictor

# Combine Datasets

In [None]:
# Training table from the density dataset.
# NOTE: "buliding" is spelled exactly as in the input path — presumably the dataset slug; verify before changing.
train_density_csv = "/kaggle/input/buliding-density/training_data_with_density.csv"
train_density = pd.read_csv(train_density_csv)
train_density.head()

In [None]:
# Training table from the satellite dataset.
train_satellite_csv = "/kaggle/input/satellite-data/training_data_with_satellite.csv"
train_satellite = pd.read_csv(train_satellite_csv)
train_satellite.head()

In [None]:
# Drop the columns that must not be duplicated in the combined frame
# (presumably the density table already carries them — "UHI Index" is later read
# from train_concat). The result gets its own name so train_satellite stays
# intact and this cell can be re-run without a KeyError.
train_satellite_features = train_satellite.drop(
    columns=["Longitude", "Latitude", "datetime", "UHI Index"]
)

# pd.concat(axis=1) aligns rows on the index; with different row counts it would
# silently pad the shorter table with NaN instead of failing.
assert len(train_density) == len(train_satellite_features), (
    "density and satellite training tables must have the same number of rows"
)

train_concat = pd.concat([train_density, train_satellite_features], axis=1)
train_concat.head()

In [None]:
# Validation table from the density dataset.
# NOTE: "buliding" is spelled exactly as in the input path — presumably the dataset slug; verify before changing.
validation_density_csv = "/kaggle/input/buliding-density/validation_data_with_density.csv"
validation_density = pd.read_csv(validation_density_csv)
validation_density.head()

In [None]:
# Validation table from the satellite dataset.
validation_satellite_csv = "/kaggle/input/satellite-data/validation_data_with_satellite.csv"
validation_satellite = pd.read_csv(validation_satellite_csv)
validation_satellite.head()

In [None]:
# Drop the columns that must not be duplicated in the combined frame
# ("Longitude" and "Latitude" are later read from validation_concat, so they
# come from the density table). The result gets its own name so
# validation_satellite stays intact and this cell can be re-run without a KeyError.
validation_satellite_features = validation_satellite.drop(
    columns=["Longitude", "Latitude", "UHI Index"]
)

# pd.concat(axis=1) aligns rows on the index; with different row counts it would
# silently pad the shorter table with NaN instead of failing.
assert len(validation_density) == len(validation_satellite_features), (
    "density and satellite validation tables must have the same number of rows"
)

validation_concat = pd.concat([validation_density, validation_satellite_features], axis=1)
validation_concat.head()

# Select features 

In [None]:
# Columns used as model inputs; the commented line below is the full candidate list.
features = ['B01','B06','NDVI','NDBI','NDWI','LST','density']
#features = ['B01','B02','B03','B04','B05','B06','B07','B08','B8A','B11','B12','NDVI','NDBI','NDWI','LST','density']

# .copy() gives train_df its own data: later cells assign into its columns, which
# on a plain column selection of train_concat triggers SettingWithCopyWarning.
train_df = train_concat[features + ["UHI Index"]].copy()
train_df

# Remove duplicates from training data

In [None]:
# Remove rows that repeat the same feature values, keeping the first occurrence.
# drop_duplicates needs hashable cell values, so any numpy array with at least one
# dimension stored in a feature cell is converted to a tuple; other values pass through.
# The converted columns are attached to a new frame via assign() instead of being
# written back into train_df, which avoids SettingWithCopyWarning and leaves
# train_df untouched when the cell is re-run.
hashable_features = {
    col: train_df[col].apply(
        lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x
    )
    for col in features
}

# Now remove duplicates
uhi_data = train_df.assign(**hashable_features).drop_duplicates(subset=features, keep='first')
uhi_data.head()

In [None]:
# (rows, columns) remaining after duplicate removal
uhi_data.shape

In [None]:
# Replace the index left over from drop_duplicates with a fresh 0..n-1 index;
# drop=True discards the old index instead of keeping it as a column.
uhi_data=uhi_data.reset_index(drop=True)

In [None]:
# Count of missing values per column
uhi_data.isna().sum()

# Model Building

In [None]:
# Separate the inputs (every column except the target) from the target, then
# hold out 20% of the rows for validation with a fixed seed.
target_column = 'UHI Index'
X = uhi_data.drop(columns=[target_column]).values
y = uhi_data[target_column].values
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Feature Scaling**

In [None]:
# Standardize the features: the scaler is fitted on the training split only and
# then applied to both splits.
# NOTE: X_train / X_valid are rebound to their scaled versions, so re-running this
# cell without re-running the split refits `sc` on already-scaled data.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_valid = sc.transform(X_valid)

**Model Training**

In [None]:
# AutoGluon reads the label from a column of the frame, so wrap the scaled arrays
# in DataFrames (feature columns get positional integer names) and append the target.
label_column = "UHI Index"

train_data = pd.DataFrame(X_train)
train_data[label_column] = y_train

test_data = pd.DataFrame(X_valid)
test_data[label_column] = y_valid

# Fit a bagged (5 folds), stacked (6 levels) regression ensemble using the 'best'
# preset. No model-specific hyperparameters are passed here.
predictor = TabularPredictor(
    label=label_column,
    problem_type="regression",
)
predictor = predictor.fit(
    train_data,
    presets='best',
    num_bag_folds=5,
    num_stack_levels=6,
)

# Score the fitted predictor on the held-out split
results = predictor.evaluate(test_data)
print("AutoGluon Ensemble Results:", results)

# Show the per-model leaderboard with the extra info columns
print("AutoGluon Model Architecture:")
predictor.leaderboard(extra_info=True)

**Model Evaluation**

In [None]:
# Predict on the held-out features and report R² against the true targets.
valid_frame = pd.DataFrame(X_valid)
y_pred = predictor.predict(valid_frame)
Y_valid = y_valid.tolist()
r2_valid = r2_score(Y_valid, y_pred)
print("R² Score:", r2_valid)

# Submission

In [None]:
# Preview the combined validation table that the submission is built from
validation_concat.head()

In [None]:
# Keep only the model input columns, in the same order as in `features`.
submission_val_data = validation_concat.loc[:, features]
submission_val_data.head()

In [None]:
# Feature Scaling
# Apply the scaler that was fitted on the training split. The raw array is kept
# under its own name instead of rebinding submission_val_data, so re-running
# this cell does not fail on an ndarray that has no `.values`.
submission_features = submission_val_data.to_numpy()
transformed_submission_data = sc.transform(submission_features)

In [None]:
# Predict on the scaled submission features, wrapped in a DataFrame the same
# way as the training and validation inputs.
submission_frame = pd.DataFrame(transformed_submission_data)
final_predictions = predictor.predict(submission_frame)
final_prediction_series = pd.Series(final_predictions)

In [None]:
# Assemble the submission table: coordinates from the validation data plus the
# predicted target. `.values` strips the indexes so rows are matched by position.
submission_columns = {
    'Longitude': validation_concat['Longitude'].values,
    'Latitude': validation_concat['Latitude'].values,
    'UHI Index': final_prediction_series.values,
}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Preview the first rows of the submission dataframe
submission_df.head()

In [None]:
# Write the predictions to submission.csv in the current working directory;
# index=False keeps the row index out of the file.
submission_df.to_csv("submission.csv",index = False)