In [None]:
import copy
import os
import warnings

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from srai.datasets import AirbnbMulticityDataset
from srai.embedders import Hex2VecEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders.osm_loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import HEX2VEC_FILTER
from srai.neighbourhoods.h3_neighbourhood import H3Neighbourhood
from srai.plotting import plot_regions
from srai.regionalizers import H3Regionalizer

In [None]:
# Load the `dotenv` IPython extension and run its magic so that variables from a
# local .env file are exported into the environment (presumably including the
# HF_TOKEN that the dataset cell reads via os.getenv — confirm your .env defines it).
%load_ext dotenv

%dotenv

### Hyperparameters

In [None]:
# H3 resolution handed to H3Regionalizer.
resolution = 9
# Layer sizes handed to Hex2VecEmbedder (presumably the last entry is the embedding width — TODO confirm).
embedder_hidden_sizes = [150, 100, 50]
# Forwarded as `max_epochs` in the embedder's trainer_kwargs.
max_epochs_embedder = 10
# Forwarded as `batch_size` to Hex2VecEmbedder.fit_transform.
batch_size_embedder = 100

### Dataset load

In [None]:
# Token comes from the environment; os.getenv returns None when HF_TOKEN is not set.
hf_token = os.getenv("HF_TOKEN")
airbnb_multicity = AirbnbMulticityDataset()
airbnb_multicity_gdf = airbnb_multicity.load(hf_token=hf_token)
# NOTE: despite its name, `gdf_paris` keeps the listings of BOTH Paris and Rotterdam.
# Extend the list below (e.g. with "brussels") to include more cities.
gdf_paris = airbnb_multicity_gdf.loc[
    airbnb_multicity_gdf["city"].isin(["paris", "rotterdam"])
]

In [None]:
# Distinct values of the `city` column in the full (unfiltered) dataset.
airbnb_multicity_gdf.city.unique()

### Regionalizer

In [None]:
# Turn the selected listings into H3 regions at the configured resolution and plot them.
regionalizer = H3Regionalizer(resolution=resolution)
regions = regionalizer.transform(gdf_paris)
plot_regions(regions_gdf=regions)

In [None]:
# Inspect the regions frame produced by the regionalizer.
regions

### OSM loader & joiner

In [None]:
# Load OSM features for the listings' area, restricted to the tags in HEX2VEC_FILTER.
loader = OSMPbfLoader()
features = loader.load(gdf_paris, HEX2VEC_FILTER)
# Pair regions with the features they intersect; `joint` is consumed by the embedder below.
joiner = IntersectionJoiner()
joint = joiner.transform(regions, features)

### H3 Embeddings 

In [None]:
# Neighbourhood relation over the H3 regions, required by the embedder.
neighbourhood = H3Neighbourhood(regions)
embedder = Hex2VecEmbedder(embedder_hidden_sizes)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# All warnings raised while fitting are silenced for this block only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Fit the embedder and return the per-region embeddings in one step.
    embeddings = embedder.fit_transform(
        regions,
        features,
        joint,
        neighbourhood,
        trainer_kwargs={"max_epochs": max_epochs_embedder, "accelerator": device},
        batch_size=batch_size_embedder,
    )

In [None]:
# Preview the first rows of the embeddings frame.
embeddings.head()

In [None]:
# Number of columns in the embeddings frame, i.e. the width of one embedding vector.
embeddings_size = embeddings.values.shape[1]
print(f"Embeddings size: {embeddings_size}")

In [None]:
def concat_columns(row) -> np.ndarray:
    """Join all cells of one row into a single array.

    Each cell is first promoted to an array with at least one dimension, so
    scalars and array-valued cells can be mixed; the pieces are then
    concatenated in their original order.

    Args:
        row: object exposing ``.values`` (e.g. the ``pandas.Series`` that
            ``DataFrame.apply(..., axis=1)`` hands to the callback).

    Returns:
        np.ndarray: the concatenation of every cell of ``row``.
    """
    pieces = []
    for cell in row.values:
        pieces.append(np.atleast_1d(cell))
    return np.concatenate(pieces)

In [None]:
# Attach to every listing the H3 region that contains it. `predicate` replaces the
# `op` keyword, which GeoPandas deprecated in 0.10.
joined_gdf = gpd.sjoin(gdf_paris, regions, how="left", predicate="within")
# Older GeoPandas stores the matched region id in a column called "index_right".
# NOTE(review): newer releases appear to keep the right frame's own index name
# instead — handle both so the groupby below always finds "h3_index".
region_col = "index_right" if "index_right" in joined_gdf.columns else regions.index.name
joined_gdf = joined_gdf.rename(columns={region_col: "h3_index"})
# Average listing price per H3 cell (shown for inspection only; later cells in this
# notebook aggregate again via `averages_hex`).
average_hex_prices = joined_gdf.groupby("h3_index")["price"].mean()
average_hex_prices.head()

In [None]:
# Summary statistics of the numeric columns of the selected listings.
gdf_paris.describe()

### Add features

In [None]:
# Column predicted by the regression model.
target = ["price"]
# Listing attributes that are averaged per hexagon and appended to the embedding.
features_to_add = [
    "number_of_reviews",
    "minimum_nights",
    "availability_365",
    "calculated_host_listings_count",
    "number_of_reviews_ltm",
]
# NOTE(review): `input_features` is not referenced by any other cell visible in this notebook.
input_features = ["vector_embedding"] + features_to_add
# Everything that gets aggregated per hexagon: the extra features plus the target.
columns_to_add = features_to_add + target

In [None]:
# getting avg price per h3
averages_hex = joined_gdf.groupby("h3_index")[columns_to_add].mean()
averages_hex.head()

In [None]:
# NOTE(review): disabled experiment — feature standardization is NOT applied anywhere
# in this notebook. Either enable it (and move the imports to the top cell) or delete
# this cell.
# import pandas as pd
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaler.fit(gdf_paris[columns_to_add])
# gdf_paris[columns_to_add].head()
# standarized = pd.DataFrame(scaler.transform(gdf_paris[columns_to_add]), columns=columns_to_add)
# standarized.head()

In [None]:
# Carry the region id along as an explicit "h3" column. `assign` adds it to a copy
# instead of writing into `embeddings`, so cells that inspect `embeddings` (e.g. its
# column count) give the same answer when re-run after this one.
# The inner merge keeps only hexagons that have both an embedding and listing averages.
merged_gdf = embeddings.assign(h3=embeddings.index).merge(
    averages_hex, how="inner", left_on="region_id", right_on="h3_index"
)
merged_gdf.head()

In [None]:
# Model inputs: every merged column except the "h3" id helper and the target.
merge_columns = [col for col in merged_gdf.columns if col not in (["h3"] + target)]

In [None]:
# One concatenated feature vector per hexagon (each row is passed through concat_columns).
X = merged_gdf[merge_columns].apply(concat_columns, axis=1).values
# H3 ids in the same row order as X.
# NOTE(review): not referenced by any other cell visible in this notebook.
X_h3_idx = merged_gdf["h3"].values
# Regression target: the per-hexagon mean price computed in `averages_hex`.
y = merged_gdf["price"].values

### Train/test split

In [None]:
# Hold out 20% of the hexagons for evaluation. The fixed `random_state` makes the
# split (and therefore the reported losses) repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=42
)
# Each row of X_* is one feature vector; stacking them into a single 2-D array first
# avoids torch's slow conversion of a Python list of ndarrays.
# `.to(device)` follows the "cuda"/"cpu" choice made earlier instead of requiring a GPU.
X_train = torch.tensor(np.stack(X_train), dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1).to(device)
X_test = torch.tensor(np.stack(X_test), dtype=torch.float32).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1).to(device)

### Model

In [None]:
def train(
    model,
    n_epochs,
    optimizer,
    loss_fn,
    batch_size,
    batch_start,
) -> tuple[nn.Module, list, list]:
    """Train ``model`` with mini-batches and keep the best-evaluating weights.

    The data is not passed in: the function reads the notebook-level tensors
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``. Batches are contiguous
    slices visited in the same order every epoch (no shuffling).

    Args:
        model: the ``nn.Module`` to optimise; it is updated in place.
        n_epochs: number of passes over the training tensors.
        optimizer: optimizer already bound to ``model``'s parameters.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar
            tensor; used for the training step and the per-epoch evaluation.
        batch_size: number of rows per mini-batch.
        batch_start: iterable with the first row index of every batch.

    Returns:
        tuple: ``(model, train_losses, eval_losses)``. ``model`` carries the
        weights of the epoch with the lowest evaluation loss (or its current
        weights when no epoch produced a comparable loss), ``train_losses``
        holds the mean batch loss of each epoch and ``eval_losses`` the loss
        on ``X_test`` after each epoch.
    """
    best_eval_loss = np.inf
    best_weights = None
    train_losses = []
    eval_losses = []

    for epoch in range(n_epochs):
        batch_losses = []
        model.train()
        with tqdm(batch_start, unit="batch", mininterval=0) as bar:
            bar.set_description(f"Epoch {epoch}")
            for start in bar:
                # take a batch
                X_batch = X_train[start : start + batch_size]
                y_batch = y_train[start : start + batch_size]
                # forward pass
                y_pred = model(X_batch)
                loss = loss_fn(y_pred, y_batch)
                # backward pass and weight update
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                batch_losses.append(loss.item())
        print(f"Epoch [{epoch+1}/{n_epochs}], avg_loss: {np.mean(batch_losses):.4f}")
        train_losses.append(np.mean(batch_losses))

        # Evaluate on the held-out tensors without tracking gradients.
        model.eval()
        with torch.no_grad():
            # Stored as a plain float so no tensor (or device memory) is kept around.
            eval_loss = float(loss_fn(model(X_test), y_test))
        eval_losses.append(eval_loss)
        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            # state_dict() returns references to live tensors, hence the deep copy.
            best_weights = copy.deepcopy(model.state_dict())

    # No snapshot exists when n_epochs == 0 or every evaluation loss was NaN;
    # load_state_dict(None) would raise, so leave the model as it is in that case.
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model, train_losses, eval_losses

In [None]:
class RegressionBaseModel(nn.Module):
    """Feed-forward regressor producing one value per input row.

    The network is a stack of ``Linear`` + ``ReLU`` pairs followed by a final
    ``Linear`` layer with a single output. A ``Dropout(p=0.2)`` follows every
    odd-indexed hidden layer (index 1, 3, ...).
    """

    def __init__(self, embeddings_size, linear_sizes=None):
        """Build the layer stack.

        Args:
            embeddings_size (int): number of input features per row.
            linear_sizes (list[int], optional): output width of each hidden
                linear layer, in order. Defaults to ``[500, 1000]``.
        """
        super().__init__()
        hidden_sizes = [500, 1000] if linear_sizes is None else linear_sizes

        layers = nn.Sequential()
        in_features = embeddings_size
        for idx, out_features in enumerate(hidden_sizes):
            layers.add_module(f"linear_{idx}", nn.Linear(in_features, out_features))
            layers.add_module(f"ReLU_{idx}", nn.ReLU())
            if idx % 2 == 1:
                layers.add_module(f"dropout_{idx}", nn.Dropout(p=0.2))
            in_features = out_features
        layers.add_module("linear_final", nn.Linear(in_features, 1))
        self.model = layers

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.model(x)

In [None]:
# (number of training rows, number of input features per row)
X_train.shape

### Model hyperparameters

In [None]:
n_epochs = 100  # passes over the training tensors
batch_size = 30  # rows per mini-batch
lr = 0.001  # learning rate handed to the Adam optimizer
linear_sizes = [500, 1000]  # hidden layer widths of the regression model

# First row index of every mini-batch: 0, batch_size, 2 * batch_size, ...
batch_start = torch.arange(0, len(X_train), batch_size)

In [None]:
# Input width is taken from the training tensor: embedding columns plus the averaged extra features.
model = RegressionBaseModel(embeddings_size=X_train.shape[1], linear_sizes=linear_sizes)
model

In [None]:
# Move the model to the device chosen earlier ("cuda" when available, otherwise
# "cpu") instead of assuming a GPU is present.
model.to(device)
# L1 loss, i.e. mean absolute error between prediction and target.
loss_fn = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train the model; `loss_train` and `loss_eval` hold one value per epoch.
result_model, loss_train, loss_eval = train(
    model, n_epochs, optimizer, loss_fn, batch_size, batch_start
)

In [None]:
# Two panels side by side: evaluation loss (left) and training loss (right), per epoch.
fig, ax = plt.subplots(1, 2, figsize=(16, 9))

ax[0].plot(loss_eval)
ax[1].plot(loss_train)

ax[0].set_xlabel("epoch")
ax[0].set_ylabel("L1 loss eval value")
ax[0].set_title("Plot of L1 loss eval results")

ax[1].set_xlabel("epoch")
ax[1].set_ylabel("L1 loss train value")
ax[1].set_title("Plot of L1 loss train results")
# Render the figure
plt.show()

In [None]:
# Lowest evaluation loss reached over all epochs.
np.min(loss_eval)

In [None]:
# Mean of the target (per-hexagon average price), as a scale reference for the loss above.
np.mean(y)