In [None]:
import copy
import os
import warnings

import geopandas as gpd
import h3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from srai.embedders import Hex2VecEmbedder, CountEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders.osm_loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import HEX2VEC_FILTER
from srai.neighbourhoods import H3Neighbourhood
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
from tqdm import tqdm

Building representation vectors for a given city from OSM features (filtered to the ones used by the Hex2Vec embedder)

In [None]:
# Target area: geocoded below as "<CITY>, <COUNTRY>"; the lower-cased CITY also
# selects the matching rows of the Airbnb dataset.
CITY = "Paris"
COUNTRY = "FRANCE"
# H3 resolution of the hexagonal grid (a higher value means smaller cells).
HEX_RESOLUTION = 10
# Hugging Face access token. It is read from the HF_TOKEN environment variable so
# that the secret is never pasted into (and committed with) the notebook.
# Falls back to an empty string, which is what the notebook used before.
HF_KEY = os.environ.get("HF_TOKEN", "")

In [None]:
# Free-text query handed to the geocoder, e.g. "Paris, FRANCE".
location = "{}, {}".format(CITY, COUNTRY)

In [None]:
# Download the OSM features of the requested area, restricted to the tags in HEX2VEC_FILTER.
# NOTE: this step is slow, and the original author suspected it does not work on Windows.
area_gdf = geocode_to_region_gdf(location)
loader = OSMPbfLoader()
features_gdf = loader.load(area_gdf, HEX2VEC_FILTER)
features_gdf

In [None]:
# Cover the geocoded area with H3 cells at HEX_RESOLUTION.
# Per the original note, the result holds every H3 index together with its polygon
# geometry, which the later joins rely on. `regionalizer` itself is reused further
# down to regionalize the Airbnb listing points.
regionalizer = H3Regionalizer(resolution=HEX_RESOLUTION)
regions_gdf = regionalizer.transform(area_gdf)

In [None]:
# Joint frame of hexagons and their features: links each region in regions_gdf to the
# OSM features it intersects (going by the joiner's name — confirm against srai docs).
# It is passed to the embedder together with the regions and the features.
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(regions_gdf, features_gdf)
joint_gdf

In [None]:
# Hex2VecEmbedder variant (disabled): it requires correcting an error in the srai library first.
# In the srai library, one has to cast the neighbourhoods to a set in neighbourhoods/_base.py.

# embeddings_layers=[200,100,50]
# embedding_size=embeddings_layers[-1]
# neighbourhood = H3Neighbourhood(regions_gdf)
# embedder = Hex2VecEmbedder(embeddings_layers)

# with warnings.catch_warnings():
#     warnings.simplefilter("ignore")
#     embeddings = embedder.fit_transform(
#         regions_gdf,
#         features_gdf,
#         joint_gdf,
#         neighbourhood,
#         trainer_kwargs={"max_epochs": 5, "accelerator": "gpu"},
#         batch_size=100,
#     )
# embeddings

In [None]:
# Count embedder: builds the per-region feature table from the regions, the features
# and their join. Presumably one count column per OSM feature type (TODO confirm);
# the number of columns is used below as the model's input size.
embedder = CountEmbedder()
embeddings = embedder.transform(regions_gdf, features_gdf, joint_gdf)

In [None]:
# Turns one row of per-feature values into a single representation vector.
def concat_columns(row):
    """Flatten all values of ``row`` into one 1-D NumPy array.

    Scalars become one-element arrays and array-like values are kept as they are;
    the pieces are then concatenated in column order.
    """
    pieces = []
    for value in row.values:
        pieces.append(np.atleast_1d(value))
    return np.concatenate(pieces)


# Use only the embedder's own columns as features. The helper columns this notebook adds
# to `embeddings` ("vector_embedding" here, "h3" further down) must be excluded, otherwise
# re-running this cell would inflate embedding_size and mix non-feature values into the vectors.
feature_columns = [
    column for column in embeddings.columns if column not in ("vector_embedding", "h3")
]
# Input size of the model = number of feature columns.
embedding_size = len(feature_columns)
# One 1-D vector per region, stored in a single object column.
embeddings["vector_embedding"] = embeddings[feature_columns].apply(
    concat_columns, axis=1
)

Mapping Airbnb data to H3 indexes

In [None]:
# Load the multi-city Airbnb dataset and turn the latitude/longitude columns into point geometry.
# NOTE(review): `use_auth_token` appears to be deprecated in newer `datasets` releases in
# favour of `token` — confirm against the installed version before changing it.
dataset = load_dataset("kraina/airbnb_multicity", use_auth_token=HF_KEY)
df = gpd.GeoDataFrame(dataset["train"].to_pandas())
listing_points = gpd.points_from_xy(x=df["longitude"], y=df["latitude"])
data_gdf = gpd.GeoDataFrame(
    df.drop(columns=["latitude", "longitude"]),
    geometry=listing_points,
    crs="EPSG:4326",  # WGS84 longitude/latitude
)

In [None]:
# The embeddings were built for a single city, so keep only that city's listings.
is_target_city = data_gdf["city"] == CITY.lower()
data_gdf = data_gdf.loc[is_target_city]

In [None]:
# H3 cells covering the listing points.
data_regions_gdf = regionalizer.transform(data_gdf)
# Older geopandas always calls the joined right-hand index column "index_right"; newer
# releases (1.0+, per the changelog — confirm for the installed version) reuse the right
# index's name when it has one. Both spellings are mapped to "h3_index" below.
right_index_column = data_regions_gdf.index.name or "index_right"
# Assign every listing point to the H3 cell whose polygon contains it ('within').
# `predicate=` replaces the `op=` keyword, which geopandas deprecated and later removed.
# `.rename(...)` returns a new frame, so re-running the cell never works on stale state.
data_joined_gdf = gpd.sjoin(
    data_gdf, data_regions_gdf, how="left", predicate="within"
).rename(columns={"index_right": "h3_index", right_index_column: "h3_index"})

In [None]:
# Mean listing price per H3 cell: the index holds the H3 ids, the single column the average price.
average_prices = data_joined_gdf.groupby("h3_index")["price"].mean()
average_prices_df = average_prices.to_frame(name="average_price")

Combining data with embeddings vectors

In [None]:
# Workaround: keep the H3 id as a regular column, because the index kept getting lost
# after the merge (original author's observation).
embeddings["h3"] = embeddings.index
merge_keys = {"left_on": "region_id", "right_on": "h3_index"}
merged_gdf = embeddings.merge(average_prices_df, how="inner", **merge_keys)
# Open question: the inner join keeps only the regions that appear both in the area's
# hexagons and in the Airbnb data, so regions without any listing are left out.
# Should they be included?

Data prep for model

In [None]:
# Model inputs (one vector per hexagon), the matching H3 ids, and the regression target.
X, X_h3_idx, y = (
    merged_gdf[column].values
    for column in ("vector_embedding", "h3", "average_price")
)

In [None]:
# TODO
# When splitting into train, test and dev sets, we should keep track of which indexes belong
# to which split, so that X_h3_idx can be divided into the same splits as well. This way we
# could map an h3_index to its vector and use it for single-example inference.

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise
# (a hard-coded `.cuda()` raises on machines without CUDA).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 70/30 split with a fixed seed so that Restart & Run All reproduces the same partition.
# The H3 ids are split together with X and y, which keeps the hexagon of every
# train/test vector available as h3_train / h3_test.
X_train, X_test, y_train, y_test, h3_train, h3_test = train_test_split(
    X, y, X_h3_idx, train_size=0.7, shuffle=True, random_state=42
)

# X is an object array holding one 1-D vector per row; np.stack turns it into a dense
# 2-D matrix in one go (building a tensor from a list of ndarrays is very slow).
X_train = torch.tensor(np.stack(X_train), dtype=torch.float32).to(device)
X_test = torch.tensor(np.stack(X_test), dtype=torch.float32).to(device)
# Targets as column vectors of shape (n, 1), matching the model's single output unit.
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1).to(device)

Simple model & training loop

In [None]:
# The input size comes from the embedder: for the count embedder it is the number of
# features, for hex2vec it is the latent-space dimension you define.
hidden_sizes = [225, 100, 50, 25]

# Every hidden stage is Linear -> Sigmoid -> Dropout(0.2); the layers are created in the
# same order as a hand-written nn.Sequential, so module names ("0", "1", ...) are unchanged.
layers = []
in_features = embedding_size
for out_features in hidden_sizes:
    layers += [nn.Linear(in_features, out_features), nn.Sigmoid(), nn.Dropout(0.2)]
    in_features = out_features
# Single regression output; the final ReLU keeps the predicted price non-negative.
# NOTE(review): a ReLU on the output can get stuck at 0 with zero gradient — worth
# checking if predictions collapse to zero.
layers += [nn.Linear(in_features, 1), nn.ReLU()]
model = nn.Sequential(*layers)

# Place the model on whatever device the training tensors live on instead of
# hard-coding CUDA, so that model and data can never end up on different devices.
model.to(X_train.device)
loss_fn = nn.MSELoss()  # mean squared error
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# Training hyper-parameters.
n_epochs, batch_size = 100, 10
# Start offset of every mini-batch inside X_train / y_train.
batch_start = torch.arange(0, len(X_train), batch_size)

# Bookkeeping: best evaluation MSE so far (starts at infinity), the weights that
# produced it, and the MSE history on the test set and on the training batches.
best_mse, best_weights = np.inf, None
mse_eval, mse_train = [], []

In [None]:
# Mini-batch training with evaluation on the test split after every epoch.
# The weights of the epoch with the lowest evaluation MSE are kept and restored at the end.
for epoch in range(n_epochs):
    model.train()  # enables dropout
    with tqdm(batch_start, unit="batch", mininterval=0) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            # take a batch
            X_batch = X_train[start : start + batch_size]
            y_batch = y_train[start : start + batch_size]
            # forward pass
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            # backward pass
            optimizer.zero_grad()
            loss.backward()
            # update weights
            optimizer.step()
            # report and record the batch loss (read from the device only once)
            batch_mse = loss.item()
            bar.set_postfix(mse=batch_mse)
            mse_train.append(batch_mse)
    # evaluate at the end of each epoch; no_grad() avoids building an autograd graph
    # (and holding its memory) for a forward pass that is never back-propagated
    model.eval()  # disables dropout
    with torch.no_grad():
        y_pred = model(X_test)
        mse = loss_fn(y_pred, y_test).item()
    mse_eval.append(mse)
    if mse < best_mse:
        best_mse = mse
        # deepcopy: state_dict() returns references that later updates would overwrite
        best_weights = copy.deepcopy(model.state_dict())

# restore the weights of the best epoch (lowest evaluation MSE)
if best_weights is not None:
    model.load_state_dict(best_weights)
else:
    # happens when no epoch ran or every evaluation MSE was NaN
    warnings.warn("No best weights were recorded; the model keeps its current weights.")

In [None]:
# Evaluation MSE per epoch, drawn with the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.plot(mse_eval)
ax.set_xlabel("epoch")
ax.set_ylabel("MSE eval value")
ax.set_title("Plot of MSE results")

# Show the plot
plt.show()