In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import sys
sys.path.append("../../")
import geopandas as gpd
from src.organized_datasets_creation.utils import resolve_nominatim_city_name
from src.graph_layering.graph_layer_creator import GraphLayerController
import pandas as pd
import os
from src.graph_layering.graph_layer_creator import SourceType
import warnings
from src.graph_layering.create_hetero_data import create_hetero_data
from sklearn.preprocessing import OneHotEncoder
from typing import cast
from tqdm import tqdm

import wandb.util
import wandb
import os


import numpy as np
from src.graph.create_osmnx_graph import OSMnxGraph
import json
from shapely.geometry import Point
from joblib import dump


from datetime import datetime
from sklearn.metrics import f1_score, roc_auc_score
from wandb.util import generate_id
from sklearn.linear_model import LogisticRegression
from src.training.train import train
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the Weights & Biases API key from the environment and stop the notebook
# early when it is missing, before any sweep is started.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")
assert WANDB_API_KEY is not None, (
    "WANDB_API_KEY is not set, did you forget it in the config file?"
)

In [22]:
# general settings
# All locations are written relative to the notebook's working directory.
ORGANIZED_HEXES_LOCATION = "../../data/organized-hexes"
ORGANIZED_GRAPHS_LOCATION = "../../data/organized_graphs"
# JSON file loaded in add_airbnb_to_osmnx_nodes and handed to OSMnxGraph.
OSMNX_ALL_ATTRIBUTES_LOCATION = (
    "../../data/osmnx_attributes.json"
)

# downstream task settings
AIRBNB_LOCATION = "../../data/downstream_tasks/airbnb"
# Directory for cached dataset pickles and saved models.
TRAIN_SAVE_DIR = "../../gradient_logs"

# Passed as `count` to wandb.agent, i.e. the number of runs per sweep.
SWEEP_RUNS_COUNT = 2

# One sweep is started per entry. USE_OSMNX_ATTRS selects the data structure
# (True -> graph model, False -> tabular model, see derive_data_structure).
ATTRIBUTES_CONFIGURATIONS = [
    {
        "USE_ORTOPHOTO": False,
        "USE_HEXES_ATTRS": True,
        "USE_OSMNX_ATTRS": True,
    },
    {
        "USE_ORTOPHOTO": False,
        "USE_HEXES_ATTRS": True,
        "USE_OSMNX_ATTRS": False,
    },
    {
        "USE_ORTOPHOTO": False,
        "USE_HEXES_ATTRS": False,
        "USE_OSMNX_ATTRS": True,
    },
]

# Sweep definition for the graph model. The optimised metric "mean_f1" is the
# value logged at the end of run_k_fold_graph_data.
WANDB_SWEEP_PARAMS_GRAPH_DATA = {
    "method": "bayes",
    "metric": {"name": "mean_f1", "goal": "maximize"},
    "parameters": {
        "hidden_channels": {"values": [10, 20, 30, 40, 50]},
        "learning_rate": {
            "distribution": "log_uniform_values",
            "min": 1e-5,
            "max": 1e-2,
        },
        "epochs": {
            "distribution": "int_uniform",
            "min": 10,
            "max": 20,
        },
        "num_conv_layers": {"values": [1, 2, 3, 4, 5]},
        # lin_layer_size and num_lin_layers are combined into a list of
        # num_lin_layers equal sizes before being passed to training.
        "lin_layer_size": {"values": [8, 16, 32, 64, 128]},
        "num_lin_layers": {"values": [0, 1, 2, 3, 4]},
        "weight_decay": {
            "distribution": "log_uniform_values",
            "min": 1e-5,
            "max": 1e-2,
        },
    },
}

# Sweep definition for the logistic-regression baseline on tabular data.
WANDB_SWEEP_PARAMS_TABULAR_DATA = {
    "method": "bayes",
    "metric": {"name": "mean_f1", "goal": "maximize"},
    "parameters": {
        # Each value is "<solver>;<penalty>"; run_k_fold_tabular_data splits it
        # on ";" so that only these solver/penalty pairs are sampled together.
        "solver_penalty": {
            "values": [
                "lbfgs;l2",
                "liblinear;l1",
                "liblinear;l2",
                "newton-cg;l2",
                "newton-cholesky;l2",
                "sag;l2",
                "saga;elasticnet",
                "saga;l1",
                "saga;l2",
            ]
        },
        "C": {
            "distribution": "log_uniform_values",
            "min": 1e-5,
            "max": 1,
        },
    },
}

# Load airbnb data


In [23]:
def create_gdf_airbnb_price(data):
    """Convert a listings table into a point GeoDataFrame with geometry and price only.

    Parameters
    ----------
    data
        Table with "latitude", "longitude" and "price" columns.

    Returns
    -------
    GeoDataFrame with the columns "geometry" (points built from longitude /
    latitude, CRS EPSG:4326) and "price", re-indexed from 0.
    """
    listing_points = gpd.points_from_xy(x=data["longitude"], y=data["latitude"])
    attributes = data.drop(columns=["latitude", "longitude"])
    listings = gpd.GeoDataFrame(attributes, geometry=listing_points, crs="EPSG:4326")

    # Keep only what the downstream task needs and discard the source index.
    return listings[["geometry", "price"]].reset_index(drop=True)

In [24]:
# New York listings: read the cleaned CSV, reduce it to price points and tag every
# row with the city key that is later used to select the listings of one city.
airbnb_ny = create_gdf_airbnb_price(
    pd.read_csv(f"{AIRBNB_LOCATION}/airbnb_new_york_cleaned.csv")
)
airbnb_ny["mie_nazwa"] = "new_york"
airbnb_ny

Unnamed: 0,geometry,price,mie_nazwa
0,POINT (-73.95512 40.68535),72.0,new_york
1,POINT (-73.99454 40.66265),253.0,new_york
2,POINT (-73.95342 40.70935),81.0,new_york
3,POINT (-73.94255 40.80107),65.0,new_york
4,POINT (-73.94759 40.78778),60.0,new_york
...,...,...,...
23756,POINT (-74.00952 40.71445),721.0,new_york
23757,POINT (-74.00940 40.71647),721.0,new_york
23758,POINT (-74.01149 40.71617),1080.0,new_york
23759,POINT (-73.94935 40.78102),198.0,new_york


In [25]:
# Seattle listings: read the cleaned CSV, reduce it to price points and tag every
# row with the city key that is later used to select the listings of one city.
airbnb_st = create_gdf_airbnb_price(
    pd.read_csv(f"{AIRBNB_LOCATION}/airbnb_seattle_cleaned.csv")
)
airbnb_st["mie_nazwa"] = "seattle"
airbnb_st

Unnamed: 0,geometry,price,mie_nazwa
0,POINT (-122.33629 47.65444),99.0,seattle
1,POINT (-122.31937 47.55017),72.0,seattle
2,POINT (-122.38663 47.55495),166.0,seattle
3,POINT (-122.38607 47.55627),125.0,seattle
4,POINT (-122.37196 47.67947),81.0,seattle
...,...,...,...
6073,POINT (-122.34716 47.61584),307.0,seattle
6074,POINT (-122.37198 47.70236),145.0,seattle
6075,POINT (-122.34887 47.61576),153.0,seattle
6076,POINT (-122.35028 47.62791),186.0,seattle


In [26]:
# Stack both cities into a single frame. Each city frame was re-indexed from 0
# and the concat keeps those labels, so index labels repeat across cities.
airbnb = pd.concat([airbnb_ny, airbnb_st])
airbnb

Unnamed: 0,geometry,price,mie_nazwa
0,POINT (-73.95512 40.68535),72.0,new_york
1,POINT (-73.99454 40.66265),253.0,new_york
2,POINT (-73.95342 40.70935),81.0,new_york
3,POINT (-73.94255 40.80107),65.0,new_york
4,POINT (-73.94759 40.78778),60.0,new_york
...,...,...,...
6073,POINT (-122.34716 47.61584),307.0,seattle
6074,POINT (-122.37198 47.70236),145.0,seattle
6075,POINT (-122.34887 47.61576),153.0,seattle
6076,POINT (-122.35028 47.62791),186.0,seattle


In [27]:
def add_class_to_df(data):
    """Return a copy of ``data`` with a quartile-based ``price_class`` column.

    The quartiles of ``data["price"]`` split the prices into four classes
    labelled 0 (cheapest quarter) to 3 (most expensive quarter). Bins are
    closed on the right; the first bin also includes its lower edge so that a
    price of exactly 0 is assigned class 0 instead of NaN.

    Parameters
    ----------
    data
        Frame with a numeric "price" column. It is not modified.

    Returns
    -------
    A new frame of the same type with the categorical column "price_class"
    appended.

    Notes
    -----
    ``pd.cut`` rejects duplicate bin edges, so inputs whose quartiles coincide
    (heavily tied prices) are not supported.
    """
    q1, q2, q3 = data["price"].quantile([0.25, 0.5, 0.75])

    bins = [0, q1, q2, q3, float("inf")]
    # Integer labels in ascending price order: low, medium, high, very high.
    label_names = [0, 1, 2, 3]

    price_class = pd.cut(
        data["price"],
        bins=bins,
        labels=label_names,
        right=True,
        include_lowest=True,
    )
    # assign() returns a new frame, leaving the caller's object untouched.
    return data.assign(price_class=price_class)

In [28]:
# Swap the raw price for its quartile class; the raw price column is dropped.
airbnb = add_class_to_df(airbnb).drop(columns="price")
airbnb

Unnamed: 0,geometry,mie_nazwa,price_class
0,POINT (-73.95512 40.68535),new_york,0
1,POINT (-73.99454 40.66265),new_york,3
2,POINT (-73.95342 40.70935),new_york,0
3,POINT (-73.94255 40.80107),new_york,0
4,POINT (-73.94759 40.78778),new_york,0
...,...,...,...
6073,POINT (-122.34716 47.61584),seattle,3
6074,POINT (-122.37198 47.70236),seattle,2
6075,POINT (-122.34887 47.61576),seattle,2
6076,POINT (-122.35028 47.62791),seattle,2


In [29]:
def add_airbnb_to_osmnx_nodes(
    airbnb: gpd.GeoDataFrame,
    nodes: gpd.GeoDataFrame,
    edges: gpd.GeoDataFrame,
    city_name: str,
):
    """Aggregate the Airbnb listings of one city onto the street-graph nodes.

    Parameters
    ----------
    airbnb
        Listings of all cities with the columns "mie_nazwa" (city key) and
        "price_class".
    nodes, edges
        Node and edge GeoDataFrames of the city's street graph.
    city_name
        City key used to select the rows of ``airbnb``.

    Returns
    -------
    The ``gdf_nodes`` attribute of the ``OSMnxGraph`` after node aggregation.
    """
    with open(OSMNX_ALL_ATTRIBUTES_LOCATION) as attributes_file:
        all_attributes = json.load(attributes_file)

    city_listings = airbnb.loc[airbnb["mie_nazwa"] == city_name, :]
    graph = OSMnxGraph(
        city_listings,
        nodes,
        edges,
        all_attributes,
        y_column_name="price_class",
    )
    # NOTE(review): calls the private `_aggregate` method with the "count"
    # method - presumably this counts listings per node rather than keeping
    # the class label; confirm against OSMnxGraph.
    graph._aggregate(element_type="node", aggregation_method="count")
    return graph.gdf_nodes


def create_gdfs(city_name: str):
    """Load the street-graph and hexagon GeoDataFrames of one city.

    Reads ``nodes.parquet`` and ``edges.parquet`` from
    ``ORGANIZED_GRAPHS_LOCATION/<city_name>``, adds the Airbnb target to the
    nodes (using the module-level ``airbnb`` frame) and loads the h9
    count-embedder hexagon dataset from the most recent year folder found under
    ``ORGANIZED_HEXES_LOCATION/<city_name>``.

    Parameters
    ----------
    city_name
        Name of the city folder, also used as the city key in ``airbnb``.

    Returns
    -------
    dict
        Keys ``"osmnx_nodes"``, ``"osmnx_edges"`` and ``"hexes"``.

    Raises
    ------
    AssertionError
        If nodes, edges and ``airbnb`` do not share the same CRS.
    FileNotFoundError
        If the city's hexagon folder contains no year-named subfolder.
    """
    osmnx_nodes = gpd.read_parquet(
        os.path.join(ORGANIZED_GRAPHS_LOCATION, city_name, "nodes.parquet")
    )
    # Move the stored index into a column and use a positional "node_id" index.
    osmnx_nodes = osmnx_nodes.reset_index()
    osmnx_nodes.index.names = ["node_id"]
    osmnx_nodes["x"] = osmnx_nodes["geometry"].x
    osmnx_nodes["y"] = osmnx_nodes["geometry"].y

    osmnx_edges = gpd.read_parquet(
        os.path.join(ORGANIZED_GRAPHS_LOCATION, city_name, "edges.parquet")
    )
    osmnx_edges = osmnx_edges.reset_index().rename(columns={"index": "edge_id"})
    osmnx_edges.index.names = ["edge_id"]
    if "level_4" in osmnx_edges.columns:
        osmnx_edges = osmnx_edges.drop("level_4", axis=1)
    osmnx_edges = osmnx_edges.fillna(0)

    assert osmnx_nodes.crs == osmnx_edges.crs
    assert osmnx_nodes.crs == airbnb.crs

    osmnx_nodes = add_airbnb_to_osmnx_nodes(
        airbnb=airbnb,
        nodes=osmnx_nodes,
        city_name=city_name,
        edges=osmnx_edges,
    )

    hexes_years_folder = os.path.join(ORGANIZED_HEXES_LOCATION, city_name)

    # Only purely numeric directory names are treated as years, so stray
    # folders (e.g. ".ipynb_checkpoints") no longer crash the int() conversion.
    year_folders = [
        int(entry)
        for entry in os.listdir(hexes_years_folder)
        if entry.isdigit() and os.path.isdir(os.path.join(hexes_years_folder, entry))
    ]
    if not year_folders:
        raise FileNotFoundError(
            f"No year folders found in {hexes_years_folder}"
        )
    highest_year = max(year_folders)

    hexes: gpd.GeoDataFrame = gpd.read_parquet(
        os.path.join(
            ORGANIZED_HEXES_LOCATION,
            f"{city_name}/{highest_year}/h9/count-embedder/dataset.parquet",
        )
    )

    # Keep the stored region id as the "h3_id" column and name the index
    # "region_id".
    hexes = hexes.rename(columns={"region_id": "h3_id"}).rename_axis(
        "region_id", axis=0
    )

    return dict(osmnx_nodes=osmnx_nodes, osmnx_edges=osmnx_edges, hexes=hexes)


# Build the per-city GeoDataFrames once; later cells read them from `gdfs_dict`,
# keyed by city name.
print("Creating gdfs...")
gdfs_dict = {
    city_name: create_gdfs(city_name) for city_name in tqdm(["new_york", "seattle"])
}

Creating gdfs...


  osmnx_edges = osmnx_edges.fillna(0)
  osmnx_edges = osmnx_edges.fillna(0)
100%|██████████| 2/2 [00:02<00:00,  1.02s/it]


In [30]:
# Attach a GraphLayerController to every city entry. Warnings raised while the
# controllers are constructed are silenced for the duration of the loop.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for gdf_for_city in gdfs_dict.values():
        # Stored under "controller" so that the whole dict can later be
        # unpacked into create_graph_data(**gdfs).
        gdf_for_city["controller"] = GraphLayerController(
            gdf_for_city["hexes"],
            gdf_for_city["osmnx_nodes"],
            gdf_for_city["osmnx_edges"],
        )

In [31]:
def patch_hexes_with_y(
    osmnx_nodes: gpd.GeoDataFrame,
    hexes: gpd.GeoDataFrame,
    controller: GraphLayerController,
):
    """Give the controller a hexagon frame that carries the target column.

    The "price_class" values of the street-graph nodes are summed per hexagon
    via the controller's node-to-hexagon virtual edges and left-joined onto
    ``hexes``; missing values in the joined frame are filled with 0. The result
    is stored on the controller and its hexagon centroids are rebuilt.

    Parameters
    ----------
    osmnx_nodes
        Node frame with a "price_class" column, indexed by node id.
    hexes
        Hexagon frame indexed by region id. It is not modified.
    controller
        Controller whose ``hexes_gdf`` and ``_hexes_centroids_gdf`` are replaced.
    """
    virtual_edges = controller.get_virtual_edges_to_hexes(SourceType.OSMNX_NODES)

    # Pair every virtual edge with the node it starts from.
    edges_with_nodes = virtual_edges.merge(
        osmnx_nodes, left_on="source_id", right_index=True
    )
    y_per_region = (
        edges_with_nodes[["region_id", "price_class"]].groupby("region_id").sum()
    )

    joined = hexes.merge(
        y_per_region,
        left_index=True,
        right_index=True,
        how="left",
    ).fillna(0)
    hexes_with_y = cast(gpd.GeoDataFrame, joined)

    # TODO: should the classes be mapped here as 0, 1, 2, 3, 4??
    controller.hexes_gdf = hexes_with_y
    controller._hexes_centroids_gdf = controller._create_hexes_centroids_gdf()

In [32]:
# Add the target column to the hexagon frame held by every city's controller.
# Warnings raised during patching are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for gdfs in gdfs_dict.values():
        patch_hexes_with_y(gdfs['osmnx_nodes'], gdfs["hexes"], gdfs["controller"])

In [33]:
# Inspect the New York hexagon frame kept in `gdfs_dict`. patch_hexes_with_y stores
# the frame with the target column on the controller only, so this entry is not replaced.
gdfs_dict['new_york']['hexes']

Unnamed: 0_level_0,h3_id,geometry,amenity_gym,building_garages,landuse_gravel,sport_ultimate,office_bakery,natural_shrubbery,landuse_gress,building_guard_cabin,...,shop_eggs,historic_heritage,building_government,aeroway_navigationaid,historic_park,historic_train_station,shop_hobby,building_floating_home,amenity_vacuum_cleaner,building_castle
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,892a1000003ffff,"POLYGON ((-73.78199 40.86018, -73.78410 40.859...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,892a1000007ffff,"POLYGON ((-73.77975 40.85759, -73.78187 40.856...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,892a100000bffff,"POLYGON ((-73.78628 40.86007, -73.78839 40.859...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,892a100000fffff,"POLYGON ((-73.78404 40.85748, -73.78615 40.856...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,892a1000013ffff,"POLYGON ((-73.77994 40.86288, -73.78205 40.861...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10752,892a10776cbffff,"POLYGON ((-74.02133 40.67672, -74.02343 40.675...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10753,892a10776cfffff,"POLYGON ((-74.01909 40.67414, -74.02119 40.673...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10754,892a10776d3ffff,"POLYGON ((-74.01503 40.67954, -74.01713 40.678...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10755,892a10776d7ffff,"POLYGON ((-74.01279 40.67696, -74.01489 40.676...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
def create_graph_data(
    osmnx_nodes,
    osmnx_edges,
    hexes,
    controller: GraphLayerController,
    use_hexes_attr: bool,
    use_ortophoto: bool,
):
    """Build the heterogeneous graph data object of one city.

    Parameters
    ----------
    osmnx_nodes, osmnx_edges
        Street-graph frames; every column except the identifier / geometry
        columns listed below is used as a feature.
    hexes
        Hexagon frame; its feature columns are used only if ``use_hexes_attr``.
    controller
        Controller passed on to ``create_hetero_data``.
    use_hexes_attr
        Whether hexagon attributes are included as features.
    use_ortophoto
        Accepted for a uniform call signature; not used by this function.

    Returns
    -------
    Whatever ``create_hetero_data`` returns, with "price_class" as hexagon target.
    """
    edge_feature_mask = ~osmnx_edges.columns.isin(["u", "v", "key", "geometry"])
    edges_attr_columns = osmnx_edges.columns[edge_feature_mask]

    # NOTE(review): "price_class" is not excluded here, although the hexagon
    # target is aggregated from the nodes' "price_class" column - confirm this
    # does not leak the target into the node features.
    node_feature_mask = ~osmnx_nodes.columns.isin(["geometry", "x", "y", "osmid"])
    nodes_attr_columns = osmnx_nodes.columns[node_feature_mask]

    if use_hexes_attr:
        hex_feature_mask = ~hexes.columns.isin(["geometry", "h3_id", "price_class"])
        hexes_attr_columns = hexes.columns[hex_feature_mask]
    else:
        hexes_attr_columns = []

    return create_hetero_data(
        controller,
        hexes_attrs_columns_names=hexes_attr_columns,
        osmnx_edge_attrs_columns_names=edges_attr_columns,
        osmnx_node_attrs_columns_names=nodes_attr_columns,
        virtual_edge_attrs_columns_names=[],
        hexes_y_columns_names=["price_class"],
    )


# Initial graph datasets (hexagon attributes on), keyed by city name. The sweep
# loop at the end of the notebook rebuilds this dict for each attribute configuration.
graph_data_dict = {
    city_name: create_graph_data(**gdfs, use_ortophoto=False, use_hexes_attr=True)
    for city_name, gdfs in gdfs_dict.items()
}

In [35]:
def create_tabular_data(
    hexes: pd.DataFrame,
    controller: GraphLayerController,
    use_hexes_attr: bool,
    use_ortophoto: bool,
):
    """Build the feature matrix and target for the tabular baseline of one city.

    Parameters
    ----------
    hexes
        Hexagon frame. Every column except "geometry", "h3_id" and the target
        column is used as a feature when ``use_hexes_attr`` is true.
    controller
        Object whose ``hexes_centroids_gdf`` holds the "price_class" target.
    use_hexes_attr
        Whether hexagon attributes are used as features.
    use_ortophoto
        Only checked by the "at least one data source" assertion.

    Returns
    -------
    dict
        ``{"X": features, "y": target}``; ``X`` has no columns when
        ``use_hexes_attr`` is false.

    Raises
    ------
    AssertionError
        If both ``use_hexes_attr`` and ``use_ortophoto`` are false.
    """
    assert use_ortophoto or use_hexes_attr, "Provide at least one data source"

    hexes_y_columns_names = ["price_class"]

    # The target is excluded as well (as create_graph_data does), so a hexagon
    # frame that already carries "price_class" cannot leak the label into X.
    non_feature_columns = ["geometry", "h3_id", *hexes_y_columns_names]
    hexes_attr_columns = (
        hexes.columns[~hexes.columns.isin(non_feature_columns)]
        if use_hexes_attr
        else []
    )

    X = hexes[hexes_attr_columns]
    y = controller.hexes_centroids_gdf[hexes_y_columns_names]

    return {"X": X, "y": y}


tabular_data_dict = {
    city_name: create_tabular_data(
        gdfs["hexes"], gdfs["controller"], use_ortophoto=False, use_hexes_attr=True
    )
    for city_name, gdfs in gdfs_dict.items()
}

In [36]:
def shift_elements_right(lst):
    """Rotate a sequence one position to the right and return it as a new list.

    The last element becomes the first and all other elements move one place
    towards the end. The input is not modified. An empty input yields an empty
    list, and any sequence (list or tuple, for example) is accepted.
    """
    items = list(lst)
    # Slicing instead of indexing keeps the empty case from raising IndexError.
    return items[-1:] + items[:-1]


# City names in alphabetical order; this order fixes the fold numbering.
cities_names_list = sorted(graph_data_dict.keys(), key=str)

# One (val city, test city) pair per fold: the validation city is the test
# city's predecessor in the rotated list.
folds_tuples = list(zip(shift_elements_right(cities_names_list), cities_names_list))
display(folds_tuples)

[('seattle', 'new_york'), ('new_york', 'seattle')]

In [16]:
from typing import cast
from wandb.sdk.wandb_run import Run
import torch


def run_k_fold_graph_data(closure_config, sweep_id):
    """Create the W&B agent function that cross-validates the graph model.

    The attribute configuration is captured by closure because the W&B agent
    calls the returned function without arguments. When called, the function
    reads the module-level ``graph_data_dict``, ``folds_tuples`` and
    ``TRAIN_SAVE_DIR``.

    Parameters
    ----------
    closure_config : dict
        Attribute flags of the current sweep; each flag is logged as 0/1 and
        the dict is stored as metadata of the dataset artifact.
    sweep_id : str
        Sweep id, used to name the cached dataset file so that the dataset is
        uploaded only once per sweep.

    Returns
    -------
    callable
        Zero-argument function to pass as ``function`` to ``wandb.agent``.
    """

    def wrapped():
        run: Run = cast(Run, wandb.init())

        config = wandb.config

        for k, v in closure_config.items():
            run.log({k: 1 if v else 0})

        run.log({"data_structure": "graph"})

        # create hparams
        # The sweep samples a single layer size and a layer count; expand them
        # into an explicit list of sizes. A config that already provides
        # `lin_layer_sizes` is used as is.
        if hasattr(config, "lin_layer_size") and hasattr(config, "num_lin_layers"):
            lin_layer_sizes = [config.lin_layer_size] * config.num_lin_layers
        else:
            lin_layer_sizes = config.lin_layer_sizes
        hparams = {
            "hidden_channels": config.hidden_channels,
            "lr": config.learning_rate,
            "num_conv_layers": config.num_conv_layers,
            "lin_layer_sizes": lin_layer_sizes,
            "weight_decay": config.weight_decay,
        }

        epochs = config.epochs

        aucs = []
        accuracies = []
        f1s = []

        fold_group_id = generate_id()

        # log data as artifact if no data was logged in the sweep before
        # dataset is uploaded only on the first run in sweep, because it does not change across runs in sweep
        # in wandb, dataset will be visible on the first run in the sweep
        # The save directory may not exist on a fresh checkout; create it so
        # that the dump below cannot fail with FileNotFoundError.
        os.makedirs(TRAIN_SAVE_DIR, exist_ok=True)
        artifact_path = os.path.join(TRAIN_SAVE_DIR, f"graph_data_{sweep_id}.pkl")
        if not os.path.exists(artifact_path):
            dump(
                graph_data_dict,
                artifact_path,
                protocol=5,
            )
            artifact = wandb.Artifact(
                name="graph_data", type="dataset", metadata=closure_config
            )
            artifact.add_file(local_path=artifact_path)
            run.log_artifact(artifact)

        # run k-fold
        # Only the test city of each fold tuple is used; no validation data is
        # passed to training.
        for index, (_, test_city_name) in enumerate(folds_tuples):
            # prepare data
            # Work on CPU copies so the shared datasets are not touched.
            train_data = [
                v.to("cpu").clone()
                for k, v in graph_data_dict.items()
                if k != test_city_name
            ]
            test_data = graph_data_dict[test_city_name].to("cpu").clone()

            # Original author's note: run training with checkpointing on lowest
            # val_loss, return test metrics for the best model and its path;
            # builtin preprocessing - scaling to N(0, 1)
            auc, accuracy, f1, model_path = train(
                train_data=train_data,
                val_data=None,
                test_data=test_data,
                epochs=epochs,
                hparams=hparams,
                train_save_dir=TRAIN_SAVE_DIR,
                # Number of distinct target values in the first training graph.
                num_classes=torch.unique(train_data[0]["hex"].y).shape[0],
            )

            # logging - single fold
            run.log_model(
                path=model_path,
                name=f"model_{fold_group_id}_fold_{index}",
            )
            run.log({f"auc_fold_{index}": auc})
            run.log({f"accuracy_fold_{index}": accuracy})
            run.log({f"f1_fold_{index}": f1})

            aucs.append(auc)
            accuracies.append(accuracy)
            f1s.append(f1)

        # logging - summary statistics
        mean_auc = sum(aucs) / len(aucs)
        mean_accuracy = sum(accuracies) / len(accuracies)
        mean_f1 = sum(f1s) / len(f1s)
        run.log({"mean_auc": mean_auc})
        run.log({"mean_accuracy": mean_accuracy})
        run.log({"mean_f1": mean_f1})

    return wrapped


def run_k_fold_tabular_data(closure_config, sweep_id):
    """Create the W&B agent function that cross-validates the tabular baseline.

    Works like its graph counterpart but fits a logistic regression on the
    module-level ``tabular_data_dict``: for every city in ``cities_names_list``
    the model is trained on all other cities and evaluated on that city.

    Parameters
    ----------
    closure_config : dict
        Attribute flags of the current sweep; each flag is logged as 0/1 and
        the dict is stored as metadata of the dataset artifact.
    sweep_id : str
        Sweep id, used to name the cached dataset file so that the dataset is
        uploaded only once per sweep.

    Returns
    -------
    callable
        Zero-argument function to pass as ``function`` to ``wandb.agent``.
    """

    def wrapped():
        run: Run = cast(Run, wandb.init())

        config = wandb.config

        for k, v in closure_config.items():
            run.log({k: 1 if v else 0})

        run.log({"data_structure": "tabular"})

        hparams = {}
        hparams["C"] = config["C"]
        # The sweep samples solver and penalty jointly as "<solver>;<penalty>".
        solver, penalty = config["solver_penalty"].split(";")
        hparams["solver"] = solver
        if penalty == "None":
            penalty = None
        hparams["penalty"] = penalty

        aucs = []
        accuracies = []
        f1s = []

        fold_group_id = generate_id()

        # log data as artifact
        # The save directory may not exist on a fresh checkout; create it so
        # that the dump below cannot fail with FileNotFoundError.
        os.makedirs(TRAIN_SAVE_DIR, exist_ok=True)
        artifact_path = os.path.join(TRAIN_SAVE_DIR, f"tabular_data_{sweep_id}.pkl")

        if not os.path.exists(artifact_path):
            dump(
                tabular_data_dict,
                artifact_path,
                protocol=5,
            )
            artifact = wandb.Artifact(
                name="tabular_data", type="dataset", metadata=closure_config
            )
            artifact.add_file(local_path=artifact_path)
            run.log_artifact(artifact)

        # All fold models of this run are saved into one timestamped folder.
        timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

        for index, test_city_name in enumerate(cities_names_list):
            scaler = StandardScaler()
            X = pd.concat(
                [
                    m["X"]
                    for key, m in tabular_data_dict.items()
                    if key != test_city_name
                ]
            ).to_numpy()
            y = (
                pd.concat(
                    [
                        m["y"]
                        for key, m in tabular_data_dict.items()
                        if key != test_city_name
                    ]
                )
                .to_numpy()
                .ravel()
            )

            # Fit the scaler on the training cities only; the test city is
            # transformed with the same statistics further below.
            X = scaler.fit_transform(X)

            logistic_regression = LogisticRegression(
                C=hparams["C"],
                solver=hparams["solver"],
                penalty=hparams["penalty"],
                dual=False,
                tol=1e-4,
                fit_intercept=True,
                intercept_scaling=1,
                class_weight="balanced",
                random_state=1124,
                max_iter=1000,
                multi_class="auto",
                warm_start=False,
                n_jobs=-1,
                l1_ratio=0.5,
            )
            logistic_regression.fit(X, y)

            test_X = tabular_data_dict[test_city_name]["X"].to_numpy()
            test_X = scaler.transform(test_X)
            test_y = tabular_data_dict[test_city_name]["y"].to_numpy().ravel()

            # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
            # removed in later releases; fall back to the old keyword only on
            # versions that do not know the new one.
            try:
                encoder = OneHotEncoder(sparse_output=False)
            except TypeError:
                encoder = OneHotEncoder(sparse=False)
            test_y_ohe = encoder.fit_transform(test_y.reshape(-1, 1))

            y_pred = logistic_regression.predict(test_X)
            y_proba = logistic_regression.predict_proba(test_X)

            auc = roc_auc_score(
                test_y_ohe, y_proba, average="weighted", multi_class="ovr"
            )
            accuracy = (y_pred == test_y).mean()
            f1 = f1_score(
                test_y,
                y_pred,
                average="weighted",
            )

            model_dir = os.path.join(TRAIN_SAVE_DIR, timestamp)

            os.makedirs(model_dir, exist_ok=True)

            model_path = os.path.join(
                model_dir, f"model_{fold_group_id}_fold_{index}.pkl"
            )

            with open(model_path, "wb") as f:
                dump(logistic_regression, f, protocol=5)

            run.log_model(
                path=model_path,
                name=f"model_{fold_group_id}_fold_{index}",
            )
            run.log({f"auc_fold_{index}": auc})
            run.log({f"accuracy_fold_{index}": accuracy})
            run.log({f"f1_fold_{index}": f1})

            aucs.append(auc)
            accuracies.append(accuracy)
            f1s.append(f1)

        mean_auc = sum(aucs) / len(aucs)
        mean_accuracy = sum(accuracies) / len(accuracies)
        mean_f1 = sum(f1s) / len(f1s)
        run.log({"mean_auc": mean_auc})
        run.log({"mean_accuracy": mean_accuracy})
        run.log({"mean_f1": mean_f1})

    return wrapped


def run_sweep_graph_data(config):
    """Create a W&B sweep for the graph model and run it with a local agent.

    Parameters
    ----------
    config : dict
        Attribute configuration passed on to ``run_k_fold_graph_data``.

    Raises
    ------
    Exception
        Any error raised while logging in, creating the sweep or running the
        agent is printed, the active W&B run is finished, and the error is
        re-raised.
    """
    try:
        wandb.login(key=WANDB_API_KEY)
        sweep_id = wandb.sweep(
            WANDB_SWEEP_PARAMS_GRAPH_DATA, project="airbnb-downstream-task"
        )

        wandb.agent(
            sweep_id,
            function=run_k_fold_graph_data(config, sweep_id),
            count=SWEEP_RUNS_COUNT,
        )
    except Exception as e:
        print(e)
        # Close the run so a failed sweep does not leave it open.
        wandb.finish()
        # Bare raise keeps the original traceback intact.
        raise


def run_sweep_tabular_data(config):
    """Create a W&B sweep for the tabular baseline and run it with a local agent.

    ``config`` is the attribute configuration handed to
    ``run_k_fold_tabular_data``. Any error is printed, the active W&B run is
    finished, and the error is re-raised.
    """
    try:
        wandb.login(key=WANDB_API_KEY)

        sweep_id = wandb.sweep(
            WANDB_SWEEP_PARAMS_TABULAR_DATA, project="airbnb-downstream-task"
        )
        fold_runner = run_k_fold_tabular_data(config, sweep_id)

        wandb.agent(sweep_id, function=fold_runner, count=SWEEP_RUNS_COUNT)
    except Exception as error:
        print(error)
        wandb.finish()
        raise error

In [17]:
def derive_data_structure(attr_config):
    """Pick the data structure for an attribute configuration.

    Returns "graph" when the "USE_OSMNX_ATTRS" flag is truthy and "tabular"
    otherwise. A configuration without that key raises ``KeyError``.
    """
    return "graph" if attr_config["USE_OSMNX_ATTRS"] else "tabular"


# Run one W&B sweep per attribute configuration.
configs_size = len(ATTRIBUTES_CONFIGURATIONS)

for index, attr_config in enumerate(ATTRIBUTES_CONFIGURATIONS):
    print("Sweep for config {}/{} in progress...".format(index + 1, configs_size))

    # Validate the configuration before any data is built.
    assert "USE_ORTOPHOTO" in attr_config, "Provide USE_ORTOPHOTO key"
    assert "USE_HEXES_ATTRS" in attr_config, "Provide USE_HEXES_ATTRS key"
    assert "USE_OSMNX_ATTRS" in attr_config, "Provide USE_OSMNX_ATTRS key"

    data_structure = derive_data_structure(attr_config)

    if data_structure == "graph":
        # The module-level dict is rebound on purpose: the agent function
        # created by run_k_fold_graph_data reads `graph_data_dict` when it runs.
        graph_data_dict = {
            city_name: create_graph_data(
                **gdfs,
                use_ortophoto=attr_config["USE_ORTOPHOTO"],
                use_hexes_attr=attr_config["USE_HEXES_ATTRS"],
            )
            for city_name, gdfs in gdfs_dict.items()
        }
        run_sweep_graph_data(attr_config)
    elif data_structure == "tabular":
        # Same pattern for the tabular baseline: `tabular_data_dict` is read by
        # the agent function created by run_k_fold_tabular_data.
        tabular_data_dict = {
            city_name: create_tabular_data(
                gdfs["hexes"],
                gdfs["controller"],
                use_ortophoto=attr_config["USE_ORTOPHOTO"],
                use_hexes_attr=attr_config["USE_HEXES_ATTRS"],
            )
            for city_name, gdfs in gdfs_dict.items()
        }
        run_sweep_tabular_data(attr_config)
    else:
        # Safety net in case derive_data_structure gains further return values.
        raise ValueError("Unknown data structure")

Sweep for config 1/3 in progress...


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mstanislaw-markowski[0m ([33mgradient_pwr[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/staszek/.netrc


Create sweep with ID: hjg0cuxy
Sweep URL: https://wandb.ai/gradient_pwr/airbnb-downstream-task/sweeps/hjg0cuxy


[34m[1mwandb[0m: Agent Starting Run: vkmnmruh with config:
[34m[1mwandb[0m: 	epochs: 16
[34m[1mwandb[0m: 	hidden_channels: 30
[34m[1mwandb[0m: 	learning_rate: 5.788733127115965e-05
[34m[1mwandb[0m: 	lin_layer_size: 128
[34m[1mwandb[0m: 	num_conv_layers: 5
[34m[1mwandb[0m: 	num_lin_layers: 0
[34m[1mwandb[0m: 	weight_decay: 1.2683752684248012e-05
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 226 K 
------------------------------------
226 K     Trainable params
0         Non-trainable params
226 K     Total params
0.904     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=16` reached.


VBox(children=(Label(value='207.286 MB of 207.286 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
USE_HEXES_ATTRS,▁
USE_ORTOPHOTO,▁
USE_OSMNX_ATTRS,▁
accuracy_fold_0,▁
auc_fold_0,▁
f1_fold_0,▁
mean_accuracy,▁
mean_auc,▁
mean_f1,▁

0,1
USE_HEXES_ATTRS,1
USE_ORTOPHOTO,0
USE_OSMNX_ATTRS,1
accuracy_fold_0,0.09566
auc_fold_0,0.56454
data_structure,graph
f1_fold_0,0.05387
mean_accuracy,0.09566
mean_auc,0.56454
mean_f1,0.05387


[34m[1mwandb[0m: Agent Starting Run: kaho1lyh with config:
[34m[1mwandb[0m: 	epochs: 18
[34m[1mwandb[0m: 	hidden_channels: 40
[34m[1mwandb[0m: 	learning_rate: 0.00013779356174398844
[34m[1mwandb[0m: 	lin_layer_size: 64
[34m[1mwandb[0m: 	num_conv_layers: 1
[34m[1mwandb[0m: 	num_lin_layers: 3
[34m[1mwandb[0m: 	weight_decay: 9.020676324039596e-05
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 285 K 
------------------------------------
285 K     Trainable params
0         Non-trainable params
285 K     Total params
1.143     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=18` reached.


VBox(children=(Label(value='1.109 MB of 1.109 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
USE_HEXES_ATTRS,▁
USE_ORTOPHOTO,▁
USE_OSMNX_ATTRS,▁
accuracy_fold_0,▁
auc_fold_0,▁
f1_fold_0,▁
mean_accuracy,▁
mean_auc,▁
mean_f1,▁

0,1
USE_HEXES_ATTRS,1
USE_ORTOPHOTO,0
USE_OSMNX_ATTRS,1
accuracy_fold_0,0.63986
auc_fold_0,0.8328
data_structure,graph
f1_fold_0,0.59304
mean_accuracy,0.63986
mean_auc,0.8328
mean_f1,0.59304


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/staszek/.netrc


Sweep for config 2/3 in progress...
Create sweep with ID: q0zh4kxl
Sweep URL: https://wandb.ai/gradient_pwr/airbnb-downstream-task/sweeps/q0zh4kxl


[34m[1mwandb[0m: Agent Starting Run: frezf69u with config:
[34m[1mwandb[0m: 	C: 3.0476128634607504e-05
[34m[1mwandb[0m: 	solver_penalty: saga;l2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




VBox(children=(Label(value='246.020 MB of 246.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
USE_HEXES_ATTRS,▁
USE_ORTOPHOTO,▁
USE_OSMNX_ATTRS,▁
accuracy_fold_0,▁
accuracy_fold_1,▁
auc_fold_0,▁
auc_fold_1,▁
f1_fold_0,▁
f1_fold_1,▁
mean_accuracy,▁

0,1
USE_HEXES_ATTRS,1
USE_ORTOPHOTO,0
USE_OSMNX_ATTRS,0
accuracy_fold_0,0.66236
accuracy_fold_1,0.4749
auc_fold_0,0.85888
auc_fold_1,0.75099
data_structure,tabular
f1_fold_0,0.65945
f1_fold_1,0.52376


[34m[1mwandb[0m: Agent Starting Run: wq01iof9 with config:
[34m[1mwandb[0m: 	C: 0.00038966599496010175
[34m[1mwandb[0m: 	solver_penalty: liblinear;l2
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




VBox(children=(Label(value='0.169 MB of 0.169 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
USE_HEXES_ATTRS,▁
USE_ORTOPHOTO,▁
USE_OSMNX_ATTRS,▁
accuracy_fold_0,▁
accuracy_fold_1,▁
auc_fold_0,▁
auc_fold_1,▁
f1_fold_0,▁
f1_fold_1,▁
mean_accuracy,▁

0,1
USE_HEXES_ATTRS,1
USE_ORTOPHOTO,0
USE_OSMNX_ATTRS,0
accuracy_fold_0,0.63521
accuracy_fold_1,0.51083
auc_fold_0,0.87526
auc_fold_1,0.8324
data_structure,tabular
f1_fold_0,0.67764
f1_fold_1,0.56163


Sweep for config 3/3 in progress...


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/staszek/.netrc


Create sweep with ID: 0b734yxx
Sweep URL: https://wandb.ai/gradient_pwr/airbnb-downstream-task/sweeps/0b734yxx


[34m[1mwandb[0m: Agent Starting Run: i501rj1u with config:
[34m[1mwandb[0m: 	epochs: 16
[34m[1mwandb[0m: 	hidden_channels: 10
[34m[1mwandb[0m: 	learning_rate: 0.004414493139081901
[34m[1mwandb[0m: 	lin_layer_size: 32
[34m[1mwandb[0m: 	num_conv_layers: 3
[34m[1mwandb[0m: 	num_lin_layers: 1
[34m[1mwandb[0m: 	weight_decay: 0.0006443006784313302
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 4.6 K 
------------------------------------
4.6 K     Trainable params
0         Non-trainable params
4.6 K     Total params
0.018     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=16` reached.


VBox(children=(Label(value='83.675 MB of 83.675 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
USE_HEXES_ATTRS,▁
USE_ORTOPHOTO,▁
USE_OSMNX_ATTRS,▁
accuracy_fold_0,▁
auc_fold_0,▁
f1_fold_0,▁
mean_accuracy,▁
mean_auc,▁
mean_f1,▁

0,1
USE_HEXES_ATTRS,0
USE_ORTOPHOTO,0
USE_OSMNX_ATTRS,1
accuracy_fold_0,0.59059
auc_fold_0,0.82539
data_structure,graph
f1_fold_0,0.58472
mean_accuracy,0.59059
mean_auc,0.82539
mean_f1,0.58472


[34m[1mwandb[0m: Agent Starting Run: dlm0e5kw with config:
[34m[1mwandb[0m: 	epochs: 17
[34m[1mwandb[0m: 	hidden_channels: 20
[34m[1mwandb[0m: 	learning_rate: 0.003169567882768765
[34m[1mwandb[0m: 	lin_layer_size: 32
[34m[1mwandb[0m: 	num_conv_layers: 5
[34m[1mwandb[0m: 	num_lin_layers: 0
[34m[1mwandb[0m: 	weight_decay: 0.0006669555100067706
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 16.8 K
------------------------------------
16.8 K    Trainable params
0         Non-trainable params
16.8 K    Total params
0.067     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=17` reached.


VBox(children=(Label(value='0.108 MB of 0.108 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
USE_HEXES_ATTRS,▁
USE_ORTOPHOTO,▁
USE_OSMNX_ATTRS,▁
accuracy_fold_0,▁
auc_fold_0,▁
f1_fold_0,▁
mean_accuracy,▁
mean_auc,▁
mean_f1,▁

0,1
USE_HEXES_ATTRS,0
USE_ORTOPHOTO,0
USE_OSMNX_ATTRS,1
accuracy_fold_0,0.56735
auc_fold_0,0.8027
data_structure,graph
f1_fold_0,0.60167
mean_accuracy,0.56735
mean_auc,0.8027
mean_f1,0.60167
