In [1]:
import os
import sys

notebook_path = os.path.abspath("__file__")
notebook_directory = os.path.dirname(notebook_path)
parent_directory = os.path.dirname(notebook_directory)

parent_parent_directory = os.path.dirname(parent_directory)

sys.path.append(parent_parent_directory)

In [2]:
import geopandas as gpd
from src.organized_datasets_creation.utils import resolve_nominatim_city_name
from src.graph_layering.create_dataframes import create_osmnx_dataframes
from src.organized_datasets_creation.utils import convert_nominatim_name_to_filename
from src.graph_layering.graph_layer_creator import GraphLayerController
import pandas as pd
from typing import cast
import os
from src.graph_layering.graph_layer_creator import SourceType
import warnings
from src.graph_layering.create_hetero_data import create_hetero_data

from tqdm import tqdm

from tqdm import tqdm
# import wandb.util
# import wandb
import os

import glob
import numpy as np

In [3]:
# WANDB_API_KEY = os.environ.get("WANDB_API_KEY", None)
# assert (
#     WANDB_API_KEY is not None
# ), "WANDB_API_KEY is not set, did you forget it in the config file?"

In [52]:
# general settings
# ORGANIZED_DATASETS_LOCATION = (
#     "/home/staszek/mgr/gradient/gradient/data/organized-datasets"
# )
ORGANIZED_DATASETS_LOCATION = (
    "C:/Users/Natalia/Desktop/gradient_new/gradient/data/organized-datasets"
)

# downstream task settings
# ACCIDENTS_LOCATION = "/home/staszek/mgr/gradient/gradient/data/wypadki-pl/accidents.csv"
ACCIDENTS_LOCATION = "C:/Users/Natalia/Desktop/gradient_new/gradient/data/wypadki-pl/accidents.csv"
# TRAIN_SAVE_DIR = "/media/staszek/m2-mint/gradient_logs/"

SWEEP_RUNS_COUNT = 50
EPOCHS = 300

H3_RESOLUTION = 9
H3_YEAR = 2017
H3_EMBEDDING_METHOD = "count-embedder"

DATASET_CONTRUCTION_METHOD = "drop-non-matching-columns"

WANDB_SWEEP_PARAMS = {
    "method": "bayes",
    "metric": {"name": "mean_f1", "goal": "maximize"},
    "parameters": {
        "hidden_channels": {"values": [10, 20, 30, 40, 50]},
        "learning_rate": {
            "distribution": "log_uniform_values",
            "min": 1e-5,
            "max": 1e-2,
        },
        "num_conv_layers": {"values": [1, 2, 3, 4, 5]},
        "lin_layer_size": {"values": [8, 16, 32, 64, 128]},
        "num_lin_layers": {"values": [0, 1, 2, 3, 4]},
        "weight_decay": {
            "distribution": "log_uniform_values",
            "min": 1e-5,
            "max": 1e-2,
        },
    },
}

In [5]:
available_h3_embedding_methods = glob.glob(ORGANIZED_DATASETS_LOCATION + "/**/**/**/*")
available_h3_embedding_methods = [
    os.path.basename(x) for x in available_h3_embedding_methods
]
available_h3_embedding_methods = np.unique(available_h3_embedding_methods).tolist()
print("Available h3 embedding methods:")
assert H3_EMBEDDING_METHOD in available_h3_embedding_methods, (
    f"Chosen h3 embedding method is not available. "
    f"Available methods are: {available_h3_embedding_methods}"
)
print(available_h3_embedding_methods)

Available h3 embedding methods:
['count-embedder', 'hex2vec', 'highway2vec']


In [6]:
accidents = gpd.read_file(ACCIDENTS_LOCATION)

In [7]:
cities = list(map(lambda x: x + ", Poland", accidents["mie_nazwa"].unique()))
print("Cities:")
print(cities)

Cities:
['WrocÅ‚aw, Poland', 'Szczecin, Poland', 'PoznaÅ„, Poland', 'KrakÃ³w, Poland', 'Warszawa, Poland']


In [8]:
import pandas as pd
import geopandas as gpd

# Read the file using pandas with the correct encoding
accidents = pd.read_csv(ACCIDENTS_LOCATION, encoding='utf-8')

# Convert the DataFrame to a GeoDataFrame if necessary
# Assuming 'wsp_gps_x' and 'wsp_gps_y' are the longitude and latitude
geometry = gpd.points_from_xy(accidents.wsp_gps_x, accidents.wsp_gps_y)
gdf = gpd.GeoDataFrame(accidents, geometry=geometry)

# Extract and process city names
cities = list(map(lambda x: x + ", Poland", gdf["mie_nazwa"].unique()))
print("Cities:")
print(cities)


Cities:
['Wrocław, Poland', 'Szczecin, Poland', 'Poznań, Poland', 'Kraków, Poland', 'Warszawa, Poland']


In [9]:
def create_gdfs(city_name: str, h3_resolution: int, year: int, method: str):
    osmnx_nodes, osmnx_edges = create_osmnx_dataframes(
        df_accidents=accidents, nominatim_city_name=city_name
    )
    assert (
        method in available_h3_embedding_methods
    ), f"H3 embedding method {method} not available, available methods: {available_h3_embedding_methods}"
    hexes: gpd.GeoDataFrame = gpd.read_parquet(
        os.path.join(
            ORGANIZED_DATASETS_LOCATION,
            f"{convert_nominatim_name_to_filename(resolve_nominatim_city_name(city_name))}/{year}/h{h3_resolution}/{method}/dataset.parquet",
        )
    )
    hexes = (
        hexes.rename(columns={"region_id": "h3_id"})
        .rename_axis("region_id", axis=0)
        .drop(columns="accidents_count")
    )  # we will be using different aggregation type than the one in the dataset

    return dict(osmnx_nodes=osmnx_nodes, osmnx_edges=osmnx_edges, hexes=hexes)


# with warnings.catch_warnings():
#     warnings.simplefilter("ignore")
#     print("Creating gdfs...")
#     gdfs_dict = {}
#     for city_name in tqdm(cities):
#         print(city_name)
#         gdfs_dict[city_name] = create_gdfs(
#             city_name, H3_RESOLUTION, H3_YEAR, method=H3_EMBEDDING_METHOD
#         )
        


# Usuwanie kolumn, które nie są wspólne dla wszystkich miast


In [94]:
def get_presence_df(gdfs_dict, tested_df_name):
    presence_df = pd.DataFrame(
        list(
            map(
                lambda v: (v[0], v[1][tested_df_name].columns.to_list()),
                list(gdfs_dict.items()),
            )
        ),
        columns=["city_name", "col"],
    ).explode("col")
    presence_df = (
        pd.get_dummies(presence_df, columns=["col"], prefix="", prefix_sep="")
        .groupby("city_name")
        .sum()
    )
    return presence_df


def filter_presence_df(df):
    return df.loc[:, df.sum(axis=0) == len(cities)]


def get_common_columns(gdfs_dict, tested_df_name):
    df_columns_presence = get_presence_df(gdfs_dict, tested_df_name)
    df_common_columns = filter_presence_df(df_columns_presence)
    return df_common_columns

In [95]:
df_osmnx_node_common_columns = get_common_columns(gdfs_dict, "osmnx_nodes")
df_osmnx_node_common_columns

Unnamed: 0_level_0,accidents_count,crossing,geometry,give_way,mini_roundabout,motorway_junction,osmid,passing_place,stop,street_count,traffic_signals,turning_circle,x,y
city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
"Kraków, Poland",1,1,1,1,1,1,1,1,1,1,1,1,1,1
"Poznań, Poland",1,1,1,1,1,1,1,1,1,1,1,1,1,1
"Szczecin, Poland",1,1,1,1,1,1,1,1,1,1,1,1,1,1
"Warszawa, Poland",1,1,1,1,1,1,1,1,1,1,1,1,1,1
"Wrocław, Poland",1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [96]:
df_osmnx_edge_common_columns = get_common_columns(gdfs_dict, "osmnx_edges")
df_osmnx_edge_common_columns

Unnamed: 0_level_0,access_0,access_destination,access_no,access_permissive,access_yes,bridge_0,bridge_viaduct,bridge_yes,geometry,highway_living_street,...,length,maxspeed,oneway,reversed,tunnel_0,tunnel_building_passage,tunnel_yes,u,v,width
city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Kraków, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Poznań, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Szczecin, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Warszawa, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Wrocław, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [97]:
df_hexes_common_columns = get_common_columns(gdfs_dict, "hexes")
df_hexes_common_columns

Unnamed: 0_level_0,aeroway_aerodrome,aeroway_helipad,aeroway_runway,amenity_animal_shelter,amenity_arts_centre,amenity_atm,amenity_bank,amenity_bar,amenity_bbq,amenity_bench,...,water_pond,water_reservoir,water_river,water_wastewater,waterway_canal,waterway_ditch,waterway_drain,waterway_river,waterway_stream,waterway_weir
city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Kraków, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Poznań, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Szczecin, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Warszawa, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Wrocław, Poland",1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [98]:
print("Deleting columns that are not common for all cities...")
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for gdf_for_city in gdfs_dict.values():
        osmnx_nodes = gdf_for_city["osmnx_nodes"]
        osmnx_edges = gdf_for_city["osmnx_edges"]
        hexes = gdf_for_city["hexes"]

        osmnx_nodes.drop(
            columns=osmnx_nodes.columns.difference(
                df_osmnx_node_common_columns.columns
            ),
            inplace=True,
        )
        osmnx_edges.drop(
            columns=osmnx_edges.columns.difference(
                df_osmnx_edge_common_columns.columns
            ),
            inplace=True,
        )
        hexes.drop(
            columns=hexes.columns.difference(df_hexes_common_columns.columns),
            inplace=True,
        )

        gdf_for_city["osmnx_nodes"] = osmnx_nodes.reindex(
            columns=df_osmnx_node_common_columns.columns
        )
        gdf_for_city["osmnx_edges"] = osmnx_edges.reindex(
            columns=df_osmnx_edge_common_columns.columns
        )
        gdf_for_city["hexes"] = hexes.reindex(columns=df_hexes_common_columns.columns)

        gdf_for_city["controller"] = GraphLayerController(
            gdf_for_city["hexes"],
            gdf_for_city["osmnx_nodes"],
            gdf_for_city["osmnx_edges"],
        )

Deleting columns that are not common for all cities...


In [99]:
def patch_hexes_with_y(
    osmnx_nodes: gpd.GeoDataFrame,
    hexes: gpd.GeoDataFrame,
    controller: GraphLayerController,
):
    virtual_edges = controller.get_virtual_edges_to_hexes(SourceType.OSMNX_NODES)
    hexes_with_y = cast(
        gpd.GeoDataFrame,
        hexes.merge(
            virtual_edges.merge(osmnx_nodes, left_on="source_id", right_index=True)[
                ["region_id", "accidents_count"]
            ]
            .groupby("region_id")
            .sum(),
            left_index=True,
            right_index=True,
            how="left",
        ).fillna(0),
    )
    hexes_with_y["accident_occured"] = (hexes_with_y["accidents_count"] > 0).astype(int)
    hexes_with_y.drop(columns="accidents_count", inplace=True)
    controller.hexes_gdf = hexes_with_y
    controller._hexes_centroids_gdf = controller._create_hexes_centroids_gdf()

In [100]:
print("Patching hexes with y...")
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for gdfs in gdfs_dict.values():
        patch_hexes_with_y(gdfs["osmnx_nodes"], gdfs["hexes"], gdfs["controller"])

Patching hexes with y...


In [103]:
def create_torch_geometric_hetero_data(
    osmnx_nodes, osmnx_edges, hexes, controller: GraphLayerController
):
    edges_attr_columns = osmnx_edges.columns[
        ~osmnx_edges.columns.isin(["u", "v", "key", "geometry"])
    ]
    nodes_attr_columns = osmnx_nodes.columns[
        ~osmnx_nodes.columns.isin(["geometry", "x", "y", "osmid", "accidents_count"])
    ]
    hexes_attr_columns = hexes.columns[~hexes.columns.isin(["geometry", "h3_id"])]

    data = create_hetero_data(
        controller,
        hexes_attrs_columns_names=hexes_attr_columns,
        osmnx_edge_attrs_columns_names=edges_attr_columns,
        osmnx_node_attrs_columns_names=nodes_attr_columns,
        virtual_edge_attrs_columns_names=[],
        hexes_y_columns_names=["accident_occured"],
    )
    return data


data_dict = {
    city_name: create_torch_geometric_hetero_data(**gdfs)
    for city_name, gdfs in gdfs_dict.items()
}

In [None]:
def shift_elements_right(lst):
    shifted_lst = [lst[-1]] + lst[:-1]
    return shifted_lst


cities_names_list = list(data_dict.keys())

# val + test
folds_tuples = list(zip(shift_elements_right(cities_names_list), cities_names_list))

In [58]:
print("Folds tuples (val, test)")
print(folds_tuples)

Folds tuples (val, test)
[('Warszawa, Poland', 'Wrocław, Poland'), ('Wrocław, Poland', 'Szczecin, Poland'), ('Szczecin, Poland', 'Poznań, Poland'), ('Poznań, Poland', 'Kraków, Poland'), ('Kraków, Poland', 'Warszawa, Poland')]


In [26]:
configs_list = [
    {
        "hidden_channels": 10,
        "learning_rate": 1e-5,
        "num_conv_layers": 1,
        "lin_layer_size": 8,
        "num_lin_layers": 0,
        "weight_decay": 1e-5,
    },
    {
        "hidden_channels": 30,
        "learning_rate": 1e-5,
        "num_conv_layers": 3,
        "lin_layer_size": 32,
        "num_lin_layers": 2,
        "weight_decay": 1e-5,
    },
    {
        "hidden_channels": 50,
        "learning_rate": 1e-5,
        "num_conv_layers": 5,
        "lin_layer_size": 128,
        "num_lin_layers": 4,
        "weight_decay": 1e-5,
    }
]

for config in configs_list:
    print(config)

{'hidden_channels': 10, 'learning_rate': 1e-05, 'num_conv_layers': 1, 'lin_layer_size': 8, 'num_lin_layers': 0, 'weight_decay': 1e-05}
{'hidden_channels': 30, 'learning_rate': 1e-05, 'num_conv_layers': 3, 'lin_layer_size': 32, 'num_lin_layers': 2, 'weight_decay': 1e-05}
{'hidden_channels': 50, 'learning_rate': 1e-05, 'num_conv_layers': 5, 'lin_layer_size': 128, 'num_lin_layers': 4, 'weight_decay': 1e-05}


In [60]:
# from wandb.util import generate_id

from src.training.train import train


def run_k_fold(hparams):
    # run = wandb.init()
    epochs = EPOCHS

    # config = wandb.config

    lin_layer_sizes = [hparams['lin_layer_size']] * hparams['num_lin_layers']


    hparams = {
        "hidden_channels": hparams['hidden_channels'],
        "lr": hparams['learning_rate'],
        "num_conv_layers": hparams['num_conv_layers'],
        "lin_layer_sizes": lin_layer_sizes,
        "weight_decay": hparams['weight_decay'],
    }

    aucs = []
    accuracies = []
    f1s = []

    # fold_group_id = generate_id()
    TRAIN_SAVE_DIR='res'
    for index, (val_city_name, test_city_name) in enumerate(folds_tuples):
        val_data = [data_dict[val_city_name].to("cpu").clone()]
        train_data = [
            v.to("cpu").clone()
            for k, v in data_dict.items()
            if k != val_city_name and k != test_city_name
        ]
        test_data = data_dict[test_city_name].to("cpu").clone()

        auc, accuracy, f1, model_path = train(
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
            epochs=epochs,
            hparams=hparams,
            train_save_dir=TRAIN_SAVE_DIR,
        )
        # run.log_model(
        #     path=model_path,
        #     name=f"model_{fold_group_id}_fold_{index}",
        # )
        # run.log({f"auc_fold_{index}": auc})
        # run.log({f"accuracy_fold_{index}": accuracy})
        # run.log({f"f1_fold_{index}": f1})

        aucs.append(auc)
        accuracies.append(accuracy)
        f1s.append(f1)

    mean_auc = sum(aucs) / len(aucs)
    mean_accuracy = sum(accuracies) / len(accuracies)
    mean_f1 = sum(f1s) / len(f1s)
    # run.log({"mean_auc": mean_auc})
    # run.log({"mean_accuracy": mean_accuracy})
    # run.log({"mean_f1": mean_f1})
    return {
        "mean_auc": mean_auc,
        "mean_accuracy": mean_accuracy,
        "mean_f1": mean_f1,
        "aucs": aucs,
        "accuracies": accuracies,
        "f1s": f1s,
    }

def main():
    results = []
    # print(sweep_configs)
    for config in configs_list:
        result = run_k_fold(config)
        results.append(result)
        print(f"Config: {config}")
        print(f"Mean AUC: {result['mean_auc']}")
        print(f"Mean Accuracy: {result['mean_accuracy']}")
        print(f"Mean F1: {result['mean_f1']}")
        print("-" * 50)
    
    # Optionally, you can display aggregated results in a more comprehensive format
    mean_aucs = [res["mean_auc"] for res in results]
    mean_accuracies = [res["mean_accuracy"] for res in results]
    mean_f1s = [res["mean_f1"] for res in results]

    print("Overall Results")
    print(f"Mean AUC: {np.mean(mean_aucs)} ± {np.std(mean_aucs)}")
    print(f"Mean Accuracy: {np.mean(mean_accuracies)} ± {np.std(mean_accuracies)}")
    print(f"Mean F1: {np.mean(mean_f1s)} ± {np.std(mean_f1s)}")
    return result
        


result = main()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 14.2 K
------------------------------------
14.2 K    Trainable params
0         Non-trainable params
14.2 K    Total params
0.057     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_09_17_18/checkpoints\model-checkpoint-epoch-epoch=296.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 14.2 K
------------------------------------
14.2 K    Trainable params
0         Non-trainable params
14.2 K    Total params
0.057     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_09_31_25/checkpoints\model-checkpoint-epoch-epoch=299.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 14.2 K
------------------------------------
14.2 K    Trainable params
0         Non-trainable params
14.2 K    Total params
0.057     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_09_40_36/checkpoints\model-checkpoint-epoch-epoch=299.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 14.2 K
------------------------------------
14.2 K    Trainable params
0         Non-trainable params
14.2 K    Total params
0.057     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_09_50_13/checkpoints\model-checkpoint-epoch-epoch=298.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 14.2 K
------------------------------------
14.2 K    Trainable params
0         Non-trainable params
14.2 K    Total params
0.057     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_10_00_54/checkpoints\model-checkpoint-epoch-epoch=298.ckpt
Config: {'hidden_channels': 10, 'learning_rate': 1e-05, 'num_conv_layers': 1, 'lin_layer_size': 8, 'num_lin_layers': 0, 'weight_decay': 1e-05}
Mean AUC: 0.7579575483505506
Mean Accuracy: 0.6100248464264086
Mean F1: 0.48377396916813814
--------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 60.0 K
------------------------------------
60.0 K    Trainable params
0         Non-trainable params
60.0 K    Total params
0.240     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_10_10_03/checkpoints\model-checkpoint-epoch-epoch=278.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 60.0 K
------------------------------------
60.0 K    Trainable params
0         Non-trainable params
60.0 K    Total params
0.240     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_10_44_01/checkpoints\model-checkpoint-epoch-epoch=299.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 60.0 K
------------------------------------
60.0 K    Trainable params
0         Non-trainable params
60.0 K    Total params
0.240     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_11_08_59/checkpoints\model-checkpoint-epoch-epoch=299.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 60.0 K
------------------------------------
60.0 K    Trainable params
0         Non-trainable params
60.0 K    Total params
0.240     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_11_32_53/checkpoints\model-checkpoint-epoch-epoch=299.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 60.0 K
------------------------------------
60.0 K    Trainable params
0         Non-trainable params
60.0 K    Total params
0.240     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_12_00_17/checkpoints\model-checkpoint-epoch-epoch=297.ckpt
Config: {'hidden_channels': 30, 'learning_rate': 1e-05, 'num_conv_layers': 3, 'lin_layer_size': 32, 'num_lin_layers': 2, 'weight_decay': 1e-05}
Mean AUC: 0.813785759984907
Mean Accuracy: 0.7482739187074932
Mean F1: 0.6170900183333908
--------------------------------------------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 251 K 
------------------------------------
251 K     Trainable params
0         Non-trainable params
251 K     Total params
1.006     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_12_21_24/checkpoints\model-checkpoint-epoch-epoch=32.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 251 K 
------------------------------------
251 K     Trainable params
0         Non-trainable params
251 K     Total params
1.006     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_13_25_30/checkpoints\model-checkpoint-epoch-epoch=195.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 251 K 
------------------------------------
251 K     Trainable params
0         Non-trainable params
251 K     Total params
1.006     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_14_38_18/checkpoints\model-checkpoint-epoch-epoch=297.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 251 K 
------------------------------------
251 K     Trainable params
0         Non-trainable params
251 K     Total params
1.006     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_15_54_14/checkpoints\model-checkpoint-epoch-epoch=273.ckpt


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | HeteroGNN | 251 K 
------------------------------------
251 K     Trainable params
0         Non-trainable params
251 K     Total params
1.006     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


Best model path:
res2024_05_22_20_04_03/checkpoints\model-checkpoint-epoch-epoch=187.ckpt
Config: {'hidden_channels': 50, 'learning_rate': 1e-05, 'num_conv_layers': 5, 'lin_layer_size': 128, 'num_lin_layers': 4, 'weight_decay': 1e-05}
Mean AUC: 0.8422804209997838
Mean Accuracy: 0.7746590601440132
Mean F1: 0.6227139914656291
--------------------------------------------------
Overall Results
Mean AUC: 0.8046745764450804 ± 0.03502234488700992
Mean Accuracy: 0.7109859417593051 ± 0.07219834254916535
Mean F1: 0.5745259929890527 ± 0.06421243202133534


In [71]:
# import json 

# with open('base_results.json', 'w') as file:
#     json.dump(result, file)

___

### Dodanie ortofotomap

dla jednego hexsa embeding jest 197x768

In [14]:
import pandas as pd

test = pd.read_parquet('C:/Users/Natalia/Desktop/gradient_new/gradient/data/maps_embeddings/regions_8_emb_Poznań.parquet')

In [19]:
test.iloc()[0]['emb'].shape

(151296,)

In [10]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print("Creating gdfs...")
    gdfs_dict2 = {}
    for city_name in tqdm(cities):
        print(city_name)
        gdfs_dict2[city_name] = create_gdfs(
            city_name, H3_RESOLUTION, H3_YEAR, method=H3_EMBEDDING_METHOD
        )
        

Creating gdfs...


  0%|          | 0/5 [00:00<?, ?it/s]

Wrocław, Poland


 20%|██        | 1/5 [03:08<12:35, 188.76s/it]

Szczecin, Poland


 40%|████      | 2/5 [05:41<08:22, 167.35s/it]

Poznań, Poland


 60%|██████    | 3/5 [12:07<08:54, 267.45s/it]

Kraków, Poland


 80%|████████  | 4/5 [15:40<04:05, 245.80s/it]

Warszawa, Poland


100%|██████████| 5/5 [27:37<00:00, 331.56s/it]


In [11]:
import copy 
import pandas as pd
import geopandas as gpd


def get_presence_df(gdfs_dict, tested_df_name):
    
    presence_df = pd.DataFrame(
        list(
            map(
                lambda v: (v[0], v[1][tested_df_name].columns.to_list()),
                list(gdfs_dict.items()),
            )
        ),
        columns=["city_name", "col"],
    ).explode("col")
    presence_df = (
        pd.get_dummies(presence_df, columns=["col"], prefix="", prefix_sep="")
        .groupby("city_name")
        .sum()
    )
    return presence_df


def filter_presence_df(df):
    return df.loc[:, df.sum(axis=0) == len(cities)]


def get_common_columns(gdfs_dict, tested_df_name):
    df_columns_presence = get_presence_df(gdfs_dict, tested_df_name)
    df_common_columns = filter_presence_df(df_columns_presence)
    return df_common_columns

df_osmnx_node_common_columns = get_common_columns(gdfs_dict2, "osmnx_nodes")
# df_osmnx_node_common_columns

df_osmnx_edge_common_columns = get_common_columns(gdfs_dict2, "osmnx_edges")
# df_osmnx_edge_common_columns


df_hexes_common_columns = get_common_columns(gdfs_dict2, "hexes")
# df_hexes_common_columns
# print("Deleting columns that are not common for all cities...")



with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for gdf_for_city_key in list(gdfs_dict2.keys()):
        print(gdf_for_city_key)
        gdf_for_city = gdfs_dict2[gdf_for_city_key]
        city = gdf_for_city_key.split(',')[0]
        # print(gdf_for_city)
        emb_8 = pd.read_parquet(f'C:/Users/Natalia/Desktop/gradient_new/gradient/data/maps_embeddings/regions_8_emb_{city}.parquet')

        osmnx_nodes = gdf_for_city["osmnx_nodes"]
        osmnx_edges = gdf_for_city["osmnx_edges"]
        hexes = gdf_for_city["hexes"]

        osmnx_nodes.drop(
            columns=osmnx_nodes.columns.difference(
                df_osmnx_node_common_columns.columns
            ),
            inplace=True,
        )
        osmnx_edges.drop(
            columns=osmnx_edges.columns.difference(
                df_osmnx_edge_common_columns.columns
            ),
            inplace=True,
        )
        hexes.drop(
            columns=hexes.columns.difference(df_hexes_common_columns.columns),
            inplace=True,
        )
        # display('h3_id' in hexes.columns)
        #  dodac ortofotomapy
        gdf_for_city["osmnx_nodes"] = osmnx_nodes.reindex(
            columns=df_osmnx_node_common_columns.columns
        )
        gdf_for_city["osmnx_edges"] = osmnx_edges.reindex(
            columns=df_osmnx_edge_common_columns.columns
        )
        # gdf_for_city["hexes"] = hexes.reindex(columns=df_hexes_common_columns.columns)

        # Path to your GeoJSON file
        geojson_file = f'C:/Users/Natalia/Desktop/gradient_new/gradient/data/maps_embeddings/regions_9_emb_{city}.geojson'

        # Read the GeoJSON file
        gdf_8_to_9 = gpd.read_file(geojson_file)[['region_id', 'region_id_res_8']]
        gdf_8_to_9.rename(columns={'region_id': 'h3_id'}, inplace=True) 
        gdf_8_to_9.rename(columns={'region_id_res_8': 'region_id'}, inplace=True)

        emb_9  = pd.merge(gdf_8_to_9, emb_8, on='region_id', how='left')  # You can change 'inner' to 'outer', 'left', or 'right' depending on your needs

        emb_9.drop(
            columns='region_id',
            inplace=True,
        )
        # Display the GeoDataFrame
  
        gdf_for_city["hexes"] = pd.merge(hexes, emb_9, on='h3_id', how='left')  # You can change 'inner' to 'outer', 'left', or 'right' depending on your needs

        zeros_array = np.zeros(151296)

        # Find the indices of rows with NaN values in the specified column
        nan_indices = gdf_for_city["hexes"].index[gdf_for_city["hexes"]['emb'].isna()]

        # Fill NaN values in the specified column with the zeros array
        for idx in nan_indices:
            gdf_for_city["hexes"].at[idx, 'emb'] = zeros_array

        # expanded_emb = gdf_for_city["hexes"]['emb'].apply(pd.Series)

        # # Nadanie nowych nazw kolumn
        # expanded_emb.columns = [f'emb_{i+1}' for i in range(len(expanded_emb.columns))]

        # # Połączenie DataFrame'ów
        # gdf_for_city["hexes"] = pd.concat([gdf_for_city["hexes"].drop(columns=['emb']), expanded_emb], axis=1)


        gdf_for_city["hexes"] = gdf_for_city["hexes"].reindex(columns=list(df_hexes_common_columns.columns)+['emb'])

        gdf_for_city["hexes"] = gdf_for_city["hexes"].reset_index().set_index('index').rename_axis('region_id')
        
        gdf_for_city["controller"] = GraphLayerController(
            gdf_for_city["hexes"],
            gdf_for_city["osmnx_nodes"],
            gdf_for_city["osmnx_edges"],
        )
        

Wrocław, Poland
Szczecin, Poland
Poznań, Poland
Kraków, Poland
Warszawa, Poland


In [12]:
# gdfs_dict22 = copy.deepcopy(gdfs_dict2)


def patch_hexes_with_y(
    osmnx_nodes: gpd.GeoDataFrame,
    hexes: gpd.GeoDataFrame,
    controller: GraphLayerController,
):
    virtual_edges = controller.get_virtual_edges_to_hexes(SourceType.OSMNX_NODES)
    hexes_with_y = cast(
        gpd.GeoDataFrame,
        hexes.merge(
            virtual_edges.merge(osmnx_nodes, left_on="source_id", right_index=True)[
                ["region_id", "accidents_count"]
            ]
            .groupby("region_id")
            .sum(),
            left_index=True,
            right_index=True,
            how="left",
        ).fillna(0),
    )
    hexes_with_y["accident_occured"] = (hexes_with_y["accidents_count"] > 0).astype(int)
    hexes_with_y.drop(columns="accidents_count", inplace=True)
    controller.hexes_gdf = hexes_with_y
    controller._hexes_centroids_gdf = controller._create_hexes_centroids_gdf()
    
print("Patching hexes with y...")
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for gdfs in gdfs_dict2.values():
        patch_hexes_with_y(gdfs["osmnx_nodes"], gdfs["hexes"], gdfs["controller"])

Patching hexes with y...


In [45]:
import psutil
import gc
def memory_usage():
    process = psutil.Process()
    mem_info = process.memory_info()
    return mem_info.rss / 1024 ** 2  # Return memory usage in MB

print("Memory usage before deletion:", memory_usage(), "MB")

Memory usage before deletion: 202.16015625 MB


In [18]:
del zeros_array, nan_indices, gdfs
gc.collect()
print("Memory usage after deletion:", memory_usage(), "MB")

Memory usage after deletion: 2152.06640625 MB


In [24]:


# del df_osmnx_node_common_columns, df_osmnx_edge_common_columns, df_hexes_common_columns, city, emb_8, osmnx_nodes, osmnx_edges, hexes, geojson_file, gdf_8_to_9, emb_9, gdf_for_city, zeros_array, nan_indices, gdfs




Memory usage before deletion: 2322.4296875 MB


NameError: name 'city' is not defined

In [16]:
# np.set_printoptions(threshold=np.inf)
# dataset = copy.deepcopy(gdfs_dict2)
# # # Save dataset to HDF5 file
# with h5py.File('dataset_citis.h5', 'w') as f:
#     for column, rows in dataset.items():
#         for row, geo_df in rows.items():
#             # Create a dataset name that includes both column and row names
#             dataset_name = f'{column}/{row}'
#             # Convert the GeoDataFrame to a format suitable for saving
#             # For example, you can convert it to a CSV string
#             csv_string = geo_df.to_csv()
            
#             if row=='hexes':
#                 print('h')
#                 # Convert the 'emb' column from string representations to NumPy arrays
                

#                 geo_df['emb'] = geo_df['emb'].apply(lambda x: np.array2string(x, separator=',', formatter={'float_kind': lambda x: "%.8f" % x}, suppress_small=True, max_line_width=np.inf))
#                 # print(geo_df['emb'][0])
#             # Save the GeoDataFrame to the HDF5 file
#             f.create_dataset(dataset_name, data=csv_string.encode('utf-8'))


In [58]:
# # Load dataset from HDF5 file
# def string_to_polygon(string_rep):
#     return loads(string_rep)
    
# loaded_dataset = {}
# with h5py.File('dataset.h5', 'r') as f:
#     for column in f.keys():
#         loaded_dataset[column] = {}
#         for row in f[column].keys():
#             # Retrieve the CSV string from the dataset
#             csv_string = f[column][row][()]
#             # Convert the CSV string back to a DataFrame
#             # For example, you can use pandas to read the CSV string and preserve index and column labels
#             geo_df = pd.read_csv(io.StringIO(csv_string.decode('utf-8')), index_col=0)  # Assuming the first column is the index
#             geo_df['geometry'] = geo_df['geometry'].apply(string_to_polygon)
#             loaded_dataset[column][row] = geo_df


In [17]:
# display((dataset['Kraków, Poland']['osmnx_edges']==loaded_dataset['Kraków, Poland']['osmnx_edges']).apply(pd.Series.value_counts))
# display((dataset['Kraków, Poland']['osmnx_nodes']==loaded_dataset['Kraków, Poland']['osmnx_nodes']).apply(pd.Series.value_counts))
# display((dataset['Kraków, Poland']['hexes']==loaded_dataset['Kraków, Poland']['hexes']).apply(pd.Series.value_counts))


In [18]:
# def string_to_array(string_rep):
#     # Remove brackets and split the string to get individual elements
#     print(string_rep)
#     elements = string_rep.strip('[]').split()
#     print(elements)
#     # Convert elements to floats and create a NumPy array
#     array = np.array([float(element) for element in elements])


#     print(array)
#     return np.array([int(element) for element in elements])

# # Apply the function to the 'emb_str' column
# loaded_dataset['Kraków, Poland']['hexes']['emb'] = loaded_dataset['Kraków, Poland']['hexes']['emb'].apply(string_to_array)

In [62]:
# type(dataset['Kraków, Poland']['hexes']['emb'][0])

numpy.ndarray

In [19]:
# from shapely.wkt import loads

# def string_to_polygon(string_rep):
#     return loads(string_rep)

# # Apply the function to the 'geometry_str' column
# # loaded_dataset['Kraków, Poland']['hexes']['geometry'] = loaded_dataset['Kraków, Poland']['hexes']['geometry'].apply(string_to_polygon)
# loaded_dataset['Kraków, Poland']['osmnx_edges']['geometry'] = loaded_dataset['Kraków, Poland']['osmnx_edges']['geometry'].apply(string_to_polygon)

# # (dataset['Kraków, Poland']['hexes']['geometry'][0]== loaded_dataset['Kraków, Poland']['hexes']['geometry'][0])


In [20]:
# dataset['Kraków, Poland']['osmnx_edges']['geometry'][0]

In [19]:
from typing import Iterable
import torch
import gc
from src.graph_layering.city_hetero_data import CityHeteroData
from src.graph_layering.graph_layer_creator import GraphLayerController, SourceType


def create_hetero_data_emb(
    controller: GraphLayerController,
    hexes_attrs_columns_names: Iterable[str],
    osmnx_node_attrs_columns_names: Iterable[str],
    osmnx_edge_attrs_columns_names: Iterable[str],
    virtual_edge_attrs_columns_names: Iterable[str],
    hexes_y_columns_names: Iterable[str],
    emb:str,
    squeeze_y: bool = True
) -> CityHeteroData:
    data = CityHeteroData()
    edges_between_hexes = controller.get_edges_between_hexes()
    edges_between_source_and_hexes = controller.get_virtual_edges_to_hexes(
        SourceType.OSMNX_NODES
    )
    # print(controller.hexes_centroids_gdf)
    osm = torch.tensor(
        controller.hexes_centroids_gdf[hexes_attrs_columns_names].to_numpy(),
        dtype=torch.float32,
    )
     
    s = np.stack(controller.hexes_centroids_gdf[emb].values)

    emb_map = torch.tensor(s, dtype=torch.float16)
    del s
    gc.collect()
    data.hex.x = torch.cat((osm, emb_map), dim=1)
    del osm, emb_map
    gc.collect()
    print(data.hex.x.shape)

    data.hex.y = torch.tensor(
        controller.hexes_centroids_gdf[hexes_y_columns_names].to_numpy(),
        dtype=torch.float32,
    ).to(torch.int64)

    data.osmnx_node.x = torch.tensor(
        controller.osmnx_nodes_gdf[osmnx_node_attrs_columns_names].to_numpy(),
        dtype=torch.float32,
    )

    data.hex_connected_to_hex.edge_index = torch.tensor(
        edges_between_hexes.merge(
            controller.hexes_gdf.reset_index(),
            left_on="u",
            right_on="h3_id",
        )
        .rename(columns={"region_id": "u_region_id"})
        .merge(
            controller.hexes_gdf.reset_index(),
            left_on="v",
            right_on="h3_id",
        )
        .rename(columns={"region_id": "v_region_id"})[["u_region_id", "v_region_id"]]
        .to_numpy()
        .T
    )

    node_to_node_connections = (
        controller.osmnx_edges_gdf.merge(
            controller.osmnx_nodes_gdf.reset_index(), left_on="u", right_on="osmid"
        )
        .rename(columns={"node_id": "u_node_id"})
        .merge(controller.osmnx_nodes_gdf.reset_index(), left_on="v", right_on="osmid")
        .rename(columns={"node_id": "v_node_id"})
    )

    data.osmnx_node_connected_to_osmnx_node.edge_index = torch.tensor(
        node_to_node_connections[["u_node_id", "v_node_id"]].to_numpy().T
    )

    data.osmnx_node_connected_to_osmnx_node.edge_attr = torch.tensor(
        node_to_node_connections[osmnx_edge_attrs_columns_names].to_numpy(),
        dtype=torch.float32,
    )

    data.osmnx_node_connected_to_hex.edge_index = torch.tensor(
        edges_between_source_and_hexes[["source_id", "region_id"]].to_numpy().T
    )

    data.osmnx_node_connected_to_hex.edge_attr = torch.tensor(
        edges_between_source_and_hexes[virtual_edge_attrs_columns_names].to_numpy(),
        dtype=torch.float32,
    )
    del node_to_node_connections
    gc.collect()
    if squeeze_y:
        data.hex.y = data.hex.y.squeeze()

    return data


In [21]:
def create_torch_geometric_hetero_data(
    osmnx_nodes, osmnx_edges, hexes, controller: GraphLayerController
):
    edges_attr_columns = osmnx_edges.columns[
        ~osmnx_edges.columns.isin(["u", "v", "key", "geometry"])
    ]
    nodes_attr_columns = osmnx_nodes.columns[
        ~osmnx_nodes.columns.isin(["geometry", "x", "y", "osmid", "accidents_count"])
    ]
    hexes_attr_columns = hexes.columns[~hexes.columns.isin(["geometry", "h3_id", 'emb'])]

    data = create_hetero_data_emb(
        controller,
        hexes_attrs_columns_names=hexes_attr_columns,
        osmnx_edge_attrs_columns_names=edges_attr_columns,
        osmnx_node_attrs_columns_names=nodes_attr_columns,
        virtual_edge_attrs_columns_names=[],
        hexes_y_columns_names=["accident_occured"],
        emb='emb',
    )
    return data

# data_dict = {}
for city_name, gdfs in list(gdfs_dict2.items()):
    data_dict[city_name]= create_torch_geometric_hetero_data(**gdfs)

# def shift_elements_right(lst):
#     shifted_lst = [lst[-1]] + lst[:-1]
#     return shifted_lst


# cities_names_list = list(data_dict.keys())

# # val + test
# folds_tuples = list(zip(shift_elements_right(cities_names_list), cities_names_list))

torch.Size([3168, 151745])
torch.Size([3534, 151745])
torch.Size([2945, 151745])


In [24]:

def shift_elements_right(lst):
    shifted_lst = [lst[-1]] + lst[:-1]
    return shifted_lst


cities_names_list = list(data_dict.keys())

# val + test
folds_tuples = list(zip(shift_elements_right(cities_names_list), cities_names_list))

In [25]:
folds_tuples

[('Poznań, Poland', 'Kraków, Poland'),
 ('Kraków, Poland', 'Warszawa, Poland'),
 ('Warszawa, Poland', 'Wrocław, Poland'),
 ('Wrocław, Poland', 'Szczecin, Poland'),
 ('Szczecin, Poland', 'Poznań, Poland')]

In [68]:
import gc

for name in dir():
    if name != "data_dict" and name != "folds_tuples" and name != "gc" and name != "configs_list" and not name.startswith('_'):
        del globals()[name]

# Call garbage collector to free up memory
gc.collect()

import psutil

def memory_usage():
    process = psutil.Process()
    mem_info = process.memory_info()
    return mem_info.rss / 1024 ** 2  # Return memory usage in MB

print("Memory usage before deletion:", memory_usage(), "MB")

Memory usage before deletion: 2548.04296875 MB


In [60]:
import os
import sys

notebook_path = os.path.abspath("__file__")
notebook_directory = os.path.dirname(notebook_path)
parent_directory = os.path.dirname(notebook_directory)

parent_parent_directory = os.path.dirname(parent_directory)

sys.path.append(parent_parent_directory)

In [61]:
print("Memory usage before deletion:", memory_usage(), "MB")

Memory usage before deletion: 308.0078125 MB


In [64]:
import psutil

def get_free_memory():
    # Get virtual memory statistics
    mem = psutil.virtual_memory()
    
    # Return the amount of free memory in MB
    return mem.available / (1024 ** 2)

# Example usage:
free_memory_before = get_free_memory()
print(f"Free memory before cleaning variables: {free_memory_before:.2f} MB")


Free memory before cleaning variables: 1442.79 MB


In [63]:
from src.training.train import train
import json 
# import torch
# import psutil
import gc
# import copy 
# import pandas as pd
# import geopandas as gpd
import geopandas as gpd
# from src.organized_datasets_creation.utils import resolve_nominatim_city_name
# from src.graph_layering.create_dataframes import create_osmnx_dataframes
# from src.organized_datasets_creation.utils import convert_nominatim_name_to_filename
# from src.graph_layering.graph_layer_creator import GraphLayerController
# import pandas as pd
# from typing import cast
# import os
# from src.graph_layering.graph_layer_creator import SourceType
# import warnings
# from src.graph_layering.create_hetero_data import create_hetero_data

# from tqdm import tqdm

# from tqdm import tqdm
# # import wandb.util
# # import wandb
# import os

# import glob
import numpy as np



def run_k_fold(hparams):
    # run = wandb.init()
    epochs = 5#EPOCHS

    # config = wandb.config

    lin_layer_sizes = [hparams['lin_layer_size']] * hparams['num_lin_layers']


    hparams = {
        "hidden_channels": hparams['hidden_channels'],
        "lr": hparams['learning_rate'],
        "num_conv_layers": hparams['num_conv_layers'],
        "lin_layer_sizes": lin_layer_sizes,
        "weight_decay": hparams['weight_decay'],
    }

    aucs = []
    accuracies = []
    f1s = []

    # fold_group_id = generate_id()
    TRAIN_SAVE_DIR='res'
    for index, (val_city_name, test_city_name) in enumerate(folds_tuples):
        val_data = [data_dict[val_city_name].to("cpu").clone()]
        train_data = [
            v.to("cpu").clone()
            for k, v in data_dict.items()
            if k != val_city_name and k != test_city_name
        ]
        test_data = data_dict[test_city_name].to("cpu").clone()

        auc, accuracy, f1, model_path = train(
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
            epochs=epochs,
            hparams=hparams,
            train_save_dir=TRAIN_SAVE_DIR,
        )
        # run.log_model(
        #     path=model_path,
        #     name=f"model_{fold_group_id}_fold_{index}",
        # )
        # run.log({f"auc_fold_{index}": auc})
        # run.log({f"accuracy_fold_{index}": accuracy})
        # run.log({f"f1_fold_{index}": f1})

        aucs.append(auc)
        accuracies.append(accuracy)
        f1s.append(f1)

    mean_auc = sum(aucs) / len(aucs)
    mean_accuracy = sum(accuracies) / len(accuracies)
    mean_f1 = sum(f1s) / len(f1s)
    # run.log({"mean_auc": mean_auc})
    # run.log({"mean_accuracy": mean_accuracy})
    # run.log({"mean_f1": mean_f1})

    del auc, accuracy, f1, model_path, test_data, train_data, val_data
    gc.collect()
    return {
        "mean_auc": mean_auc,
        "mean_accuracy": mean_accuracy,
        "mean_f1": mean_f1,
        "aucs": aucs,
        "accuracies": accuracies,
        "f1s": f1s,
    }

def main():
    results = []
    # print(sweep_configs)
    for i, config in enumerate(configs_list):
        result = run_k_fold(config)
        results.append(result)
        print(f"Config: {config}")
        print(f"Mean AUC: {result['mean_auc']}")
        print(f"Mean Accuracy: {result['mean_accuracy']}")
        print(f"Mean F1: {result['mean_f1']}")
        print("-" * 50)

        with open(f'emb_results_{i}.json', 'w') as file:
            json.dump(result, file)
    del result
    gc.collect() 
    # Optionally, you can display aggregated results in a more comprehensive format
    mean_aucs = [res["mean_auc"] for res in results]
    mean_accuracies = [res["mean_accuracy"] for res in results]
    mean_f1s = [res["mean_f1"] for res in results]

    print("Overall Results")
    print(f"Mean AUC: {np.mean(mean_aucs)} ± {np.std(mean_aucs)}")
    print(f"Mean Accuracy: {np.mean(mean_accuracies)} ± {np.std(mean_accuracies)}")
    print(f"Mean F1: {np.mean(mean_f1s)} ± {np.std(mean_f1s)}")
    return results
        


result = main()

RuntimeError: [enforce fail at alloc_cpu.cpp:80] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2074050660 bytes.

In [70]:

def print_memory_usage():
    mem = psutil.virtual_memory()
    print(f"Total: {mem.total / (1024 ** 2):.2f} MB")
    print(f"Available: {mem.available / (1024 ** 2):.2f} MB")
    print(f"Used: {mem.used / (1024 ** 2):.2f} MB")
    print(f"Free: {mem.free / (1024 ** 2):.2f} MB")

print_memory_usage()

Total: 8055.59 MB
Available: 953.99 MB
Used: 7101.60 MB
Free: 953.99 MB
