## Auxiliares

In [None]:
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import os

def baixar_shapefile_estado(estado, destino='/content/', timeout=60):
    """Download and extract the TIGER/Line 2021 census-tract shapefile of a state.

    The data.gov dataset page for the state is fetched, the first link whose
    href ends in ``.zip`` is downloaded and the archive is extracted into
    ``destino``.

    Args:
        estado: State slug interpolated into the dataset URL (e.g. ``"georgia"``).
        destino: Directory the archive is extracted into. Defaults to the
            original hard-coded ``'/content/'``.
        timeout: Seconds to wait on each HTTP request. Without a timeout
            ``requests`` may block indefinitely on an unresponsive server.

    Returns:
        ``destino`` when the archive was downloaded and extracted; ``None``
        when a page/archive request did not return HTTP 200 or no ``.zip``
        link was found. (The original returned ``None`` in every case, so a
        failed download was indistinguishable from a successful one.)
    """
    base_url = f'https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-{estado}-census-tracts'
    response = requests.get(base_url, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    link_tag = soup.find('a', href=lambda href: href and href.endswith('.zip'))
    if not link_tag:
        return None

    download_url = link_tag['href']
    # Relative hrefs are resolved against the catalog host.
    if not download_url.startswith('http'):
        download_url = 'https://catalog.data.gov' + download_url

    zip_response = requests.get(download_url, timeout=timeout)
    if zip_response.status_code != 200:
        return None

    # The archive is unpacked straight from memory; nothing is written besides its members.
    with zipfile.ZipFile(io.BytesIO(zip_response.content)) as zip_ref:
        zip_ref.extractall(destino)
    return destino


In [None]:
# Authenticate the current Colab user before the Drive client is built below.
from google.colab import auth
auth.authenticate_user()

from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import io, os
# Drive API v3 client; later cells pass it to download_from_folder_by_name.
drive_service = build("drive", "v3")

def download_from_folder_by_name(service, folder_id: str, filename: str, dest_path: str):
    """Download the first Drive file named ``filename`` inside a folder.

    Args:
        service: Drive API client exposing ``files().list`` and ``files().get_media``.
        folder_id: ID of the Drive folder that must be a parent of the file.
        filename: Exact file name to look up (non-trashed files only).
        dest_path: Local path the content is written to. Its parent directory
            is created when the path has one.

    Returns:
        ``dest_path``.

    Raises:
        FileNotFoundError: If the listing returns no matching file.
    """
    def _quote(value: str) -> str:
        # Drive query strings are single-quoted; backslashes and single quotes
        # inside a value must be backslash-escaped or the query is malformed.
        return value.replace("\\", "\\\\").replace("'", "\\'")

    q = f"'{_quote(folder_id)}' in parents and name = '{_quote(filename)}' and trashed = false"
    res = service.files().list(
        q=q,
        fields="files(id,name,size,mimeType)",
        supportsAllDrives=True,
        includeItemsFromAllDrives=True,
        pageSize=1,
    ).execute()
    files = res.get("files", [])
    if not files:
        raise FileNotFoundError(f"Arquivo não encontrado na pasta {folder_id}: {filename}")

    file_id = files[0]["id"]
    # os.makedirs("") raises, so only create the parent when dest_path has one.
    parent_dir = os.path.dirname(dest_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    request = service.files().get_media(fileId=file_id, supportsAllDrives=True)
    with open(dest_path, "wb") as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        # next_chunk() reports completion through its second return value.
        while not done:
            status, done = downloader.next_chunk()
    return dest_path

# Georgia


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-georgia-census-tracts

In [None]:
ESTADO = "Georgia"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("georgia")

In [None]:
import os
import geopandas as gpd

# Every shapefile currently present in /content (the directory the download helper extracts into).
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
# Fail with an explicit message instead of the opaque "max() arg is an empty sequence".
if not arquivos:
    raise FileNotFoundError(
        "Nenhum shapefile (.shp) encontrado em /content — verifique se baixar_shapefile_estado concluiu o download."
    )
# The most recently modified shapefile is taken to be the one just extracted.
# NOTE(review): if the download silently failed, this picks up a shapefile left by an earlier section.
arquivo = max(arquivos, key=os.path.getmtime)
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

# Keep only the tract identifier and its geometry from the loaded shapefile.
boroughs = tl[["GEOID", "geometry"]].copy()
# Serialise each geometry to WKT text so it can be stored in a CSV column.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# diretorio_principal already ends in "/", so the resulting path contains "//".
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
# Drive folder IDs searched for the state's check-in CSV (one folder per variant).
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

# The same file name is looked up in both folders; ESTADO is set at the top of this section.
filename = f"checkins_{ESTADO}.csv"

# Relative destinations: they resolve against the current working directory.
path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


# NOTE(review): the "NAO_CRU" (not raw) constant points at the "crus" (raw) folder and
# vice versa — confirm which folder really holds the labelled file.
CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)   # must provide "placeid" and "category"
df_raw     = pd.read_csv(CHECKIN_CRU)       # must provide "placeid" and "spot_categories"

# Coordinate columns may be named lng/lat or longitude/latitude.
lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Return the "name" values of a stringified list of dicts; [] when the cell cannot be parsed."""
    try:
        lst = ast.literal_eval(cell)
        if isinstance(lst, list):
            return [d.get("name") for d in lst if isinstance(d, dict) and "name" in d]
    except Exception:
        # Best effort: a malformed cell simply contributes no category names.
        pass
    return []

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty list, otherwise None."""
    return lst[0] if (isinstance(lst, list) and len(lst) > 0) else None

df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# Most frequent fine-grained class / category per place.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                         .groupby("placeid")["__fclass_name"]
                         .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One coordinate pair per place, indexed by placeid (same construction as the Nebraska section).
# The previous version kept one row per check-in with the default RangeIndex, so the
# "feature_id" derived from coords_raw.index below was a row number rather than a placeid,
# and both the category lookups and the later join against the embedding placeids used the wrong key.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fall back to the labelled file's coordinates when the raw file yields none.
if coords_raw.empty and lon_col_lab and lat_col_lab:
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                            .mean()
                            .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

coords_raw = coords_raw.dropna()

pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only places that have both labels.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# Geometry is stored as WKT text in the output CSV.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer vocabularies in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# @title
# NOTE(review): `df` is not defined in any cell visible in this notebook, so on a fresh
# kernel this cell presumably fails with NameError — confirm which frame was intended.
# NOTE(review): the output goes to "boroughs_area.csv", the same relative name that the
# PreProcess/Preprocess cells read; running this cell would replace the regions file
# with a single bounding box — confirm that this is intended.
import geopandas as gpd
from shapely.geometry import box

lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# Bounding box of all points, expanded by 0.01 (coordinate units) on every side.
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text so it fits in a CSV column.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
# poi-encoder baseline directory; prepended to sys.path, presumably so the
# `POIEmbedding` and `model` imports in the following cells resolve.
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

# Run the poi-encoder preprocessing on the POI table and the regions file (both relative to the cwd).
PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO: h3 is set to False


In [None]:
from POIEmbedding import POI2Vec
# Train POI2Vec with its default configuration, then persist the walks;
# the next cell calls read_walks(), presumably to load what is saved here.
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Fresh POI2Vec instance that reloads the walks instead of retraining.
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Unique (category, fclass) pairs found in the POI table.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

# NOTE(review): no random seed is set in this cell, so shuffling (and presumably the
# model initialisation) differs between runs.
loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Five passes over the dataset; the loop indices e and i are not used.
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# NOTE(review): the key and file name carry a "_georgia" suffix here, while the Nebraska
# section saves "in_embed.weight" to "poi-encoder-gowalla-h3.tensor" — confirm what the consumer expects.
torch.save({"in_embed_georgia.weight": model.clone_input_embedding()}, "poi-encoder-gowalla-h3_georgia.tensor")


## HGI

In [None]:
import sys
# HGI preprocessing directory; prepended to sys.path, presumably so that
# `from main import Preprocess` in the next cell resolves.
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Graph structure and region metadata produced by the HGI preprocessing.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Pre-computed per-place embeddings: blob["embeddings"] rows line up with blob["placeids"].
loc_pt_path = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_location-georgia.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# poi_index.csv is not written by any cell in this notebook; presumably the preprocessing
# above produces it and its row order is the node order of the graph — verify.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# Node feature matrix: row i holds the embedding of the i-th POI in `order`.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

# Same sanity check as the Nebraska and Texas sections: every edge endpoint must be a
# valid row of X. Previously `ei` was computed here but never checked.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

os.makedirs("./data", exist_ok=True)

g = torch.load("./gowalla.pt", map_location="cpu")

# Each exported key is the NumPy copy of one graph attribute, in this order.
data_dict = {
    chave: getattr(g, atributo).detach().cpu().numpy()
    for chave, atributo in (
        ("node_features", "x"),
        ("edge_index", "edge_index"),
        ("edge_weight", "edge_weight"),
        ("region_id", "region_id"),
        ("region_area", "region_area"),
        ("coarse_region_similarity", "coarse_region_similarity"),
        ("region_adjacency", "region_adjacency"),
    )
}

# Plain-NumPy dictionary written next to the notebook's working directory.
with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Region count implied by each exported array; all four must agree.
R_from_id = int(np.max(data_dict["region_id"])) + 1          # highest region index + 1
R_area    = len(data_dict["region_area"])                    # one area per region
R_adj     = int(data_dict["region_adjacency"].max()) + 1     # highest index referenced by an adjacency entry + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]   # rows of the similarity matrix

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

# The message used to be the truncated word "Desalinhad"; it now also reports the counts.
assert R_from_id == R_area == R_adj == R_sim, (
    f"Desalinhado: region_id={R_from_id}, region_area={R_area}, "
    f"region_adjacency={R_adj}, coarse_region_similarity={R_sim}"
)


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 300 --device cpu --save_name gowalla_h3


# Nebraska


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-nebraska-census-tracts

In [None]:
ESTADO = "Nebraska"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("nebraska")

In [None]:
import os
import geopandas as gpd

# Every shapefile currently present in /content (the directory the download helper extracts into).
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
# Fail with an explicit message instead of the opaque "max() arg is an empty sequence".
if not arquivos:
    raise FileNotFoundError(
        "Nenhum shapefile (.shp) encontrado em /content — verifique se baixar_shapefile_estado concluiu o download."
    )
# The most recently modified shapefile is taken to be the one just extracted.
# NOTE(review): if the download silently failed, this picks up a shapefile left by an earlier section.
arquivo = max(arquivos, key=os.path.getmtime)
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

# Keep only the tract identifier and its geometry from the loaded shapefile.
boroughs = tl[["GEOID", "geometry"]].copy()
# Serialise each geometry to WKT text so it can be stored in a CSV column.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# diretorio_principal already ends in "/", so the resulting path contains "//".
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
# Drive folder IDs searched for the state's check-in CSV (one folder per variant).
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

# The same file name is looked up in both folders; ESTADO is set at the top of this section.
filename = f"checkins_{ESTADO}.csv"

# Relative destinations: they resolve against the current working directory.
path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


# NOTE(review): the "NAO_CRU" (not raw) constant points at the "crus" (raw) folder and
# vice versa — confirm which folder really holds the labelled file.
CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

# df_labeled is read for "placeid"/"category"; df_raw for "placeid"/"spot_categories".
df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

# Coordinate columns may be named lng/lat or longitude/latitude.
lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Return the "name" values of a stringified list of dicts; [] when the cell cannot be parsed."""
    try:
        lst = ast.literal_eval(cell)
        if isinstance(lst, list):
            return [d.get("name") for d in lst if isinstance(d, dict) and "name" in d]
    except Exception:
        # Best effort: a malformed cell simply contributes no category names.
        pass
    return []

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty list, otherwise None."""
    return lst[0] if (isinstance(lst, list) and len(lst) > 0) else None

df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# Most frequent fine-grained class / category per place.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One coordinate pair per place (mean over its rows), indexed by placeid.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fall back to the labelled file's coordinates when the raw file yields none.
if coords_raw.empty and lon_col_lab and lat_col_lab and (lon_col_lab in df_labeled.columns) and (lat_col_lab in df_labeled.columns):
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                           .mean()
                           .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

coords_raw = coords_raw.dropna()

# feature_id is the placeid (the index of coords_raw).
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only places that have both labels.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# Geometry is stored as WKT text in the output CSV.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer vocabularies in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# @title
# NOTE(review): `df` is not defined in any cell visible in this notebook, so on a fresh
# kernel this cell presumably fails with NameError — confirm which frame was intended.
# NOTE(review): the output goes to "boroughs_area.csv", the same relative name that the
# PreProcess/Preprocess cells read; running this cell would replace the regions file
# with a single bounding box — confirm that this is intended.
import geopandas as gpd
from shapely.geometry import box

lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# Bounding box of all points, expanded by 0.01 (coordinate units) on every side.
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text so it fits in a CSV column.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
# poi-encoder baseline directory; prepended to sys.path, presumably so the
# `POIEmbedding` and `model` imports in the following cells resolve.
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

# Run the poi-encoder preprocessing on the POI table and the regions file (both relative to the cwd).
PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO: h3 is set to False


In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build the node feature matrix for the POIs listed in a CSV.

    Args:
        pois_csv_path: CSV with a "geometry" column and the ``placeid_col`` column.
        loc_embed_pt_path: ``torch.save``-d mapping with "embeddings" (tensor) and
            "placeids" (sequence aligned with the embedding rows).
        placeid_col: Column of the CSV holding the place identifiers.

    Returns:
        ``(X, df_pois)``: a float32 array with one embedding row per CSV row
        (in CSV order) and the loaded frame, whose text geometries are parsed from WKT.

    Raises:
        KeyError: If a place identifier of the CSV has no embedding.
    """
    df_pois = pd.read_csv(pois_csv_path)
    geometry_is_text = df_pois["geometry"].dtype == object
    if geometry_is_text:
        df_pois["geometry"] = df_pois["geometry"].apply(wkt.loads)

    checkpoint = torch.load(loc_embed_pt_path, map_location="cpu")
    embedding_matrix = checkpoint["embeddings"].detach().cpu().numpy()

    # Identifiers are compared as strings on both sides.
    row_of_placeid = {}
    for row, raw_id in enumerate(checkpoint["placeids"]):
        row_of_placeid[str(raw_id)] = row

    poi_ids = df_pois[placeid_col].astype(str).tolist()
    n_dims = embedding_matrix.shape[1]
    X = np.zeros((len(poi_ids), n_dims), dtype=np.float32)

    for target_row, poi_id in enumerate(poi_ids):
        if poi_id not in row_of_placeid:
            raise KeyError(f"placeid {poi_id} não encontrado em {loc_embed_pt_path}. "
                           f"")
        X[target_row] = embedding_matrix[row_of_placeid[poi_id]]

    return X, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
# Train POI2Vec with its default configuration, then persist the walks;
# the next cell calls read_walks(), presumably to load what is saved here.
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Fresh POI2Vec instance that reloads the walks instead of retraining.
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Unique (category, fclass) pairs found in the POI table.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

# NOTE(review): no random seed is set in this cell, so shuffling (and presumably the
# model initialisation) differs between runs.
loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Five passes over the dataset; the loop indices e and i are not used.
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# NOTE(review): the Georgia and Texas sections save under state-suffixed key and file
# names; this one uses the unsuffixed names — confirm which the consumer expects.
torch.save({"in_embed.weight": model.clone_input_embedding()}, "poi-encoder-gowalla-h3.tensor")


## HGI

In [None]:
import sys
# HGI preprocessing directory; prepended to sys.path, presumably so that
# `from main import Preprocess` in the next cell resolves.
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Graph structure and region metadata produced by the HGI preprocessing.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Pre-computed per-place embeddings: blob["embeddings"] rows line up with blob["placeids"].
# NOTE(review): the Georgia and Texas sections load state-suffixed files
# ("...-georgia.pt", "...-texas.pt"); confirm this unsuffixed file holds the Nebraska embeddings.
loc_pt_path = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_location.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# poi_index.csv is not written by any cell in this notebook; presumably the preprocessing
# above produces it and its row order is the node order of the graph — verify.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# Node feature matrix: row i holds the embedding of the i-th POI in `order`.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

import numpy as np
# Sanity check: every edge endpoint must be a valid row of X.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
# Region-level arrays are attached as extra attributes of the graph object.
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

# Same output name as the other state sections, so each run overwrites the previous graph.
torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

os.makedirs("./data", exist_ok=True)

g = torch.load("./gowalla.pt", map_location="cpu")

# Each exported key is the NumPy copy of one graph attribute, in this order.
data_dict = {
    chave: getattr(g, atributo).detach().cpu().numpy()
    for chave, atributo in (
        ("node_features", "x"),
        ("edge_index", "edge_index"),
        ("edge_weight", "edge_weight"),
        ("region_id", "region_id"),
        ("region_area", "region_area"),
        ("coarse_region_similarity", "coarse_region_similarity"),
        ("region_adjacency", "region_adjacency"),
    )
}

# Plain-NumPy dictionary written next to the notebook's working directory.
with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Region count implied by each exported array; all four must agree.
R_from_id = int(np.max(data_dict["region_id"])) + 1          # highest region index + 1
R_area    = len(data_dict["region_area"])                    # one area per region
R_adj     = int(data_dict["region_adjacency"].max()) + 1     # highest index referenced by an adjacency entry + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]   # rows of the similarity matrix

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

# The message used to be the truncated word "Desalinhad"; it now also reports the counts.
assert R_from_id == R_area == R_adj == R_sim, (
    f"Desalinhado: region_id={R_from_id}, region_area={R_area}, "
    f"region_adjacency={R_adj}, coarse_region_similarity={R_sim}"
)


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 300 --device cpu --save_name gowalla_h3


# Texas


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-texas-census-tracts

In [None]:
ESTADO = "Texas"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("texas")

In [None]:
import os
import geopandas as gpd

# Every shapefile currently present in /content (the directory the download helper extracts into).
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
# Fail with an explicit message instead of the opaque "max() arg is an empty sequence".
if not arquivos:
    raise FileNotFoundError(
        "Nenhum shapefile (.shp) encontrado em /content — verifique se baixar_shapefile_estado concluiu o download."
    )
# The most recently modified shapefile is taken to be the one just extracted.
# NOTE(review): if the download silently failed, this picks up a shapefile left by an earlier section.
arquivo = max(arquivos, key=os.path.getmtime)
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

# Keep only the tract identifier and its geometry from the loaded shapefile.
boroughs = tl[["GEOID", "geometry"]].copy()
# Serialise each geometry to WKT text so it can be stored in a CSV column.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# diretorio_principal already ends in "/", so the resulting path contains "//".
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
# Drive folder IDs searched for the state's check-in CSV (one folder per variant).
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

# The same file name is looked up in both folders; ESTADO is set at the top of this section.
filename = f"checkins_{ESTADO}.csv"

# Relative destinations: they resolve against the current working directory.
path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


# NOTE(review): the "NAO_CRU" (not raw) constant points at the "crus" (raw) folder and
# vice versa — confirm which folder really holds the labelled file.
CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)   # must provide "placeid" and "category"
df_raw     = pd.read_csv(CHECKIN_CRU)       # must provide "placeid" and "spot_categories"

# Coordinate columns may be named lng/lat or longitude/latitude.
lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Return the "name" values of a stringified list of dicts; [] when the cell cannot be parsed."""
    try:
        lst = ast.literal_eval(cell)
        if isinstance(lst, list):
            return [d.get("name") for d in lst if isinstance(d, dict) and "name" in d]
    except Exception:
        # Best effort: a malformed cell simply contributes no category names.
        pass
    return []

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty list, otherwise None."""
    return lst[0] if (isinstance(lst, list) and len(lst) > 0) else None

df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# Most frequent fine-grained class / category per place.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                         .groupby("placeid")["__fclass_name"]
                         .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One coordinate pair per place, indexed by placeid (same construction as the Nebraska section).
# The previous version kept one row per check-in with the default RangeIndex, so the
# "feature_id" derived from coords_raw.index below was a row number rather than a placeid,
# and both the category lookups and the later join against the embedding placeids used the wrong key.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fall back to the labelled file's coordinates when the raw file yields none.
if coords_raw.empty and lon_col_lab and lat_col_lab:
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                            .mean()
                            .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

coords_raw = coords_raw.dropna()

pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only places that have both labels.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# Geometry is stored as WKT text in the output CSV.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer vocabularies in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# @title
# NOTE(review): `df` is not defined in any cell visible in this notebook, so on a fresh
# kernel this cell presumably fails with NameError — confirm which frame was intended.
# NOTE(review): the output goes to "boroughs_area.csv", the same relative name that the
# PreProcess/Preprocess cells read; running this cell would replace the regions file
# with a single bounding box — confirm that this is intended.
import geopandas as gpd
from shapely.geometry import box

lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# Bounding box of all points, expanded by 0.01 (coordinate units) on every side.
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text so it fits in a CSV column.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
# poi-encoder baseline directory; prepended to sys.path, presumably so the
# `POIEmbedding` and `model` imports in the following cells resolve.
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

# Run the poi-encoder preprocessing on the POI table and the regions file (both relative to the cwd).
PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO: h3 is set to False


In [None]:
test = pd.read_csv("pois_gowalla.csv")
len(test)

In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build the node feature matrix for the POIs listed in a CSV.

    Args:
        pois_csv_path: CSV with a "geometry" column and the ``placeid_col`` column.
        loc_embed_pt_path: ``torch.save``-d mapping with "embeddings" (tensor) and
            "placeids" (sequence aligned with the embedding rows).
        placeid_col: Column of the CSV holding the place identifiers.

    Returns:
        ``(X, df_pois)``: a float32 array with one embedding row per CSV row
        (in CSV order) and the loaded frame, whose text geometries are parsed from WKT.

    Raises:
        KeyError: If a place identifier of the CSV has no embedding.
    """
    df_pois = pd.read_csv(pois_csv_path)
    geometry_is_text = df_pois["geometry"].dtype == object
    if geometry_is_text:
        df_pois["geometry"] = df_pois["geometry"].apply(wkt.loads)

    checkpoint = torch.load(loc_embed_pt_path, map_location="cpu")
    embedding_matrix = checkpoint["embeddings"].detach().cpu().numpy()

    # Identifiers are compared as strings on both sides.
    row_of_placeid = {}
    for row, raw_id in enumerate(checkpoint["placeids"]):
        row_of_placeid[str(raw_id)] = row

    poi_ids = df_pois[placeid_col].astype(str).tolist()
    n_dims = embedding_matrix.shape[1]
    X = np.zeros((len(poi_ids), n_dims), dtype=np.float32)

    for target_row, poi_id in enumerate(poi_ids):
        if poi_id not in row_of_placeid:
            raise KeyError(f"placeid {poi_id} não encontrado em {loc_embed_pt_path}. "
                           f"")
        X[target_row] = embedding_matrix[row_of_placeid[poi_id]]

    return X, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
# Train POI2Vec with its default configuration, then persist the walks;
# the next cell calls read_walks(), presumably to load what is saved here.
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Fresh POI2Vec instance that reloads the walks instead of retraining.
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Unique (category, fclass) pairs found in the POI table.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

# NOTE(review): no random seed is set in this cell, so shuffling (and presumably the
# model initialisation) differs between runs.
loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Five passes over the dataset; the loop indices e and i are not used.
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# NOTE(review): the key and file name carry a "_texas" suffix here, while the Nebraska
# section saves "in_embed.weight" to "poi-encoder-gowalla-h3.tensor" — confirm what the consumer expects.
torch.save({"in_embed_texas.weight": model.clone_input_embedding()}, "poi-encoder-gowalla-h3_texas.tensor")


## HGI

In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

# Build the torch_geometric graph saved as gowalla.pt: structure comes from Preprocess,
# node features from a precomputed location-embedding file.
POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# emb_filename=None: node features are attached below rather than loaded by Preprocess
# (presumably -- confirm in HGI/preprocess/main.py).
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Embedding matrix plus the place ids that name its rows.
loc_pt_path = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_location-texas.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# poi_index.csv fixes the node order (presumably written by Preprocess -- confirm).
# Ids are compared as strings on both sides.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# Row i of X receives the embedding of the i-th POI; a POI without an embedding aborts the cell.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

# NOTE(review): numpy is already imported on the first line of this cell; this re-import is redundant.
import numpy as np
# Every node index referenced by an edge must have a feature row in X.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
# Region-level arrays are attached as extra attributes on the Data object.
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

# Convert the saved graph into a dict of NumPy arrays and pickle it to
# ./data/gowalla_hgi_data.pkl (presumably the file HGI/train.py reads for --city gowalla).
os.makedirs("./data", exist_ok=True)

# gowalla.pt holds a pickled torch_geometric Data object written by the previous cell, not a
# plain tensor/state_dict. Newer torch releases default to weights_only=True, which may reject
# such objects, so the mode is stated explicitly. The file is produced by this notebook and
# is therefore trusted.
g = torch.load("./gowalla.pt", map_location="cpu", weights_only=False)

data_dict = {
    "node_features": g.x.detach().cpu().numpy(),
    "edge_index": g.edge_index.detach().cpu().numpy(),
    "edge_weight": g.edge_weight.detach().cpu().numpy(),
    "region_id": g.region_id.detach().cpu().numpy(),
    "region_area": g.region_area.detach().cpu().numpy(),
    "coarse_region_similarity": g.coarse_region_similarity.detach().cpu().numpy(),
    "region_adjacency": g.region_adjacency.detach().cpu().numpy(),
}

with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
data_dict

In [None]:
import numpy as np

# Sanity check before training: the region count implied by each region-level array must agree.
# Largest value in region_id, plus one.
R_from_id = int(np.max(data_dict["region_id"])) + 1
R_area    = len(data_dict["region_area"])
# Largest index appearing in region_adjacency, plus one.
# NOTE(review): this would undercount if the highest-numbered region had no neighbours -- confirm.
R_adj     = int(data_dict["region_adjacency"].max()) + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

assert R_from_id == R_area == R_adj == R_sim, "Desalinhad"


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 300 --device cpu --save_name gowalla_h3


# California


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-california-census-tracts

In [None]:
ESTADO = "California"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("california")

In [None]:
import os
import geopandas as gpd

# Load the census-tract shapefile extracted into /content and reproject it to WGS84.
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
# baixar_shapefile_estado() returns None without raising when the download fails, so an empty
# list here means nothing was extracted; fail with a clear message instead of max()'s ValueError.
if not arquivos:
    raise FileNotFoundError(
        "Nenhum shapefile (.shp) encontrado em /content; verifique se baixar_shapefile_estado() concluiu o download."
    )
# The most recently modified .shp is used.
# NOTE(review): if an older state's shapefile is still in /content and the new download failed,
# that stale file is what gets loaded -- check the plotted outline.
arquivo = max(arquivos, key=os.path.getmtime)
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
# NOTE(review): `wkt` is imported but not used in this cell.
from shapely import wkt

# Keep only the tract id and polygon, and serialise each polygon as WKT text for the CSV.
boroughs = tl[["GEOID", "geometry"]].copy()
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# Region file read later as "boroughs_area.csv" by the preprocessing cells.
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

filename = f"checkins_{ESTADO}.csv"

path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Return the "name" values from a cell holding a Python-literal list of dicts.

    The cell is parsed with ast.literal_eval. Anything that fails to parse, or
    that does not evaluate to a list, yields an empty list. Within a list, only
    dict entries that contain a "name" key contribute a value.
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: malformed or non-string cells simply carry no names.
        return []
    if not isinstance(parsed, list):
        return []
    names = []
    for entry in parsed:
        if isinstance(entry, dict) and "name" in entry:
            names.append(entry.get("name"))
    return names

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty list; None for anything else."""
    if not isinstance(lst, list) or not lst:
        return None
    return lst[0]

# First parsed category name of each check-in (None when the list is empty).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# Most frequent fine-grained class per place, taken from the raw check-ins.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Most frequent category per place, taken from the labelled check-ins.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Coordinates under normalised column names; rows missing any of the three fields are dropped.
coords_raw = (
    df_raw[[lat_col_raw, lon_col_raw, "placeid"]]
    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"})
    .dropna(subset=["latitude", "longitude", "placeid"])
    .copy()
)

# Collapse the check-ins to ONE row per place, indexed by placeid. Without this step the frame
# keeps the check-in row index, so feature_id below would be a row number instead of a place id
# and the two reindex() lookups would pick up the classes of unrelated places.
coords_raw = (
    coords_raw
    .groupby("placeid", as_index=True)[["latitude", "longitude"]]
    .mean()
)

pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# Attach both class names by place id; places missing either one are discarded.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

# Point geometry per place (longitude = x, latitude = y), then serialised as WKT for the CSV.
gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(
        coords_raw.loc[pois["feature_id"], "longitude"].values,
        coords_raw.loc[pois["feature_id"], "latitude"].values
    ),
    crs="EPSG:4326"
)
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer codes in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# Defensive filter: both vocabularies are built from these same columns, so -1 is not expected.
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# @title
# Alternative region file: a single polygon covering the bounding box of the points in `df`.
# NOTE(review): `df` is not defined by any visible cell of this section, so on a fresh kernel
# this cell raises NameError; presumably it is a leftover from an earlier variant -- confirm.
# NOTE(review): it writes "boroughs_area.csv" in the working directory, which appears to be
# the same file the census-tract export produced, replacing the tracts with one polygon
# that has no GEOID column.
import geopandas as gpd
from shapely.geometry import box

lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# The 0.01 padding is expressed in the same units as the lon/lat coordinates.
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO H3 FALSO


In [None]:
test = pd.read_csv("pois_gowalla.csv")
len(test)

In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Align precomputed location embeddings with the rows of a POI CSV.

    Args:
        pois_csv_path: CSV with a "geometry" column and the id column `placeid_col`.
        loc_embed_pt_path: torch file holding a dict with "embeddings" (2-D tensor)
            and "placeids" (one id per embedding row).
        placeid_col: name of the id column in the CSV.

    Returns:
        (X, df_pois): X is a float32 array with one embedding row per CSV row, in
        CSV order; df_pois is the loaded frame, with "geometry" parsed from WKT
        when the column has object dtype.

    Raises:
        KeyError: if a CSV id has no entry in "placeids".
    """
    df_pois = pd.read_csv(pois_csv_path)
    geometry = df_pois["geometry"]
    if geometry.dtype == object:
        df_pois["geometry"] = geometry.apply(wkt.loads)

    blob = torch.load(loc_embed_pt_path, map_location="cpu")
    embeddings = blob["embeddings"].detach().cpu().numpy()

    # Ids are compared as strings on both sides; for a repeated id the last row wins.
    row_of = {}
    for row, raw_id in enumerate(blob["placeids"]):
        row_of[str(raw_id)] = row

    ids = df_pois[placeid_col].astype(str).tolist()
    features = np.zeros((len(ids), embeddings.shape[1]), dtype=np.float32)

    for out_row, pid in enumerate(ids):
        if pid not in row_of:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        features[out_row] = embeddings[row_of[pid]]

    return features, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
# Train EmbeddingModel on the walks held by POI2Vec and save its input embedding.
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Reload the walks from disk (presumably the ones written by save_walks() in the
# previous cell -- confirm in POIEmbedding.POI2Vec).
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Distinct (category, fclass) pairs found in the POI table; handed to the model below.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
# k=5: presumably the number of negative samples per positive -- confirm in model.POISet.
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

# Five passes of Adam over shuffled batches of 2048 samples.
# NOTE(review): no random seed is set in this cell, so results vary between runs.
# The loop counters `e` and `i` are not used inside the loop.
loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# The value returned by clone_input_embedding() is stored in a one-key dict.
torch.save({"in_embed_california.weight": model.clone_input_embedding()}, "poi-encoder-gowalla-h3_california.tensor")


## HGI

In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

# Build the torch_geometric graph saved as gowalla.pt: structure comes from Preprocess,
# node features from a precomputed location-embedding file.
POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# emb_filename=None: node features are attached below rather than loaded by Preprocess
# (presumably -- confirm in HGI/preprocess/main.py).
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Embedding matrix plus the place ids that name its rows.
loc_pt_path = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_location-california.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# poi_index.csv fixes the node order (presumably written by Preprocess -- confirm).
# Ids are compared as strings on both sides.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# Row i of X receives the embedding of the i-th POI; a POI without an embedding aborts the cell.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

# NOTE(review): numpy is already imported on the first line of this cell; this re-import is redundant.
import numpy as np
# Every node index referenced by an edge must have a feature row in X.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
# Region-level arrays are attached as extra attributes on the Data object.
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

# Convert the saved graph into a dict of NumPy arrays and pickle it to
# ./data/gowalla_hgi_data.pkl (presumably the file HGI/train.py reads for --city gowalla).
os.makedirs("./data", exist_ok=True)

# gowalla.pt holds a pickled torch_geometric Data object written by the previous cell, not a
# plain tensor/state_dict. Newer torch releases default to weights_only=True, which may reject
# such objects, so the mode is stated explicitly. The file is produced by this notebook and
# is therefore trusted.
g = torch.load("./gowalla.pt", map_location="cpu", weights_only=False)

data_dict = {
    "node_features": g.x.detach().cpu().numpy(),
    "edge_index": g.edge_index.detach().cpu().numpy(),
    "edge_weight": g.edge_weight.detach().cpu().numpy(),
    "region_id": g.region_id.detach().cpu().numpy(),
    "region_area": g.region_area.detach().cpu().numpy(),
    "coarse_region_similarity": g.coarse_region_similarity.detach().cpu().numpy(),
    "region_adjacency": g.region_adjacency.detach().cpu().numpy(),
}

with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
data_dict

In [None]:
import numpy as np

# Sanity check before training: the region count implied by each region-level array must agree.
# Largest value in region_id, plus one.
R_from_id = int(np.max(data_dict["region_id"])) + 1
R_area    = len(data_dict["region_area"])
# Largest index appearing in region_adjacency, plus one.
# NOTE(review): this would undercount if the highest-numbered region had no neighbours -- confirm.
R_adj     = int(data_dict["region_adjacency"].max()) + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

assert R_from_id == R_area == R_adj == R_sim, "Desalinhad"


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 300 --device cpu --save_name gowalla_h3


In [None]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

# Florida


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-florida-census-tracts

In [None]:
ESTADO = "Florida"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("florida")

In [None]:
import os
import geopandas as gpd

# Load the census-tract shapefile extracted into /content and reproject it to WGS84.
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
# baixar_shapefile_estado() returns None without raising when the download fails, so an empty
# list here means nothing was extracted; fail with a clear message instead of max()'s ValueError.
if not arquivos:
    raise FileNotFoundError(
        "Nenhum shapefile (.shp) encontrado em /content; verifique se baixar_shapefile_estado() concluiu o download."
    )
# The most recently modified .shp is used.
# NOTE(review): if an older state's shapefile is still in /content and the new download failed,
# that stale file is what gets loaded -- check the plotted outline.
arquivo = max(arquivos, key=os.path.getmtime)
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
# NOTE(review): `wkt` is imported but not used in this cell.
from shapely import wkt

# Keep only the tract id and polygon, and serialise each polygon as WKT text for the CSV.
boroughs = tl[["GEOID", "geometry"]].copy()
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# Region file read later as "boroughs_area.csv" by the preprocessing cells.
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
# @title
# CPU-only install. The PyG extension wheels must come from the index that matches the torch
# build installed on the line above (2.4.0+cpu); the cu121 wheels are built against CUDA torch.
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

filename = f"checkins_{ESTADO}.csv"

path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Return the "name" values from a cell holding a Python-literal list of dicts.

    The cell is parsed with ast.literal_eval. Anything that fails to parse, or
    that does not evaluate to a list, yields an empty list. Within a list, only
    dict entries that contain a "name" key contribute a value.
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: malformed or non-string cells simply carry no names.
        return []
    if not isinstance(parsed, list):
        return []
    names = []
    for entry in parsed:
        if isinstance(entry, dict) and "name" in entry:
            names.append(entry.get("name"))
    return names

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty list; None for anything else."""
    if not isinstance(lst, list) or not lst:
        return None
    return lst[0]

# First parsed category name of each check-in (None when the list is empty).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# Most frequent fine-grained class per place, taken from the raw check-ins.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Most frequent category per place, taken from the labelled check-ins.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))
# Coordinates under normalised column names; rows missing any of the three fields are dropped.
coords_raw = (
    df_raw[[lat_col_raw, lon_col_raw, "placeid"]]
    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"})
    .dropna(subset=["latitude", "longitude", "placeid"])
    .copy()
)

# One row per place, indexed by placeid, holding the mean coordinates of its check-ins.
coords_raw = (
    coords_raw
    .groupby("placeid", as_index=True)[["latitude", "longitude"]]
    .mean()
)

# feature_id is the place id.
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# Attach both class names by place id; places missing either one are discarded.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

# Point geometry per place (longitude = x, latitude = y), then serialised as WKT for the CSV.
gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(
        coords_raw.loc[pois["feature_id"], "longitude"].values,
        coords_raw.loc[pois["feature_id"], "latitude"].values
    ),
    crs="EPSG:4326"
)
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer codes in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# Defensive filter: both vocabularies are built from these same columns, so -1 is not expected.
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)


In [None]:
pois_out.shape

In [None]:
# @title
# Alternative region file: a single polygon covering the bounding box of the points in `df`.
# NOTE(review): `df` is not defined by any visible cell of this section, so on a fresh kernel
# this cell raises NameError; presumably it is a leftover from an earlier variant -- confirm.
# NOTE(review): it writes "boroughs_area.csv" in the working directory, which appears to be
# the same file the census-tract export produced, replacing the tracts with one polygon
# that has no GEOID column.
import geopandas as gpd
from shapely.geometry import box

lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# The 0.01 padding is expressed in the same units as the lon/lat coordinates.
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO H3 FALSO


In [None]:
test = pd.read_csv("pois_gowalla.csv")
len(test)

In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Align precomputed location embeddings with the rows of a POI CSV.

    Args:
        pois_csv_path: CSV with a "geometry" column and the id column `placeid_col`.
        loc_embed_pt_path: torch file holding a dict with "embeddings" (2-D tensor)
            and "placeids" (one id per embedding row).
        placeid_col: name of the id column in the CSV.

    Returns:
        (X, df_pois): X is a float32 array with one embedding row per CSV row, in
        CSV order; df_pois is the loaded frame, with "geometry" parsed from WKT
        when the column has object dtype.

    Raises:
        KeyError: if a CSV id has no entry in "placeids".
    """
    df_pois = pd.read_csv(pois_csv_path)
    geometry = df_pois["geometry"]
    if geometry.dtype == object:
        df_pois["geometry"] = geometry.apply(wkt.loads)

    blob = torch.load(loc_embed_pt_path, map_location="cpu")
    embeddings = blob["embeddings"].detach().cpu().numpy()

    # Ids are compared as strings on both sides; for a repeated id the last row wins.
    row_of = {}
    for row, raw_id in enumerate(blob["placeids"]):
        row_of[str(raw_id)] = row

    ids = df_pois[placeid_col].astype(str).tolist()
    features = np.zeros((len(ids), embeddings.shape[1]), dtype=np.float32)

    for out_row, pid in enumerate(ids):
        if pid not in row_of:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        features[out_row] = embeddings[row_of[pid]]

    return features, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
# Train EmbeddingModel on the walks held by POI2Vec and save its input embedding.
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Reload the walks from disk (presumably the ones written by save_walks() in the
# previous cell -- confirm in POIEmbedding.POI2Vec).
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Distinct (category, fclass) pairs found in the POI table; handed to the model below.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
# k=5: presumably the number of negative samples per positive -- confirm in model.POISet.
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
# NOTE(review): embed_size is 256 here, while the same cell in the California and Texas
# sections uses 64 -- confirm the difference is intentional.
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=256,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

# Five passes of Adam over shuffled batches of 2048 samples.
# NOTE(review): no random seed is set in this cell, so results vary between runs.
# The loop counters `e` and `i` are not used inside the loop.
loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# The value returned by clone_input_embedding() is stored in a one-key dict; both the key and
# the file name are derived from the lower-cased ESTADO.
torch.save({f"in_embed_{ESTADO.lower()}.weight": model.clone_input_embedding()}, f"poi-encoder-gowalla-h3_{ESTADO.lower()}.tensor")


## HGI

In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

# Build the torch_geometric graph saved as gowalla.pt: structure comes from Preprocess,
# node features from a precomputed location-embedding file.
POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# emb_filename=None: node features are attached below rather than loaded by Preprocess
# (presumably -- confirm in HGI/preprocess/main.py).
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Embedding matrix plus the place ids that name its rows; the file name uses the lower-cased ESTADO.
loc_pt_path = f"/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_location-{ESTADO.lower()}.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# poi_index.csv fixes the node order (presumably written by Preprocess -- confirm).
# Ids are compared as strings on both sides.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# Row i of X receives the embedding of the i-th POI; a POI without an embedding aborts the cell.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

# NOTE(review): numpy is already imported on the first line of this cell; this re-import is redundant.
import numpy as np
# Every node index referenced by an edge must have a feature row in X.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
# Region-level arrays are attached as extra attributes on the Data object.
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

# Convert the saved graph into a dict of NumPy arrays and pickle it to
# ./data/gowalla_hgi_data.pkl (presumably the file HGI/train.py reads for --city gowalla).
os.makedirs("./data", exist_ok=True)

# gowalla.pt holds a pickled torch_geometric Data object written by the previous cell, not a
# plain tensor/state_dict. Newer torch releases default to weights_only=True, which may reject
# such objects, so the mode is stated explicitly. The file is produced by this notebook and
# is therefore trusted.
g = torch.load("./gowalla.pt", map_location="cpu", weights_only=False)

data_dict = {
    "node_features": g.x.detach().cpu().numpy(),
    "edge_index": g.edge_index.detach().cpu().numpy(),
    "edge_weight": g.edge_weight.detach().cpu().numpy(),
    "region_id": g.region_id.detach().cpu().numpy(),
    "region_area": g.region_area.detach().cpu().numpy(),
    "coarse_region_similarity": g.coarse_region_similarity.detach().cpu().numpy(),
    "region_adjacency": g.region_adjacency.detach().cpu().numpy(),
}

with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Sanity check before training: the region count implied by each region-level array must agree.
# Largest value in region_id, plus one.
R_from_id = int(np.max(data_dict["region_id"])) + 1
R_area    = len(data_dict["region_area"])
# Largest index appearing in region_adjacency, plus one.
# NOTE(review): this would undercount if the highest-numbered region had no neighbours -- confirm.
R_adj     = int(data_dict["region_adjacency"].max()) + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

assert R_from_id == R_area == R_adj == R_sim, "Desalinhad"


In [None]:
# Switch the runtime to the CUDA 12.1 builds before training on GPU.
%pip uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib

# torch is pinned to 2.4.0 because the PyG extension wheels below are built for exactly
# torch-2.4.0+cu121; an unpinned install would pull whatever release the index serves
# and leave the compiled extensions mismatched.
%pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 500 --device cuda --save_name gowalla_h3


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 128 --alpha 0.5 --attention_head 4 --epoch 400 --device cuda --save_name gowalla_h3


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 32 --alpha 0.5 --attention_head 4 --epoch 300 --device cuda --save_name gowalla_h3


# North Carolina


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-north-carolina-census-tracts

In [None]:
ESTADO = "North Carolina"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("north-carolina")

In [None]:
import os
import geopandas as gpd

# Load the census-tract shapefile extracted into /content and reproject it to WGS84.
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
# baixar_shapefile_estado() returns None without raising when the download fails, so an empty
# list here means nothing was extracted; fail with a clear message instead of max()'s ValueError.
if not arquivos:
    raise FileNotFoundError(
        "Nenhum shapefile (.shp) encontrado em /content; verifique se baixar_shapefile_estado() concluiu o download."
    )
# The most recently modified .shp is used.
# NOTE(review): if an older state's shapefile is still in /content and the new download failed,
# that stale file is what gets loaded -- check the plotted outline.
arquivo = max(arquivos, key=os.path.getmtime)
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
# NOTE(review): `wkt` is imported but not used in this cell.
from shapely import wkt

# Keep only the tract id and polygon, and serialise each polygon as WKT text for the CSV.
boroughs = tl[["GEOID", "geometry"]].copy()
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# Region file read later as "boroughs_area.csv" by the preprocessing cells.
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
# @title
# CPU-only install. The PyG extension wheels must come from the index that matches the torch
# build installed on the line above (2.4.0+cpu); the cu121 wheels are built against CUDA torch.
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

filename = f"checkins_{ESTADO}.csv"

path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Return the "name" values from a cell holding a Python-literal list of dicts.

    The cell is parsed with ast.literal_eval. Anything that fails to parse, or
    that does not evaluate to a list, yields an empty list. Within a list, only
    dict entries that contain a "name" key contribute a value.
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: malformed or non-string cells simply carry no names.
        return []
    if not isinstance(parsed, list):
        return []
    names = []
    for entry in parsed:
        if isinstance(entry, dict) and "name" in entry:
            names.append(entry.get("name"))
    return names

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty list; None for anything else."""
    if not isinstance(lst, list) or not lst:
        return None
    return lst[0]

# First parsed category name of each check-in (None when the list is empty).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# Most frequent fine-grained class per place, taken from the raw check-ins.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Most frequent category per place, taken from the labelled check-ins.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))
# Coordinates under normalised column names; rows missing any of the three fields are dropped.
coords_raw = (
    df_raw[[lat_col_raw, lon_col_raw, "placeid"]]
    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"})
    .dropna(subset=["latitude", "longitude", "placeid"])
    .copy()
)

# One row per place, indexed by placeid, holding the mean coordinates of its check-ins.
coords_raw = (
    coords_raw
    .groupby("placeid", as_index=True)[["latitude", "longitude"]]
    .mean()
)

# feature_id is the place id.
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# Attach both class names by place id; places missing either one are discarded.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

# Point geometry per place (longitude = x, latitude = y), then serialised as WKT for the CSV.
gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(
        coords_raw.loc[pois["feature_id"], "longitude"].values,
        coords_raw.loc[pois["feature_id"], "latitude"].values
    ),
    crs="EPSG:4326"
)
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer codes in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# Defensive filter: both vocabularies are built from these same columns, so -1 is not expected.
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)


In [None]:
pois_out.shape

In [None]:
# @title
import geopandas as gpd
from shapely.geometry import box

# Latitude/longitude extent of every row in `df`.
# NOTE(review): `df` is not defined in this cell or in the cells shown around it; it has to
# come from earlier notebook state — confirm it exists after Restart & Run All.
lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# box() takes (minx, miny, maxx, maxy); buffer(0.01) pads the rectangle by 0.01 degrees.
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

# Single-row GeoDataFrame: the padded rectangle is the only region.
boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text and write the region file.
# NOTE(review): this replaces any existing "boroughs_area.csv" in the working directory with a
# one-row file that has no GEOID column — confirm that is intended.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
# Directory of the poi-encoder baseline; the `POIEmbedding` / `model` imports in the following
# cells presumably resolve from here.
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

# Prepended (index 0) so it takes precedence over entries already on sys.path.
sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

# Run the project's PreProcess step on the POI CSV and the region CSV (implemented in
# POIEmbedding, not shown here).
PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ## TODO: H3 is disabled (h3=False)


In [None]:
# Reload the exported POI CSV and show its row count.
test = pd.read_csv("pois_gowalla.csv")
len(test)

In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build a node-feature matrix for the POIs of a CSV from saved location embeddings.

    Parameters
    ----------
    pois_csv_path : str
        CSV with at least ``placeid_col`` and a ``geometry`` column. Textual
        geometries are parsed as WKT.
    loc_embed_pt_path : str
        File written by ``torch.save`` holding a dict with ``"embeddings"``
        (2-D tensor) and ``"placeids"`` (ids used as row labels of that tensor).
    placeid_col : str
        Column of the CSV that carries the POI id.

    Returns
    -------
    tuple
        ``(X, df_pois)`` where ``X`` is a float32 array with one row per CSV
        row, in CSV order, and ``df_pois`` is the loaded table.

    Raises
    ------
    KeyError
        If a POI id from the CSV has no entry in ``"placeids"``.
    """
    df_pois = pd.read_csv(pois_csv_path)

    geometry = df_pois["geometry"]
    # Text columns come back as `object` on older pandas but as a dedicated string dtype on
    # newer versions (or with future.infer_string); accept both so WKT is always parsed.
    if pd.api.types.is_object_dtype(geometry) or pd.api.types.is_string_dtype(geometry):
        df_pois["geometry"] = geometry.apply(wkt.loads)

    blob = torch.load(loc_embed_pt_path, map_location="cpu")
    E = blob["embeddings"].detach().cpu().numpy()
    # Ids are compared as strings on both sides so int/str mismatches do not matter.
    placeid2idx = {str(pid): i for i, pid in enumerate(blob["placeids"])}

    ids = df_pois[placeid_col].astype(str).tolist()
    rows = []
    for pid in ids:
        idx = placeid2idx.get(pid)
        if idx is None:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        rows.append(idx)

    # Single gather instead of a per-row Python copy loop; result is a fresh float32 array.
    X = E[np.asarray(rows, dtype=np.intp)].astype(np.float32, copy=False)

    return X, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
# Train POI2Vec and persist its walks. Both steps are implemented in the project's
# POIEmbedding module (not shown here); presumably they consume the PreProcess outputs — verify.
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Rebuild POI2Vec from the walks on disk (presumably those written by save_walks() earlier).
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Distinct (category, fclass) id pairs present in the POI table.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=256,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Five passes over the data. No seed is set in this cell, so shuffling differs between runs.
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        # The model returns a tuple; only its first element (the loss) is used.
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# Save the input-embedding table under a state-specific key and file name.
torch.save({f"in_embed_{ESTADO.lower()}.weight": model.clone_input_embedding()}, f"poi-encoder-gowalla-h3_{ESTADO.lower()}.tensor")


## HGI

In [None]:
import sys
# Directory of the HGI preprocessing code; the `from main import Preprocess` in the next cell
# presumably resolves from here.
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

# Prepended (index 0) so it takes precedence over entries already on sys.path.
sys.path.insert(0, module_dir)

In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Build the graph inputs. The returned dict is read below through the keys edge_index,
# edge_weight, region_id, region_area, coarse_region_similarity and region_adjacency.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Saved location embeddings for this state: a dict with "embeddings" (2-D tensor) and
# "placeids" (ids whose positions are used as row indices into the embeddings).
loc_pt_path = f"/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_location-{ESTADO.lower()}.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# poi_index.csv supplies the POI order for the rows of X (presumably written by Preprocess —
# verify). Ids are compared as strings on both sides.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# One feature row per entry of poi_index.csv; a POI without an embedding aborts the cell.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

import numpy as np
# Every edge endpoint must address an existing row of X.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

# Assemble the PyG graph; the region-level arrays are attached as extra attributes.
g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

os.makedirs("./data", exist_ok=True)

# NOTE(review): gowalla.pt holds a pickled PyG Data object; torch >= 2.6 defaults to
# weights_only=True and may refuse to load it. Fine with torch 2.4.0 — confirm the runtime version.
g = torch.load("./gowalla.pt", map_location="cpu")

# Flatten the graph into plain NumPy arrays. This rebinds `data_dict`, which previously held
# the Preprocess output.
data_dict = {
    "node_features": g.x.detach().cpu().numpy(),
    "edge_index": g.edge_index.detach().cpu().numpy(),
    "edge_weight": g.edge_weight.detach().cpu().numpy(),
    "region_id": g.region_id.detach().cpu().numpy(),
    "region_area": g.region_area.detach().cpu().numpy(),
    "coarse_region_similarity": g.coarse_region_similarity.detach().cpu().numpy(),
    "region_adjacency": g.region_adjacency.detach().cpu().numpy(),
}

# Pickle consumed by the HGI training script (presumably looked up via --city gowalla; verify).
with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Region count derived four independent ways: largest region id + 1, length of the area
# vector, largest index referenced by the adjacency array + 1, and rows of the similarity matrix.
R_from_id = int(np.max(data_dict["region_id"])) + 1
R_area    = len(data_dict["region_area"])
R_adj     = int(data_dict["region_adjacency"].max()) + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

# All four must agree, otherwise the arrays describe different region sets.
assert R_from_id == R_area == R_adj == R_sim, "Desalinhad"


In [None]:
%pip uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
# Run the HGI training script with --dim 64 for 500 epochs on CUDA.
# NOTE(review): the three training cells share --save_name gowalla_h3; if train.py derives its
# output path from it, later runs (and other states) overwrite this result — confirm.
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 500 --device cuda --save_name gowalla_h3


In [None]:
# Run the HGI training script with --dim 128 for 400 epochs on CUDA.
# NOTE(review): same --save_name as the other training cells; outputs may overwrite each other — confirm.
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 128 --alpha 0.5 --attention_head 4 --epoch 400 --device cuda --save_name gowalla_h3


In [None]:
# Run the HGI training script with --dim 32 for 300 epochs on CUDA.
# NOTE(review): same --save_name as the other training cells; outputs may overwrite each other — confirm.
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 32 --alpha 0.5 --attention_head 4 --epoch 300 --device cuda --save_name gowalla_h3


# Florida


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-florida-census-tracts

In [None]:
# State processed by this section; interpolated into the checkin and embedding file names below.
ESTADO = "Florida"

In [None]:
# Project root on the mounted Google Drive. It ends with "/", so the f-strings that append
# "/region-embedding/..." produce a doubled slash.
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
# Download the state's 2021 census-tract shapefile and unzip it into /content/.
# The helper returns None on success and on every failure path, so a failed download is silent here.
baixar_shapefile_estado("florida")

In [None]:
import os
import geopandas as gpd

# Candidate shapefiles sitting directly under /content.
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
if not arquivos:
    # The download helper fails silently (it returns None), so report the problem here instead
    # of letting max() raise an opaque "arg is an empty sequence" ValueError.
    raise FileNotFoundError("Nenhum shapefile (.shp) encontrado em /content; verifique o download.")
# Most recently modified file, i.e. the one extracted last.
# NOTE(review): if the download failed but an older state's .shp is present, that file is used.
arquivo = max(arquivos, key=os.path.getmtime)
# Reproject to WGS84 lon/lat so the tracts share a CRS with the POI coordinates.
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
# Preview the two columns kept for the region file.
tl[["GEOID","geometry"]] ## TODO: pass this file to HGI (as CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

# Region table built from the shapefile: GEOID plus geometry.
boroughs = tl[["GEOID", "geometry"]].copy()
# Geometries are stored as WKT text so they can be written to CSV.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# Written under the project root (an absolute path, independent of the working directory).
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
# Display the region table that was just written.
boroughs

In [None]:
# Delete "~orch*" entries from dist-packages (presumably leftovers of an interrupted torch
# uninstall) and clear pip's cache.
# NOTE(review): the path hard-codes python3.12 — confirm it matches the runtime's Python.
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
# Upgrade the packaging tools, remove any installed torch / PyG stack, then install the
# geospatial and scientific dependencies (unpinned).
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
# @title
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
# CPU-only torch 2.4.0 plus the PyG extension wheels built for torch-2.4.0+cpu.
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
# Work from the project root: the relative paths used below (estados/..., pois_gowalla.csv,
# boroughs_area.csv, poi_index.csv) are resolved against it.
%cd {diretorio_principal}

## POI Embedding

In [None]:
# Google Drive folder ids that are searched for this state's checkin CSV.
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

# The same file name is looked up in both folders.
filename = f"checkins_{ESTADO}.csv"

# Local destinations, relative to the current working directory.
path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

# download_from_folder_by_name raises FileNotFoundError when the folder has no such file.
print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
# Show the local path of the file downloaded from the "separated" folder.
path_sep

In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


# NOTE(review): the constant names look swapped relative to the folders — "NAO_CRU" (not raw)
# points at estados/crus/ and "CRU" (raw) at estados/separated/. The code below needs a
# `category` column in df_labeled and a `spot_categories` column in df_raw; confirm which
# folder provides which.
CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

# Coordinate columns are named either lng/lat or longitude/latitude. For df_raw the long names
# are assumed when the short ones are absent; for df_labeled the result may be None.
lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Extract the "name" values from a cell holding the text of a list of dicts.

    The cell is parsed with ``ast.literal_eval``. Anything that fails to parse,
    or that does not parse to a list, yields an empty list. Within a list, only
    dict items that have a "name" key contribute.
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: unparseable cells simply contribute no names.
        return []

    if not isinstance(parsed, list):
        return []

    names = []
    for entry in parsed:
        if isinstance(entry, dict) and "name" in entry:
            names.append(entry.get("name"))
    return names

# Missing cells are treated as the text of an empty list, so every row ends up with a list of names.
df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty list, otherwise None."""
    if not isinstance(lst, list):
        return None
    if len(lst) == 0:
        return None
    return lst[0]

# Fine-grained class of a checkin = first name in its category list (None when the list is empty).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# Per place: the first value returned by mode() over its fine-grained class names.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Per place: the first value returned by mode() over the labelled `category` column.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One coordinate per place: the mean of its checkin coordinates.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fall back to the labelled file only when the raw file produced no coordinates at all.
if coords_raw.empty and lon_col_lab and lat_col_lab and (lon_col_lab in df_labeled.columns) and (lat_col_lab in df_labeled.columns):
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                           .mean()
                           .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

coords_raw = coords_raw.dropna()

# POI table keyed by placeid (exported as `feature_id`).
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# reindex aligns both label series to the POI order; places missing a label get NaN.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only places that have both labels.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# The geometry column is replaced by its WKT text for CSV export.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer ids for the label names, numbered in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# Drop rows whose label was not in a vocabulary (mapped to -1 above).
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# Preview the first rows of the exported POI table.
pois_out.head()

In [None]:
# @title
import geopandas as gpd
from shapely.geometry import box

# Latitude/longitude extent of every row in `df`.
# NOTE(review): `df` is not defined in this cell or in the cells shown around it; it has to
# come from earlier notebook state — confirm it exists after Restart & Run All.
lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# box() takes (minx, miny, maxx, maxy); buffer(0.01) pads the rectangle by 0.01 degrees.
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

# Single-row GeoDataFrame: the padded rectangle is the only region.
boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text and write the region file.
# NOTE(review): once the working directory is the project root, this relative path presumably
# names the same file as the GEOID-level boroughs_area.csv written earlier in this section and
# replaces it with a single rectangle without a GEOID column — confirm this cell should run.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
# Directory of the poi-encoder baseline; the `POIEmbedding` / `model` imports in the following
# cells presumably resolve from here.
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

# Prepended (index 0) so it takes precedence over entries already on sys.path.
sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

# Run the project's PreProcess step on the POI CSV and the region CSV (implemented in
# POIEmbedding, not shown here).
PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ## TODO: H3 is disabled (h3=False)


In [None]:
# Reload the exported POI CSV and show its row count.
test = pd.read_csv("pois_gowalla.csv")
len(test)

In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build a node-feature matrix for the POIs of a CSV from saved location embeddings.

    Parameters
    ----------
    pois_csv_path : str
        CSV with at least ``placeid_col`` and a ``geometry`` column. Textual
        geometries are parsed as WKT.
    loc_embed_pt_path : str
        File written by ``torch.save`` holding a dict with ``"embeddings"``
        (2-D tensor) and ``"placeids"`` (ids used as row labels of that tensor).
    placeid_col : str
        Column of the CSV that carries the POI id.

    Returns
    -------
    tuple
        ``(X, df_pois)`` where ``X`` is a float32 array with one row per CSV
        row, in CSV order, and ``df_pois`` is the loaded table.

    Raises
    ------
    KeyError
        If a POI id from the CSV has no entry in ``"placeids"``.
    """
    df_pois = pd.read_csv(pois_csv_path)

    geometry = df_pois["geometry"]
    # Text columns come back as `object` on older pandas but as a dedicated string dtype on
    # newer versions (or with future.infer_string); accept both so WKT is always parsed.
    if pd.api.types.is_object_dtype(geometry) or pd.api.types.is_string_dtype(geometry):
        df_pois["geometry"] = geometry.apply(wkt.loads)

    blob = torch.load(loc_embed_pt_path, map_location="cpu")
    E = blob["embeddings"].detach().cpu().numpy()
    # Ids are compared as strings on both sides so int/str mismatches do not matter.
    placeid2idx = {str(pid): i for i, pid in enumerate(blob["placeids"])}

    ids = df_pois[placeid_col].astype(str).tolist()
    rows = []
    for pid in ids:
        idx = placeid2idx.get(pid)
        if idx is None:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        rows.append(idx)

    # Single gather instead of a per-row Python copy loop; result is a fresh float32 array.
    X = E[np.asarray(rows, dtype=np.intp)].astype(np.float32, copy=False)

    return X, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
# Train POI2Vec and persist its walks. Both steps are implemented in the project's
# POIEmbedding module (not shown here); presumably they consume the PreProcess outputs — verify.
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Rebuild POI2Vec from the walks on disk (presumably those written by save_walks() earlier).
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Distinct (category, fclass) id pairs present in the POI table.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Mean loss of each epoch, in order.
loss_history = []

# Five passes over the data. No seed is set in this cell, so shuffling differs between runs.
for e in range(5):
    epoch_loss = 0
    batches = 0

    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        # The model returns a tuple; only its first element (the loss) is used.
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

        epoch_loss += loss.item()
        batches += 1

    # An empty loader leaves batches at 0 and this division raises ZeroDivisionError.
    avg_loss = epoch_loss / batches
    loss_history.append(avg_loss)
    print(f"Época {e+1:02d} | Loss média: {avg_loss:.4f}")


# Save the input-embedding table under a state-specific key and file name.
torch.save({f"in_embed_{ESTADO.lower()}.weight": model.clone_input_embedding()}, f"poi-encoder-gowalla-h3_{ESTADO.lower()}.tensor")


In [None]:
# Display the exported POI CSV as it currently exists on disk.
pd.read_csv("pois_gowalla.csv")

## HGI

In [None]:
import sys
# Directory of the HGI preprocessing code; the `from main import Preprocess` in the next cell
# presumably resolves from here.
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

# Prepended (index 0) so it takes precedence over entries already on sys.path.
sys.path.insert(0, module_dir)

In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Build the graph inputs. The returned dict is read below through the keys edge_index,
# edge_weight, region_id, region_area, coarse_region_similarity and region_adjacency.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Saved location embeddings for this state: a dict with "embeddings" (2-D tensor) and
# "placeids" (ids whose positions are used as row indices into the embeddings).
loc_pt_path = f"/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_location-{ESTADO.lower()}.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()   # (n_embeddings, D)
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# edge_index -> total number of nodes in the graph
# NOTE(review): the count comes from the largest edge endpoint only; a POI whose row_idx is
# larger would raise IndexError at X[node_idx] below — confirm poi_index.csv and edge_index agree.
ei = np.asarray(data_dict['edge_index'])
num_nodes_graph = ei.max() + 1
print("num_nodes_graph:", num_nodes_graph)

# poi_index.csv holds the node -> placeid mapping
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

print("len(poi_index):", len(order))

# X now has ONE embedding per graph node
X = np.zeros((num_nodes_graph, D), dtype=np.float32)

# Fill only the nodes that are POIs, using row_idx.
# POIs without an embedding keep an all-zero row and are only counted, not reported by id.
missing = 0
for row in order.itertuples():
    node_idx = int(row.row_idx)          # node index in the graph
    pid = row.feature_id                # placeid as string
    emb_idx = placeid2idx.get(str(pid))
    if emb_idx is None:
        missing += 1
        continue
    X[node_idx] = E[emb_idx]

print("Embeddings dos POIs:", E.shape)
print("Matriz X (node embeddings):", X.shape)
print("POIs sem embedding encontrado:", missing)

# sanity check (always true here, because X was sized from ei.max() + 1)
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

# Assemble the PyG graph; the region-level arrays are attached as extra attributes.
g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")
print("✅ gowalla.pt salvo com x.shape =", g.x.shape)


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

os.makedirs("./data", exist_ok=True)

# NOTE(review): gowalla.pt holds a pickled PyG Data object; torch >= 2.6 defaults to
# weights_only=True and may refuse to load it. Fine with torch 2.4.0 — confirm the runtime version.
g = torch.load("./gowalla.pt", map_location="cpu")

# Flatten the graph into plain NumPy arrays. This rebinds `data_dict`, which previously held
# the Preprocess output.
data_dict = {
    "node_features": g.x.detach().cpu().numpy(),
    "edge_index": g.edge_index.detach().cpu().numpy(),
    "edge_weight": g.edge_weight.detach().cpu().numpy(),
    "region_id": g.region_id.detach().cpu().numpy(),
    "region_area": g.region_area.detach().cpu().numpy(),
    "coarse_region_similarity": g.coarse_region_similarity.detach().cpu().numpy(),
    "region_adjacency": g.region_adjacency.detach().cpu().numpy(),
}

# Pickle consumed by the HGI training script (presumably looked up via --city gowalla; verify).
with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Region count derived four independent ways: largest region id + 1, length of the area
# vector, largest index referenced by the adjacency array + 1, and rows of the similarity matrix.
R_from_id = int(np.max(data_dict["region_id"])) + 1
R_area    = len(data_dict["region_area"])
R_adj     = int(data_dict["region_adjacency"].max()) + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

# All four must agree, otherwise the arrays describe different region sets.
assert R_from_id == R_area == R_adj == R_sim, "Desalinhad"


In [None]:
%pip uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
# Run the HGI training script with --dim 64 for 500 epochs on CUDA.
# NOTE(review): the three training cells share --save_name gowalla_h3; if train.py derives its
# output path from it, later runs (and other states) overwrite this result — confirm.
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 500 --device cuda --save_name gowalla_h3


In [None]:
# Run the HGI training script with --dim 128 for 400 epochs on CUDA.
# NOTE(review): same --save_name as the other training cells; outputs may overwrite each other — confirm.
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 128 --alpha 0.5 --attention_head 4 --epoch 400 --device cuda --save_name gowalla_h3


In [None]:
# Run the HGI training script with --dim 32 for 300 epochs on CUDA.
# NOTE(review): same --save_name as the other training cells; outputs may overwrite each other — confirm.
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 32 --alpha 0.5 --attention_head 4 --epoch 300 --device cuda --save_name gowalla_h3


# California


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-california-census-tracts

In [None]:
# State processed by this section; interpolated into the checkin and embedding file names below.
ESTADO = "California"

In [None]:
# Project root on the mounted Google Drive. It ends with "/", so the f-strings that append
# "/region-embedding/..." produce a doubled slash.
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
# Download the state's 2021 census-tract shapefile and unzip it into /content/.
# The helper returns None on success and on every failure path, so a failed download is silent here.
baixar_shapefile_estado("california")

In [None]:
import os
import geopandas as gpd

# Candidate shapefiles sitting directly under /content.
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
if not arquivos:
    # The download helper fails silently (it returns None), so report the problem here instead
    # of letting max() raise an opaque "arg is an empty sequence" ValueError.
    raise FileNotFoundError("Nenhum shapefile (.shp) encontrado em /content; verifique o download.")
# Most recently modified file, i.e. the one extracted last.
# NOTE(review): if the download failed but an older state's .shp is present, that file is used.
arquivo = max(arquivos, key=os.path.getmtime)
# Reproject to WGS84 lon/lat so the tracts share a CRS with the POI coordinates.
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
# Preview the two columns kept for the region file.
tl[["GEOID","geometry"]] ## TODO: pass this file to HGI (as CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

# Region table built from the shapefile: GEOID plus geometry.
boroughs = tl[["GEOID", "geometry"]].copy()
# Geometries are stored as WKT text so they can be written to CSV.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# Written under the project root (an absolute path, independent of the working directory).
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
# Display the region table that was just written.
boroughs

In [None]:
# Delete "~orch*" entries from dist-packages (presumably leftovers of an interrupted torch
# uninstall) and clear pip's cache.
# NOTE(review): the path hard-codes python3.12 — confirm it matches the runtime's Python.
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
# Upgrade the packaging tools, remove any installed torch / PyG stack, then install the
# geospatial and scientific dependencies (unpinned).
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
# @title
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
# CPU-only torch 2.4.0 plus the PyG extension wheels built for torch-2.4.0+cpu.
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
# Work from the project root: the relative paths used below (estados/..., pois_gowalla.csv,
# boroughs_area.csv, poi_index.csv) are resolved against it.
%cd {diretorio_principal}

## POI Embedding

In [None]:
# Google Drive folder ids that are searched for this state's checkin CSV.
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

# The same file name is looked up in both folders.
filename = f"checkins_{ESTADO}.csv"

# Local destinations, relative to the current working directory.
path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

# download_from_folder_by_name raises FileNotFoundError when the folder has no such file.
print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
# Show the local path of the file downloaded from the "separated" folder.
path_sep

In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


# NOTE(review): the constant names look swapped relative to the folders — "NAO_CRU" (not raw)
# points at estados/crus/ and "CRU" (raw) at estados/separated/. The code below needs a
# `category` column in df_labeled and a `spot_categories` column in df_raw; confirm which
# folder provides which.
CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

# Coordinate columns are named either lng/lat or longitude/latitude. For df_raw the long names
# are assumed when the short ones are absent; for df_labeled the result may be None.
lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Extract the "name" values from a cell holding the text of a list of dicts.

    The cell is parsed with ``ast.literal_eval``. Anything that fails to parse,
    or that does not parse to a list, yields an empty list. Within a list, only
    dict items that have a "name" key contribute.
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: unparseable cells simply contribute no names.
        return []

    if not isinstance(parsed, list):
        return []

    names = []
    for entry in parsed:
        if isinstance(entry, dict) and "name" in entry:
            names.append(entry.get("name"))
    return names

# Missing cells are treated as the text of an empty list, so every row ends up with a list of names.
df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty list, otherwise None."""
    if not isinstance(lst, list):
        return None
    if len(lst) == 0:
        return None
    return lst[0]

# Fine-grained class of a checkin = first name in its category list (None when the list is empty).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# Per place: the first value returned by mode() over its fine-grained class names.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Per place: the first value returned by mode() over the labelled `category` column.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One coordinate per place: the mean of its checkin coordinates.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fall back to the labelled file only when the raw file produced no coordinates at all.
if coords_raw.empty and lon_col_lab and lat_col_lab and (lon_col_lab in df_labeled.columns) and (lat_col_lab in df_labeled.columns):
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                           .mean()
                           .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

coords_raw = coords_raw.dropna()

# POI table keyed by placeid (exported as `feature_id`).
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# reindex aligns both label series to the POI order; places missing a label get NaN.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only places that have both labels.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# The geometry column is replaced by its WKT text for CSV export.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer ids for the label names, numbered in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# Drop rows whose label was not in a vocabulary (mapped to -1 above).
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# Preview the first rows of the exported POI table.
pois_out.head()

In [None]:
# @title
import geopandas as gpd
from shapely.geometry import box

# Latitude/longitude extent of every row in `df`.
# NOTE(review): `df` is not defined in this cell or in the cells shown around it; it has to
# come from earlier notebook state — confirm it exists after Restart & Run All.
lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# box() takes (minx, miny, maxx, maxy); buffer(0.01) pads the rectangle by 0.01 degrees.
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

# Single-row GeoDataFrame: the padded rectangle is the only region.
boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text and write the region file.
# NOTE(review): once the working directory is the project root, this relative path presumably
# names the same file as the GEOID-level boroughs_area.csv written earlier in this section and
# replaces it with a single rectangle without a GEOID column — confirm this cell should run.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
# Directory of the poi-encoder baseline; the `POIEmbedding` / `model` imports in the following
# cells presumably resolve from here.
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

# Prepended (index 0) so it takes precedence over entries already on sys.path.
sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

# Run the project's PreProcess step on the POI CSV and the region CSV (implemented in
# POIEmbedding, not shown here).
PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ## TODO: H3 is disabled (h3=False)


In [None]:
# Reload the exported POI CSV and show its row count.
test = pd.read_csv("pois_gowalla.csv")
len(test)

In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build a node-feature matrix for the POIs of a CSV from saved location embeddings.

    Parameters
    ----------
    pois_csv_path : str
        CSV with at least ``placeid_col`` and a ``geometry`` column. Textual
        geometries are parsed as WKT.
    loc_embed_pt_path : str
        File written by ``torch.save`` holding a dict with ``"embeddings"``
        (2-D tensor) and ``"placeids"`` (ids used as row labels of that tensor).
    placeid_col : str
        Column of the CSV that carries the POI id.

    Returns
    -------
    tuple
        ``(X, df_pois)`` where ``X`` is a float32 array with one row per CSV
        row, in CSV order, and ``df_pois`` is the loaded table.

    Raises
    ------
    KeyError
        If a POI id from the CSV has no entry in ``"placeids"``.
    """
    df_pois = pd.read_csv(pois_csv_path)

    geometry = df_pois["geometry"]
    # Text columns come back as `object` on older pandas but as a dedicated string dtype on
    # newer versions (or with future.infer_string); accept both so WKT is always parsed.
    if pd.api.types.is_object_dtype(geometry) or pd.api.types.is_string_dtype(geometry):
        df_pois["geometry"] = geometry.apply(wkt.loads)

    blob = torch.load(loc_embed_pt_path, map_location="cpu")
    E = blob["embeddings"].detach().cpu().numpy()
    # Ids are compared as strings on both sides so int/str mismatches do not matter.
    placeid2idx = {str(pid): i for i, pid in enumerate(blob["placeids"])}

    ids = df_pois[placeid_col].astype(str).tolist()
    rows = []
    for pid in ids:
        idx = placeid2idx.get(pid)
        if idx is None:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        rows.append(idx)

    # Single gather instead of a per-row Python copy loop; result is a fresh float32 array.
    X = E[np.asarray(rows, dtype=np.intp)].astype(np.float32, copy=False)

    return X, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
# Train POI2Vec and persist its walks. Both steps are implemented in the project's
# POIEmbedding module (not shown here); presumably they consume the PreProcess outputs — verify.
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Rebuild POI2Vec from the walks on disk (presumably those written by save_walks() earlier).
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Distinct (category, fclass) id pairs present in the POI table.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Mean loss of each epoch, in order.
loss_history = []

# Five passes over the data. No seed is set in this cell, so shuffling differs between runs.
for e in range(5):
    epoch_loss = 0
    batches = 0

    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        # The model returns a tuple; only its first element (the loss) is used.
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

        epoch_loss += loss.item()
        batches += 1

    # An empty loader leaves batches at 0 and this division raises ZeroDivisionError.
    avg_loss = epoch_loss / batches
    loss_history.append(avg_loss)
    print(f"Época {e+1:02d} | Loss média: {avg_loss:.4f}")


# Save the input-embedding table under a state-specific key and file name.
torch.save({f"in_embed_{ESTADO.lower()}.weight": model.clone_input_embedding()}, f"poi-encoder-gowalla-h3_{ESTADO.lower()}.tensor")


In [None]:
# Display the exported POI CSV as it currently exists on disk.
pd.read_csv("pois_gowalla.csv")

## HGI

In [None]:
import sys
# Directory of the HGI preprocessing code; the `from main import Preprocess` in the next cell
# presumably resolves from here.
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

# Prepended (index 0) so it takes precedence over entries already on sys.path.
sys.path.insert(0, module_dir)

In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Build the graph inputs. The returned dict is read below through the keys edge_index,
# edge_weight, region_id, region_area, coarse_region_similarity and region_adjacency.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Saved location embeddings for this state: a dict with "embeddings" (2-D tensor) and
# "placeids" (ids whose positions are used as row indices into the embeddings).
loc_pt_path = f"/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_location-{ESTADO.lower()}.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()   # (n_embeddings, D)
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# edge_index -> total number of nodes in the graph
# NOTE(review): the count comes from the largest edge endpoint only; a POI whose row_idx is
# larger would raise IndexError at X[node_idx] below — confirm poi_index.csv and edge_index agree.
ei = np.asarray(data_dict['edge_index'])
num_nodes_graph = ei.max() + 1
print("num_nodes_graph:", num_nodes_graph)

# poi_index.csv holds the node -> placeid mapping
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

print("len(poi_index):", len(order))

# X now has ONE embedding per graph node
X = np.zeros((num_nodes_graph, D), dtype=np.float32)

# Fill only the nodes that are POIs, using row_idx.
# POIs without an embedding keep an all-zero row and are only counted, not reported by id.
missing = 0
for row in order.itertuples():
    node_idx = int(row.row_idx)          # node index in the graph
    pid = row.feature_id                # placeid as string
    emb_idx = placeid2idx.get(str(pid))
    if emb_idx is None:
        missing += 1
        continue
    X[node_idx] = E[emb_idx]

print("Embeddings dos POIs:", E.shape)
print("Matriz X (node embeddings):", X.shape)
print("POIs sem embedding encontrado:", missing)

# sanity check (always true here, because X was sized from ei.max() + 1)
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

# Assemble the PyG graph; the region-level arrays are attached as extra attributes.
g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")
print("✅ gowalla.pt salvo com x.shape =", g.x.shape)


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

# Make sure the output folder exists before anything is written into it.
os.makedirs("./data", exist_ok=True)

# Reload the graph object that was stored with torch.save as gowalla.pt.
g = torch.load("./gowalla.pt", map_location="cpu")

# Convert each tensor attribute of the graph to a NumPy array.
# Every pair below is (key used in the pickle, attribute name on `g`).
data_dict = {
    pkl_key: getattr(g, attr_name).detach().cpu().numpy()
    for pkl_key, attr_name in (
        ("node_features", "x"),
        ("edge_index", "edge_index"),
        ("edge_weight", "edge_weight"),
        ("region_id", "region_id"),
        ("region_area", "region_area"),
        ("coarse_region_similarity", "coarse_region_similarity"),
        ("region_adjacency", "region_adjacency"),
    )
}

# Persist the plain-NumPy dictionary for the training script.
with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Number of regions implied by each structure stored in data_dict.
R_from_id = int(np.max(data_dict["region_id"])) + 1          # largest region id + 1
R_area    = len(data_dict["region_area"])                    # one area entry per region
R_adj     = int(data_dict["region_adjacency"].max()) + 1     # largest id in the adjacency array + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]   # rows of the similarity matrix

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

# All four counts must agree. The message carries the actual values so a
# failure can be diagnosed without re-running the prints above.
assert R_from_id == R_area == R_adj == R_sim, (
    f"Desalinhado: id={R_from_id}, area={R_area}, adj={R_adj}, sim={R_sim}"
)


In [None]:
%pip uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 500 --device cuda --save_name gowalla_h3


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 128 --alpha 0.5 --attention_head 4 --epoch 400 --device cuda --save_name gowalla_h3


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 32 --alpha 0.5 --attention_head 4 --epoch 300 --device cuda --save_name gowalla_h3


# Texas


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-texas-census-tracts

In [None]:
ESTADO = "Texas"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("texas")

In [None]:
import os
import geopandas as gpd

# Collect every shapefile currently present in /content (where the download helper extracts).
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
if not arquivos:
    # baixar_shapefile_estado returns None without raising when the download fails, so
    # stop here with a clear message instead of max()'s "empty sequence" ValueError.
    raise FileNotFoundError(
        "Nenhum arquivo .shp encontrado em /content — verifique o download do shapefile."
    )
# Use the most recently modified .shp, i.e. the one extracted last.
arquivo = max(arquivos, key=os.path.getmtime)
# Reproject to WGS84 (EPSG:4326) before plotting.
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

# Keep only the tract identifier and its geometry; .copy() leaves `tl` untouched.
boroughs = tl[["GEOID", "geometry"]].copy()
# Replace each geometry object by its WKT text so it fits in a plain CSV column.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# NOTE(review): diretorio_principal already ends with "/", so this path contains "//".
# That is harmless on POSIX file systems but worth normalising.
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
# @title
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
# Drive folders that hold the two variants of the per-state check-in CSV.
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

# Both folders store the file under the same name; only the local destination differs.
filename = "checkins_{}.csv".format(ESTADO)
path_crus = "estados/crus/" + filename
path_sep = "estados/separated/" + filename

# Fetch each variant in turn, announcing it before the download starts.
for variant, folder_id, local_path in (
    ("crus", FOLDER_ID_CRUS, path_crus),
    ("separated", FOLDER_ID_SEPARATED, path_sep),
):
    print(f"Baixando ({variant})...")
    download_from_folder_by_name(drive_service, folder_id, filename, local_path)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
path_sep

In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


# Input check-in files for the current state and the output POI table.
# NOTE(review): the names look swapped — CHECKIN_NAO_CRU ("not raw") points at
# estados/crus while CHECKIN_CRU ("raw") points at estados/separated. The code below only
# relies on which columns each file has, so confirm the intended mapping before renaming.
CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

# df_labeled supplies the "category" column and df_raw the "spot_categories" column used later.
df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

# Coordinate columns may be named lng/lat or longitude/latitude.
# For df_raw the long names are assumed when the short ones are absent;
# for df_labeled the result is None when neither spelling exists.
lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Extract the ``"name"`` values from a stringified list of dicts.

    ``cell`` is evaluated with ``ast.literal_eval``. If evaluation fails, or the
    result is not a list, an empty list is returned. Otherwise the result holds
    the ``"name"`` value of every dict element that has that key; elements that
    are not dicts or lack the key are ignored.
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: anything that cannot be parsed counts as "no names".
        return []

    if not isinstance(parsed, list):
        return []

    names = []
    for entry in parsed:
        if isinstance(entry, dict) and "name" in entry:
            names.append(entry.get("name"))
    return names

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty ``list``; otherwise return ``None``.

    Anything that is not a ``list`` instance (tuples and strings included) yields ``None``.
    """
    if not isinstance(lst, list):
        return None
    if len(lst) == 0:
        return None
    return lst[0]

# First parsed category name of each check-in row (None when the row has none).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# One fclass name per place: the first value returned by mode() over that place's check-ins.
# The `else s.iloc[0]` branch only applies if mode() comes back empty.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Same aggregation for the "category" column of the labelled file.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One coordinate pair per place: the mean latitude / longitude of its check-ins.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fallback: only when df_raw yielded no coordinates at all, use the labelled file instead.
if coords_raw.empty and lon_col_lab and lat_col_lab and (lon_col_lab in df_labeled.columns) and (lat_col_lab in df_labeled.columns):
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                           .mean()
                           .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

# Places with a missing latitude or longitude cannot be turned into points.
coords_raw = coords_raw.dropna()

# Start the POI table from the place ids that have coordinates.
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# reindex() aligns the per-place names with the row order of `pois`;
# places without a name become NaN.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only POIs that have both an fclass and a category name.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

# Build point geometries (x = longitude, y = latitude) in EPSG:4326 ...
gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# ... and immediately replace them by their WKT text for the CSV export.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer vocabularies: each distinct name gets an id in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

# Encode names as ids; -1 marks a name missing from the vocabulary.
gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# NOTE(review): both vocabularies are built from these same columns after NaN rows were
# dropped, so the -1 case looks unreachable and this filter appears purely defensive.
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

# Final table: id, encoded category / fclass, WKT geometry.
pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
pois_out.head()

In [None]:
# @title
import geopandas as gpd
from shapely.geometry import box

# Builds a single rectangular region from the bounding box of `df` and writes it as the region file.
# NOTE(review): `df` is not defined in any cell of this section, so on a fresh kernel this cell
# raises NameError (or silently uses a stale `df` left over from earlier work).
# NOTE(review): the working directory is diretorio_principal at this point, so the to_csv below
# overwrites the census-tract boroughs_area.csv written earlier with one polygon and no GEOID
# column. This looks like a leftover alternative; confirm whether it should run at all.
lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# Pad the box by 0.01 in coordinate units (degrees, given the EPSG:4326 CRS assigned below).
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text for the CSV export.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO H3 FALSO


In [None]:
test = pd.read_csv("pois_gowalla.csv")
len(test)

In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build a feature matrix with one embedding row per POI of a CSV file.

    Args:
        pois_csv_path: CSV with a ``geometry`` column and the id column ``placeid_col``.
        loc_embed_pt_path: file readable by ``torch.load`` holding a dict with the keys
            ``"embeddings"`` (2-D tensor) and ``"placeids"`` (one id per embedding row).
        placeid_col: name of the id column in the CSV.

    Returns:
        ``(X, df_pois)``: ``X`` is a float32 array whose i-th row is the embedding of the
        i-th CSV row, and ``df_pois`` is the loaded table (its ``geometry`` column is
        parsed from WKT when it was read as an object column).

    Raises:
        KeyError: if an id of the CSV has no entry in ``"placeids"``.
    """
    df_pois = pd.read_csv(pois_csv_path)
    geometry_is_text = df_pois["geometry"].dtype == object
    if geometry_is_text:
        df_pois["geometry"] = df_pois["geometry"].apply(wkt.loads)

    saved = torch.load(loc_embed_pt_path, map_location="cpu")
    E = saved["embeddings"].detach().cpu().numpy()
    # Ids are compared as strings on both sides; for a repeated id the last row wins.
    row_of_placeid = {str(p): row for row, p in enumerate(saved["placeids"])}

    ids = df_pois[placeid_col].astype(str).tolist()
    X = np.zeros((len(ids), E.shape[1]), dtype=np.float32)

    for out_row, pid in enumerate(ids):
        if pid not in row_of_placeid:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        X[out_row] = E[row_of_placeid[pid]]

    return X, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Load the stored walks and derive the global second-class walks from them.
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Distinct (category, fclass) pairs observed among the POIs.
second_class_hierarchy_pairs = list(
    {tuple(pair) for pair in poi2vec.pois[["category","fclass"]].to_numpy()}
)

dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5,
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8,
)

loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
loss_history = []

# Five passes over the data; the mean batch loss of each pass is printed and recorded.
for e in range(5):
    epoch_loss, batches = 0, 0

    for inp, pos, neg in loader:
        opt.zero_grad()
        loss, _ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

        batches += 1
        epoch_loss += loss.item()

    avg_loss = epoch_loss / batches
    loss_history.append(avg_loss)
    print(f"Época {e+1:02d} | Loss média: {avg_loss:.4f}")


# Store the learned input embedding; both the dict key and the file name
# carry the lower-cased state name.
estado_tag = ESTADO.lower()
torch.save(
    {f"in_embed_{estado_tag}.weight": model.clone_input_embedding()},
    f"poi-encoder-gowalla-h3_{estado_tag}.tensor",
)


In [None]:
pd.read_csv("pois_gowalla.csv")

## HGI

In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

# Inputs for the HGI preprocessing step: the POI table and the region (census tract) table.
POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Graph structure and region data; the keys used below are edge_index, edge_weight,
# region_id, region_area, coarse_region_similarity and region_adjacency.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Precomputed POI embeddings for the current state: {"embeddings": tensor, "placeids": ids}.
loc_pt_path = f"/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_location-{ESTADO.lower()}.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()   # shape (number of embedded POIs, D)
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# The number of graph nodes is inferred from the largest node id in edge_index.
ei = np.asarray(data_dict['edge_index'])
num_nodes_graph = ei.max() + 1
print("num_nodes_graph:", num_nodes_graph)

# poi_index.csv maps graph node (row_idx) -> placeid (feature_id).
# NOTE(review): presumably written by Preprocess above — confirm.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

print("len(poi_index):", len(order))

# X holds one embedding row per graph node, initialised to zeros.
X = np.zeros((num_nodes_graph, D), dtype=np.float32)

# Fill the rows of the nodes listed in poi_index.csv, addressed by row_idx.
# Nodes whose placeid has no embedding keep their all-zero row and are only counted.
missing = 0
for row in order.itertuples():
    node_idx = int(row.row_idx)          # node index in the graph
    pid = row.feature_id                # placeid as a string
    emb_idx = placeid2idx.get(str(pid))
    if emb_idx is None:
        missing += 1
        continue
    X[node_idx] = E[emb_idx]

print("Embeddings dos POIs:", E.shape)
print("Matriz X (node embeddings):", X.shape)
print("POIs sem embedding encontrado:", missing)

# Sanity check.
# NOTE(review): X was allocated with ei.max() + 1 rows, so this assertion can never fail;
# a row_idx beyond that size would instead raise IndexError inside the loop above.
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

# Assemble the PyG graph: node features plus weighted edges ...
g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
# ... and attach the region-level tensors as extra attributes.
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")
print("✅ gowalla.pt salvo com x.shape =", g.x.shape)


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

# Make sure the output folder exists before anything is written into it.
os.makedirs("./data", exist_ok=True)

# Reload the graph object that was stored with torch.save as gowalla.pt.
g = torch.load("./gowalla.pt", map_location="cpu")

# Convert each tensor attribute of the graph to a NumPy array.
# Every pair below is (key used in the pickle, attribute name on `g`).
data_dict = {
    pkl_key: getattr(g, attr_name).detach().cpu().numpy()
    for pkl_key, attr_name in (
        ("node_features", "x"),
        ("edge_index", "edge_index"),
        ("edge_weight", "edge_weight"),
        ("region_id", "region_id"),
        ("region_area", "region_area"),
        ("coarse_region_similarity", "coarse_region_similarity"),
        ("region_adjacency", "region_adjacency"),
    )
}

# Persist the plain-NumPy dictionary for the training script.
with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Number of regions implied by each structure stored in data_dict.
R_from_id = int(np.max(data_dict["region_id"])) + 1          # largest region id + 1
R_area    = len(data_dict["region_area"])                    # one area entry per region
R_adj     = int(data_dict["region_adjacency"].max()) + 1     # largest id in the adjacency array + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]   # rows of the similarity matrix

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

# All four counts must agree. The message carries the actual values so a
# failure can be diagnosed without re-running the prints above.
assert R_from_id == R_area == R_adj == R_sim, (
    f"Desalinhado: id={R_from_id}, area={R_area}, adj={R_adj}, sim={R_sim}"
)


In [None]:
%pip uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 500 --device cuda --save_name gowalla_h3


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 128 --alpha 0.5 --attention_head 4 --epoch 400 --device cuda --save_name gowalla_h3


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 32 --alpha 0.5 --attention_head 4 --epoch 300 --device cuda --save_name gowalla_h3


# Montana


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-montana-census-tracts

In [None]:
ESTADO = "Montana"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("montana")

In [None]:
import os
import geopandas as gpd

# Collect every shapefile currently present in /content (where the download helper extracts).
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
if not arquivos:
    # baixar_shapefile_estado returns None without raising when the download fails, so
    # stop here with a clear message instead of max()'s "empty sequence" ValueError.
    raise FileNotFoundError(
        "Nenhum arquivo .shp encontrado em /content — verifique o download do shapefile."
    )
# Use the most recently modified .shp, i.e. the one extracted last.
arquivo = max(arquivos, key=os.path.getmtime)
# Reproject to WGS84 (EPSG:4326) before plotting.
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

# Keep only the tract identifier and its geometry; .copy() leaves `tl` untouched.
boroughs = tl[["GEOID", "geometry"]].copy()
# Replace each geometry object by its WKT text so it fits in a plain CSV column.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# NOTE(review): diretorio_principal already ends with "/", so this path contains "//".
# That is harmless on POSIX file systems but worth normalising.
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
# Drive folders that hold the two variants of the per-state check-in CSV.
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

# Both folders store the file under the same name; only the local destination differs.
filename = "checkins_{}.csv".format(ESTADO)
path_crus = "estados/crus/" + filename
path_sep = "estados/separated/" + filename

# Fetch each variant in turn, announcing it before the download starts.
for variant, folder_id, local_path in (
    ("crus", FOLDER_ID_CRUS, path_crus),
    ("separated", FOLDER_ID_SEPARATED, path_sep),
):
    print(f"Baixando ({variant})...")
    download_from_folder_by_name(drive_service, folder_id, filename, local_path)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Extract the ``"name"`` values from a stringified list of dicts.

    ``cell`` is evaluated with ``ast.literal_eval``. If evaluation fails, or the
    result is not a list, an empty list is returned. Otherwise the result holds
    the ``"name"`` value of every dict element that has that key; elements that
    are not dicts or lack the key are ignored.
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: anything that cannot be parsed counts as "no names".
        return []

    if not isinstance(parsed, list):
        return []

    names = []
    for entry in parsed:
        if isinstance(entry, dict) and "name" in entry:
            names.append(entry.get("name"))
    return names

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty ``list``; otherwise return ``None``.

    Anything that is not a ``list`` instance (tuples and strings included) yields ``None``.
    """
    if not isinstance(lst, list):
        return None
    if len(lst) == 0:
        return None
    return lst[0]

# First parsed category name of each check-in row (None when the row has none).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# One fclass name per place: the first value returned by mode() over that place's check-ins.
# The `else s.iloc[0]` branch only applies if mode() comes back empty.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Same aggregation for the "category" column of the labelled file.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One coordinate pair per place: the mean latitude / longitude of its check-ins.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fallback: only when df_raw yielded no coordinates at all, use the labelled file instead.
if coords_raw.empty and lon_col_lab and lat_col_lab and (lon_col_lab in df_labeled.columns) and (lat_col_lab in df_labeled.columns):
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                           .mean()
                           .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

# Places with a missing latitude or longitude cannot be turned into points.
coords_raw = coords_raw.dropna()

# Start the POI table from the place ids that have coordinates.
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# reindex() aligns the per-place names with the row order of `pois`;
# places without a name become NaN.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only POIs that have both an fclass and a category name.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

# Build point geometries (x = longitude, y = latitude) in EPSG:4326 ...
gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# ... and immediately replace them by their WKT text for the CSV export.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer vocabularies: each distinct name gets an id in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

# Encode names as ids; -1 marks a name missing from the vocabulary.
gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# NOTE(review): both vocabularies are built from these same columns after NaN rows were
# dropped, so the -1 case looks unreachable and this filter appears purely defensive.
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

# Final table: id, encoded category / fclass, WKT geometry.
pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# @title
import geopandas as gpd
from shapely.geometry import box

# Builds a single rectangular region from the bounding box of `df` and writes it as the region file.
# NOTE(review): `df` is not defined in any cell of this section, so on a fresh kernel this cell
# raises NameError (or silently uses a stale `df` left over from earlier work).
# NOTE(review): the working directory is diretorio_principal at this point, so the to_csv below
# overwrites the census-tract boroughs_area.csv written earlier with one polygon and no GEOID
# column. This looks like a leftover alternative; confirm whether it should run at all.
lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# Pad the box by 0.01 in coordinate units (degrees, given the EPSG:4326 CRS assigned below).
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text for the CSV export.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO H3 FALSO


In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build a feature matrix with one embedding row per POI of a CSV file.

    Args:
        pois_csv_path: CSV with a ``geometry`` column and the id column ``placeid_col``.
        loc_embed_pt_path: file readable by ``torch.load`` holding a dict with the keys
            ``"embeddings"`` (2-D tensor) and ``"placeids"`` (one id per embedding row).
        placeid_col: name of the id column in the CSV.

    Returns:
        ``(X, df_pois)``: ``X`` is a float32 array whose i-th row is the embedding of the
        i-th CSV row, and ``df_pois`` is the loaded table (its ``geometry`` column is
        parsed from WKT when it was read as an object column).

    Raises:
        KeyError: if an id of the CSV has no entry in ``"placeids"``.
    """
    df_pois = pd.read_csv(pois_csv_path)
    geometry_is_text = df_pois["geometry"].dtype == object
    if geometry_is_text:
        df_pois["geometry"] = df_pois["geometry"].apply(wkt.loads)

    saved = torch.load(loc_embed_pt_path, map_location="cpu")
    E = saved["embeddings"].detach().cpu().numpy()
    # Ids are compared as strings on both sides; for a repeated id the last row wins.
    row_of_placeid = {str(p): row for row, p in enumerate(saved["placeids"])}

    ids = df_pois[placeid_col].astype(str).tolist()
    X = np.zeros((len(ids), E.shape[1]), dtype=np.float32)

    for out_row, pid in enumerate(ids):
        if pid not in row_of_placeid:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        X[out_row] = E[row_of_placeid[pid]]

    return X, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Load the stored walks and derive the global second-class walks from them.
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Distinct (category, fclass) pairs observed among the POIs.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Five passes over the data; the loss is neither printed nor recorded in this cell.
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# Store the learned input embedding.
# NOTE(review): ESTADO is used here with its original capitalisation, whereas the Texas
# section's copy of this cell uses ESTADO.lower() for both the key and the file name.
# Confirm which spelling downstream readers of the .tensor file expect.
torch.save({f"in_embed_{ESTADO}.weight": model.clone_input_embedding()}, f"poi-encoder-gowalla-h3_{ESTADO}.tensor")


In [None]:
print(f"poi-encoder-gowalla-h3_{ESTADO}.tensor")

## HGI

In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import pandas as pd
import numpy as np
import torch
# Read the per-place embeddings of the current state and order the rows by placeid.
out_df = (
    pd.read_csv(f"/content/drive/MyDrive/MTL_POI_Novo/data/output/{ESTADO.lower()}/embeddings-poi-encoder.csv")
    .sort_values("placeid")
    .reset_index(drop=True)
)

# Place ids as strings, in the same order as the embedding rows.
placeids = out_df["placeid"].astype(str).tolist()

# Embedding dimensions are the columns whose header is purely numeric.
emb_cols = [col for col in out_df.columns if col.isnumeric()]
E = out_df.loc[:, emb_cols].to_numpy(dtype=np.float32)

# Bundle the matrix with its ids and store it next to the benchmark code.
payload = {"embeddings": torch.from_numpy(E), "placeids": placeids}
torch.save(
    payload,
    "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt",
)

print("Arquivo salvo com sucesso!")
print(E.shape, "embeddings salvos")


In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

# Inputs for the HGI preprocessing step: the POI table and the region (census tract) table.
POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Graph structure and region data; the keys used below are edge_index, edge_weight,
# region_id, region_area, coarse_region_similarity and region_adjacency.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# POI-encoder embeddings: {"embeddings": tensor, "placeids": ids}.
loc_pt_path = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# poi_index.csv lists the POIs; row i of this file becomes row i of X.
# NOTE(review): presumably written by Preprocess above, in graph-node order — confirm.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# One embedding row per POI; unlike a zero-fill approach, a POI without embedding is an error.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

# Every node id referenced by an edge must have a feature row in X.
import numpy as np
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

# Assemble the PyG graph: node features plus weighted edges ...
g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
# ... and attach the region-level tensors as extra attributes.
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

# Make sure the output folder exists before anything is written into it.
os.makedirs("./data", exist_ok=True)

# Reload the graph object that was stored with torch.save as gowalla.pt.
g = torch.load("./gowalla.pt", map_location="cpu")

# Convert each tensor attribute of the graph to a NumPy array.
# Every pair below is (key used in the pickle, attribute name on `g`).
data_dict = {
    pkl_key: getattr(g, attr_name).detach().cpu().numpy()
    for pkl_key, attr_name in (
        ("node_features", "x"),
        ("edge_index", "edge_index"),
        ("edge_weight", "edge_weight"),
        ("region_id", "region_id"),
        ("region_area", "region_area"),
        ("coarse_region_similarity", "coarse_region_similarity"),
        ("region_adjacency", "region_adjacency"),
    )
}

# Persist the plain-NumPy dictionary for the training script.
with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Number of regions implied by each structure stored in data_dict.
R_from_id = int(np.max(data_dict["region_id"])) + 1          # largest region id + 1
R_area    = len(data_dict["region_area"])                    # one area entry per region
R_adj     = int(data_dict["region_adjacency"].max()) + 1     # largest id in the adjacency array + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]   # rows of the similarity matrix

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

# All four counts must agree. The message carries the actual values so a
# failure can be diagnosed without re-running the prints above.
assert R_from_id == R_area == R_adj == R_sim, (
    f"Desalinhado: id={R_from_id}, area={R_area}, adj={R_adj}, sim={R_sim}"
)


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 300 --device cpu --save_name gowalla_h3


# Alabama


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-alabama-census-tracts

In [None]:
ESTADO = "Alabama"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("alabama")

In [None]:
import os
import geopandas as gpd

# Collect every shapefile currently present in /content (where the download helper extracts).
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
if not arquivos:
    # baixar_shapefile_estado returns None without raising when the download fails, so
    # stop here with a clear message instead of max()'s "empty sequence" ValueError.
    raise FileNotFoundError(
        "Nenhum arquivo .shp encontrado em /content — verifique o download do shapefile."
    )
# Use the most recently modified .shp, i.e. the one extracted last.
arquivo = max(arquivos, key=os.path.getmtime)
# Reproject to WGS84 (EPSG:4326) before plotting.
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

# Keep only the tract identifier and its geometry; .copy() leaves `tl` untouched.
boroughs = tl[["GEOID", "geometry"]].copy()
# Replace each geometry object by its WKT text so it fits in a plain CSV column.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

# NOTE(review): diretorio_principal already ends with "/", so this path contains "//".
# That is harmless on POSIX file systems but worth normalising.
boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
# Drive folders that hold the two variants of the per-state check-in CSV.
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

# Both folders store the file under the same name; only the local destination differs.
filename = "checkins_{}.csv".format(ESTADO)
path_crus = "estados/crus/" + filename
path_sep = "estados/separated/" + filename

# Fetch each variant in turn, announcing it before the download starts.
for variant, folder_id, local_path in (
    ("crus", FOLDER_ID_CRUS, path_crus),
    ("separated", FOLDER_ID_SEPARATED, path_sep),
):
    print(f"Baixando ({variant})...")
    download_from_folder_by_name(drive_service, folder_id, filename, local_path)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Extract the ``"name"`` values from a stringified list of dicts.

    ``cell`` is evaluated with ``ast.literal_eval``. If evaluation fails, or the
    result is not a list, an empty list is returned. Otherwise the result holds
    the ``"name"`` value of every dict element that has that key; elements that
    are not dicts or lack the key are ignored.
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: anything that cannot be parsed counts as "no names".
        return []

    if not isinstance(parsed, list):
        return []

    names = []
    for entry in parsed:
        if isinstance(entry, dict) and "name" in entry:
            names.append(entry.get("name"))
    return names

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of ``lst`` when it is a non-empty list, else ``None``."""
    if not isinstance(lst, list):
        return None
    if len(lst) == 0:
        return None
    return lst[0]

# First parsed category name of each check-in row (None when the row has none).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# One fclass name per place: the most frequent value among that place's check-ins.
# Series.mode() returns its modes sorted, so ties resolve to the smallest value.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One category per place from the labeled frame, chosen the same way.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Per-place coordinates: mean latitude/longitude over the raw check-ins.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fall back to the labeled frame's coordinates only when the raw frame produced no rows at all.
if coords_raw.empty and lon_col_lab and lat_col_lab and (lon_col_lab in df_labeled.columns) and (lat_col_lab in df_labeled.columns):
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                           .mean()
                           .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

# Discard places whose mean coordinate is missing.
coords_raw = coords_raw.dropna()

# POI table keyed by placeid (cast to int), one row per place with usable coordinates.
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# Attach the names by placeid; places absent from either lookup get NaN here.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only POIs that have both an fclass name and a category name.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

# Point geometry from (longitude, latitude), looked up in the filtered POI order.
gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# Replace the geometry objects with their WKT text so the CSV stores plain strings.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer vocabularies: ids are assigned in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

# Map names to ids; -1 marks a name that is missing from the vocabulary.
gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# Drop any row carrying the -1 sentinel.
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

# Write only the id, the two integer codes and the WKT geometry to OUT_POIS.
pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# @title
import geopandas as gpd
from shapely.geometry import box

# NOTE(review): `df` is not defined in this cell or in the surrounding cells of this
# section; it must already exist in the kernel with "latitude"/"longitude" columns.
# Confirm where it comes from, otherwise this cell raises NameError on a fresh run.
lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# Bounding box of the points, grown by a 0.01 buffer (in coordinate units).
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

# Single-row region table holding the buffered box, tagged as EPSG:4326.
boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text and write it to the working directory.
# NOTE(review): later cells read "boroughs_area.csv" by this same relative name, so
# running this cell replaces any region file already there with a single box.
# Confirm that is intended.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO H3 FALSO


In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build a node-feature matrix aligned with the rows of a POI CSV.

    Args:
        pois_csv_path: CSV with a ``geometry`` column and the id column
            named by ``placeid_col``.
        loc_embed_pt_path: ``torch.save``d dict with ``"embeddings"`` (a 2-D
            tensor) and ``"placeids"`` (one id per embedding row).
        placeid_col: name of the id column in the CSV.

    Returns:
        ``(X, df_pois)`` where ``X`` is a float32 array whose i-th row is the
        embedding of the i-th CSV row, and ``df_pois`` is the loaded frame
        (object-dtype geometry is parsed from WKT).

    Raises:
        KeyError: if a POI id from the CSV has no embedding in the file.
    """
    poi_table = pd.read_csv(pois_csv_path)
    if poi_table["geometry"].dtype == object:
        poi_table["geometry"] = poi_table["geometry"].apply(wkt.loads)

    checkpoint = torch.load(loc_embed_pt_path, map_location="cpu")
    emb_matrix = checkpoint["embeddings"].detach().cpu().numpy()
    # Ids are compared as strings on both sides; for duplicate ids the last row wins.
    row_of = {str(pid): row for row, pid in enumerate(checkpoint["placeids"])}

    wanted = poi_table[placeid_col].astype(str).tolist()
    features = np.zeros((len(wanted), emb_matrix.shape[1]), dtype=np.float32)

    for pos, pid in enumerate(wanted):
        if pid not in row_of:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        features[pos] = emb_matrix[row_of[pid]]

    return features, poi_table


In [None]:
# @title
from POIEmbedding import POI2Vec
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Fresh POI2Vec instance that reloads walks from disk.
# NOTE(review): presumably the walks written by save_walks() in the previous cell; confirm.
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Unique (category, fclass) id pairs present in the POI table.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
# 64-dimensional embedding with one entry per second-class id.
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

# NOTE(review): shuffle=True and no seed is set in this cell, so results vary between
# runs; consider torch.manual_seed(...) if reproducibility matters.
loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Five passes over the loader; `e` and `i` are unused counters. The model returns a
# tuple whose first element is the loss being minimised.
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# Save the input-embedding copy under a state-specific key and file name.
# NOTE(review): the file name says "h3" although preprocessing in this notebook runs
# with h3=False; confirm the naming.
torch.save({f"in_embed_{ESTADO}.weight": model.clone_input_embedding()}, f"poi-encoder-gowalla-h3_{ESTADO}.tensor")


In [None]:
print(f"poi-encoder-gowalla-h3_{ESTADO}.tensor")

## HGI

In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import pandas as pd
import numpy as np
import torch
out_df = pd.read_csv(f"/content/drive/MyDrive/MTL_POI_Novo/data/output/{ESTADO.lower()}/embeddings-poi-encoder.csv")
out_df = out_df.sort_values("placeid").reset_index(drop=True)

placeids = out_df["placeid"].astype(str).tolist()

emb_cols = [c for c in out_df.columns if c.isnumeric()]
E = out_df[emb_cols].to_numpy(dtype=np.float32)

torch.save({
    "embeddings": torch.from_numpy(E),
    "placeids": placeids
}, f"/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt")

print("Arquivo salvo com sucesso!")
print(E.shape, "embeddings salvos")


In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

# Inputs for the HGI preprocessing, both relative to the working directory.
POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Graph structure and region arrays; no embedding file is passed (emb_filename=None),
# the node features are filled in below from the saved embedding table.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Embedding table: row i of E belongs to placeids[i] (ids compared as strings).
loc_pt_path = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# Node order of the graph.
# NOTE(review): poi_index.csv is presumably written by Preprocess above; confirm.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# Row i of X is the embedding of the i-th feature_id in `order`; a POI without an
# embedding aborts the cell.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

# Redundant re-import: np is already imported at the top of this cell.
import numpy as np
# Sanity check: every node id referenced by edge_index must have a row in X.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

# Assemble the PyG graph; region-level arrays are attached as extra attributes.
g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

os.makedirs("./data", exist_ok=True)

g = torch.load("./gowalla.pt", map_location="cpu")

data_dict = {
    "node_features": g.x.detach().cpu().numpy(),
    "edge_index": g.edge_index.detach().cpu().numpy(),
    "edge_weight": g.edge_weight.detach().cpu().numpy(),
    "region_id": g.region_id.detach().cpu().numpy(),
    "region_area": g.region_area.detach().cpu().numpy(),
    "coarse_region_similarity": g.coarse_region_similarity.detach().cpu().numpy(),
    "region_adjacency": g.region_adjacency.detach().cpu().numpy(),
}

with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

R_from_id = int(np.max(data_dict["region_id"])) + 1
R_area    = len(data_dict["region_area"])
R_adj     = int(data_dict["region_adjacency"].max()) + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

assert R_from_id == R_area == R_adj == R_sim, "Desalinhad"


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 300 --device cpu --save_name gowalla_h3


# Florida


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-florida-census-tracts

In [None]:
ESTADO = "Florida"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("florida")

In [None]:
import os
import geopandas as gpd

# Shapefiles currently in /content, which is where the download helper extracts its zip.
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
# The download helper returns None on every failure path, so a failed download is
# silent. Fail here with an actionable message instead of max()'s bare ValueError.
if not arquivos:
    raise FileNotFoundError(
        "Nenhum arquivo .shp encontrado em /content; verifique se o download do shapefile funcionou."
    )
# Use the most recently modified shapefile, i.e. the one just extracted.
# NOTE(review): if the download failed but an older .shp from another state is still in
# /content, that stale file is picked; confirm the plotted map is the expected state.
arquivo = max(arquivos, key=os.path.getmtime)
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

boroughs = tl[["GEOID", "geometry"]].copy()
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

filename = f"checkins_{ESTADO}.csv"

path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


# Input check-in CSVs for the current ESTADO and the output POI table (relative paths).
# NOTE(review): the constant names look inverted relative to the folders they point at:
# CHECKIN_NAO_CRU ("not raw") reads from estados/crus/ while CHECKIN_CRU ("raw") reads
# from estados/separated/. Confirm which file really carries `category` vs `spot_categories`.
CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

# df_labeled is read for its `category` column and df_raw for its `spot_categories`
# column later in this cell.
df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

# Coordinate column names: prefer "lng"/"lat", otherwise "longitude"/"latitude".
# The raw frame always gets a name (the fallback is not checked for existence);
# the labeled frame gets None when neither spelling is present.
lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Extract the "name" values from a stringified list of category dicts.

    ``cell`` is evaluated with ``ast.literal_eval``. When the result is a list,
    the ``"name"`` value of every dict element that has that key is returned,
    in order. Any other literal, or input that fails to evaluate, yields an
    empty list (evaluation errors are deliberately swallowed).
    """
    names = []
    try:
        parsed = ast.literal_eval(cell)
        if isinstance(parsed, list):
            for entry in parsed:
                if isinstance(entry, dict) and "name" in entry:
                    names.append(entry["name"])
    except Exception:
        return []
    return names

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of ``lst`` when it is a non-empty list, else ``None``."""
    if not isinstance(lst, list):
        return None
    if len(lst) == 0:
        return None
    return lst[0]

# First parsed category name of each check-in row (None when the row has none).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# One fclass name per place: the most frequent value among that place's check-ins.
# Series.mode() returns its modes sorted, so ties resolve to the smallest value.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One category per place from the labeled frame, chosen the same way.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Per-place coordinates: mean latitude/longitude over the raw check-ins.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fall back to the labeled frame's coordinates only when the raw frame produced no rows at all.
if coords_raw.empty and lon_col_lab and lat_col_lab and (lon_col_lab in df_labeled.columns) and (lat_col_lab in df_labeled.columns):
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                           .mean()
                           .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

# Discard places whose mean coordinate is missing.
coords_raw = coords_raw.dropna()

# POI table keyed by placeid (cast to int), one row per place with usable coordinates.
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# Attach the names by placeid; places absent from either lookup get NaN here.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only POIs that have both an fclass name and a category name.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

# Point geometry from (longitude, latitude), looked up in the filtered POI order.
gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# Replace the geometry objects with their WKT text so the CSV stores plain strings.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer vocabularies: ids are assigned in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

# Map names to ids; -1 marks a name that is missing from the vocabulary.
gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# Drop any row carrying the -1 sentinel.
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

# Write only the id, the two integer codes and the WKT geometry to OUT_POIS.
pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# @title
import geopandas as gpd
from shapely.geometry import box

# NOTE(review): `df` is not defined in this cell or in the surrounding cells of this
# section; it must already exist in the kernel with "latitude"/"longitude" columns.
# Confirm where it comes from, otherwise this cell raises NameError on a fresh run.
lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# Bounding box of the points, grown by a 0.01 buffer (in coordinate units).
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

# Single-row region table holding the buffered box, tagged as EPSG:4326.
boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text and write it to the working directory.
# NOTE(review): earlier in this section the census-tract table (GEOID + geometry) is
# written to f"{diretorio_principal}/boroughs_area.csv" and the notebook then `%cd`s into
# that directory, so this relative write targets the same file and replaces the tract
# regions with one box. Confirm this cell is meant to run.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO H3 FALSO


In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build a node-feature matrix aligned with the rows of a POI CSV.

    Args:
        pois_csv_path: CSV with a ``geometry`` column and the id column
            named by ``placeid_col``.
        loc_embed_pt_path: ``torch.save``d dict with ``"embeddings"`` (a 2-D
            tensor) and ``"placeids"`` (one id per embedding row).
        placeid_col: name of the id column in the CSV.

    Returns:
        ``(X, df_pois)`` where ``X`` is a float32 array whose i-th row is the
        embedding of the i-th CSV row, and ``df_pois`` is the loaded frame
        (object-dtype geometry is parsed from WKT).

    Raises:
        KeyError: if a POI id from the CSV has no embedding in the file.
    """
    poi_table = pd.read_csv(pois_csv_path)
    if poi_table["geometry"].dtype == object:
        poi_table["geometry"] = poi_table["geometry"].apply(wkt.loads)

    checkpoint = torch.load(loc_embed_pt_path, map_location="cpu")
    emb_matrix = checkpoint["embeddings"].detach().cpu().numpy()
    # Ids are compared as strings on both sides; for duplicate ids the last row wins.
    row_of = {str(pid): row for row, pid in enumerate(checkpoint["placeids"])}

    wanted = poi_table[placeid_col].astype(str).tolist()
    features = np.zeros((len(wanted), emb_matrix.shape[1]), dtype=np.float32)

    for pos, pid in enumerate(wanted):
        if pid not in row_of:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        features[pos] = emb_matrix[row_of[pid]]

    return features, poi_table


In [None]:
# @title
from POIEmbedding import POI2Vec
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Fresh POI2Vec instance that reloads walks from disk.
# NOTE(review): presumably the walks written by save_walks() in the previous cell; confirm.
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Unique (category, fclass) id pairs present in the POI table.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
# 64-dimensional embedding with one entry per second-class id.
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

# NOTE(review): shuffle=True and no seed is set in this cell, so results vary between
# runs; consider torch.manual_seed(...) if reproducibility matters.
loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Five passes over the loader; `e` and `i` are unused counters. The model returns a
# tuple whose first element is the loss being minimised.
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# Save the input-embedding copy under a state-specific key and file name.
# NOTE(review): the file name says "h3" although preprocessing in this notebook runs
# with h3=False; confirm the naming.
torch.save({f"in_embed_{ESTADO}.weight": model.clone_input_embedding()}, f"poi-encoder-gowalla-h3_{ESTADO}.tensor")


In [None]:
print(f"poi-encoder-gowalla-h3_{ESTADO}.tensor")

## HGI

In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import pandas as pd
import numpy as np
import torch
out_df = pd.read_csv(f"/content/drive/MyDrive/MTL_POI_Novo/data/output/{ESTADO.lower()}/embeddings-poi-encoder.csv")
out_df = out_df.sort_values("placeid").reset_index(drop=True)

placeids = out_df["placeid"].astype(str).tolist()

emb_cols = [c for c in out_df.columns if c.isnumeric()]
E = out_df[emb_cols].to_numpy(dtype=np.float32)

torch.save({
    "embeddings": torch.from_numpy(E),
    "placeids": placeids
}, f"/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt")

print("Arquivo salvo com sucesso!")
print(E.shape, "embeddings salvos")


In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

# Inputs for the HGI preprocessing, both relative to the working directory.
POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Graph structure and region arrays; no embedding file is passed (emb_filename=None),
# the node features are filled in below from the saved embedding table.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Embedding table: row i of E belongs to placeids[i] (ids compared as strings).
loc_pt_path = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# Node order of the graph.
# NOTE(review): poi_index.csv is presumably written by Preprocess above; confirm.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# Row i of X is the embedding of the i-th feature_id in `order`; a POI without an
# embedding aborts the cell.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

# Redundant re-import: np is already imported at the top of this cell.
import numpy as np
# Sanity check: every node id referenced by edge_index must have a row in X.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

# Assemble the PyG graph; region-level arrays are attached as extra attributes.
g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

os.makedirs("./data", exist_ok=True)

g = torch.load("./gowalla.pt", map_location="cpu")

data_dict = {
    "node_features": g.x.detach().cpu().numpy(),
    "edge_index": g.edge_index.detach().cpu().numpy(),
    "edge_weight": g.edge_weight.detach().cpu().numpy(),
    "region_id": g.region_id.detach().cpu().numpy(),
    "region_area": g.region_area.detach().cpu().numpy(),
    "coarse_region_similarity": g.coarse_region_similarity.detach().cpu().numpy(),
    "region_adjacency": g.region_adjacency.detach().cpu().numpy(),
}

with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

R_from_id = int(np.max(data_dict["region_id"])) + 1
R_area    = len(data_dict["region_area"])
R_adj     = int(data_dict["region_adjacency"].max()) + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

assert R_from_id == R_area == R_adj == R_sim, "Desalinhad"


In [None]:
%pip uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 300 --device cuda --save_name gowalla_h3


# California


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-california-census-tracts

In [None]:
ESTADO = "California"

In [None]:
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
baixar_shapefile_estado("california")

In [None]:
import os
import geopandas as gpd

# Shapefiles currently in /content, which is where the download helper extracts its zip.
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
# The download helper returns None on every failure path, so a failed download is
# silent. Fail here with an actionable message instead of max()'s bare ValueError.
if not arquivos:
    raise FileNotFoundError(
        "Nenhum arquivo .shp encontrado em /content; verifique se o download do shapefile funcionou."
    )
# Use the most recently modified shapefile, i.e. the one just extracted.
# NOTE(review): if the download failed but an older .shp from another state is still in
# /content, that stale file is picked; confirm the plotted map is the expected state.
arquivo = max(arquivos, key=os.path.getmtime)
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: Passar arquivo para HGI (CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt

boroughs = tl[["GEOID", "geometry"]].copy()
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)

boroughs.to_csv(f"{diretorio_principal}/boroughs_area.csv", index=False)


In [None]:
boroughs

In [None]:
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
%cd {diretorio_principal}

## POI Embedding

In [None]:
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

filename = f"checkins_{ESTADO}.csv"

path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

print("Baixando (crus)...")
download_from_folder_by_name(drive_service,FOLDER_ID_CRUS, filename, path_crus)

print("Baixando (separated)...")
download_from_folder_by_name(drive_service,FOLDER_ID_SEPARATED, filename, path_sep)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


# Input check-in CSVs for the current ESTADO and the output POI table (relative paths).
# NOTE(review): the constant names look inverted relative to the folders they point at:
# CHECKIN_NAO_CRU ("not raw") reads from estados/crus/ while CHECKIN_CRU ("raw") reads
# from estados/separated/. Confirm which file really carries `category` vs `spot_categories`.
CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

# df_labeled is read for its `category` column and df_raw for its `spot_categories`
# column later in this cell.
df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

# Coordinate column names: prefer "lng"/"lat", otherwise "longitude"/"latitude".
# The raw frame always gets a name (the fallback is not checked for existence);
# the labeled frame gets None when neither spelling is present.
lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Extract the "name" values from a stringified list of category dicts.

    ``cell`` is evaluated with ``ast.literal_eval``. When the result is a list,
    the ``"name"`` value of every dict element that has that key is returned,
    in order. Any other literal, or input that fails to evaluate, yields an
    empty list (evaluation errors are deliberately swallowed).
    """
    names = []
    try:
        parsed = ast.literal_eval(cell)
        if isinstance(parsed, list):
            for entry in parsed:
                if isinstance(entry, dict) and "name" in entry:
                    names.append(entry["name"])
    except Exception:
        return []
    return names

df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of ``lst`` when it is a non-empty list, else ``None``."""
    if not isinstance(lst, list):
        return None
    if len(lst) == 0:
        return None
    return lst[0]

# First parsed category name of each check-in row (None when the row has none).
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# One fclass name per place: the most frequent value among that place's check-ins.
# Series.mode() returns its modes sorted, so ties resolve to the smallest value.
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# One category per place from the labeled frame, chosen the same way.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Per-place coordinates: mean latitude/longitude over the raw check-ins.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fall back to the labeled frame's coordinates only when the raw frame produced no rows at all.
if coords_raw.empty and lon_col_lab and lat_col_lab and (lon_col_lab in df_labeled.columns) and (lat_col_lab in df_labeled.columns):
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                           .mean()
                           .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

# Discard places whose mean coordinate is missing.
coords_raw = coords_raw.dropna()

# POI table keyed by placeid (cast to int), one row per place with usable coordinates.
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# Attach the names by placeid; places absent from either lookup get NaN here.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only POIs that have both an fclass name and a category name.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

# Point geometry from (longitude, latitude), looked up in the filtered POI order.
gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# Replace the geometry objects with their WKT text so the CSV stores plain strings.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer vocabularies: ids are assigned in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

# Map names to ids; -1 marks a name that is missing from the vocabulary.
gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# Drop any row carrying the -1 sentinel.
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

# Write only the id, the two integer codes and the WKT geometry to OUT_POIS.
pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# @title
import geopandas as gpd
from shapely.geometry import box

# NOTE(review): `df` is not defined in this cell or in the surrounding cells of this
# section; it must already exist in the kernel with "latitude"/"longitude" columns.
# Confirm where it comes from, otherwise this cell raises NameError on a fresh run.
lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
# Bounding box of the points, grown by a 0.01 buffer (in coordinate units).
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

# Single-row region table holding the buffered box, tagged as EPSG:4326.
boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Store the geometry as WKT text and write it to the working directory.
# NOTE(review): earlier in this section the census-tract table (GEOID + geometry) is
# written to f"{diretorio_principal}/boroughs_area.csv" and the notebook then `%cd`s into
# that directory, so this relative write targets the same file and replaces the tract
# regions with one box. Confirm this cell is meant to run.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO H3 FALSO


In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build a node-feature matrix aligned with the rows of a POI CSV.

    Args:
        pois_csv_path: CSV with a ``geometry`` column and the id column
            named by ``placeid_col``.
        loc_embed_pt_path: ``torch.save``d dict with ``"embeddings"`` (a 2-D
            tensor) and ``"placeids"`` (one id per embedding row).
        placeid_col: name of the id column in the CSV.

    Returns:
        ``(X, df_pois)`` where ``X`` is a float32 array whose i-th row is the
        embedding of the i-th CSV row, and ``df_pois`` is the loaded frame
        (object-dtype geometry is parsed from WKT).

    Raises:
        KeyError: if a POI id from the CSV has no embedding in the file.
    """
    poi_table = pd.read_csv(pois_csv_path)
    if poi_table["geometry"].dtype == object:
        poi_table["geometry"] = poi_table["geometry"].apply(wkt.loads)

    checkpoint = torch.load(loc_embed_pt_path, map_location="cpu")
    emb_matrix = checkpoint["embeddings"].detach().cpu().numpy()
    # Ids are compared as strings on both sides; for duplicate ids the last row wins.
    row_of = {str(pid): row for row, pid in enumerate(checkpoint["placeids"])}

    wanted = poi_table[placeid_col].astype(str).tolist()
    features = np.zeros((len(wanted), emb_matrix.shape[1]), dtype=np.float32)

    for pos, pid in enumerate(wanted):
        if pid not in row_of:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. ")
        features[pos] = emb_matrix[row_of[pid]]

    return features, poi_table


In [None]:
# @title
from POIEmbedding import POI2Vec
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Fresh POI2Vec instance that reloads walks from disk.
# NOTE(review): presumably the walks written by save_walks() in the previous cell; confirm.
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Unique (category, fclass) id pairs present in the POI table.
second_class_hierarchy_pairs = list(set([tuple(x) for x in poi2vec.pois[["category","fclass"]].to_numpy()]))
dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5
)
# 64-dimensional embedding with one entry per second-class id.
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8
)

# NOTE(review): shuffle=True and no seed is set in this cell, so results vary between
# runs; consider torch.manual_seed(...) if reproducibility matters.
loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)
# Five passes over the loader; `e` and `i` are unused counters. The model returns a
# tuple whose first element is the loss being minimised.
for e in range(5):
    for i,(inp,pos,neg) in enumerate(loader):
        opt.zero_grad()
        loss,_ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# Save the input-embedding copy under a state-specific key and file name.
# NOTE(review): the file name says "h3" although preprocessing in this notebook runs
# with h3=False; confirm the naming.
torch.save({f"in_embed_{ESTADO}.weight": model.clone_input_embedding()}, f"poi-encoder-gowalla-h3_{ESTADO}.tensor")


In [None]:
print(f"poi-encoder-gowalla-h3_{ESTADO}.tensor")

## HGI

In [None]:
import sys
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

sys.path.insert(0, module_dir)

In [None]:
import pandas as pd
import numpy as np
import torch
out_df = pd.read_csv(f"/content/drive/MyDrive/MTL_POI_Novo/data/output/{ESTADO.lower()}/embeddings-poi-encoder.csv")
out_df = out_df.sort_values("placeid").reset_index(drop=True)

placeids = out_df["placeid"].astype(str).tolist()

emb_cols = [c for c in out_df.columns if c.isnumeric()]
E = out_df[emb_cols].to_numpy(dtype=np.float32)

torch.save({
    "embeddings": torch.from_numpy(E),
    "placeids": placeids
}, f"/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt")

print("Arquivo salvo com sucesso!")
print(E.shape, "embeddings salvos")


In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

# Inputs for the HGI preprocessing, both relative to the working directory.
POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Graph structure and region arrays; no embedding file is passed (emb_filename=None),
# the node features are filled in below from the saved embedding table.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Embedding table: row i of E belongs to placeids[i] (ids compared as strings).
loc_pt_path = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = [str(p) for p in blob["placeids"]]
placeid2idx = {pid: i for i, pid in enumerate(placeids)}
D = E.shape[1]

# Node order of the graph.
# NOTE(review): poi_index.csv is presumably written by Preprocess above; confirm.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

# Row i of X is the embedding of the i-th feature_id in `order`; a POI without an
# embedding aborts the cell.
X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"].tolist()):
    idx = placeid2idx.get(pid)
    if idx is None:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[idx]

# Redundant re-import: np is already imported at the top of this cell.
import numpy as np
# Sanity check: every node id referenced by edge_index must have a row in X.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

# Assemble the PyG graph; region-level arrays are attached as extra attributes.
g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
g.region_id  = torch.tensor(data_dict['region_id'], dtype=torch.long)
g.region_area = torch.tensor(data_dict['region_area'], dtype=torch.float32)
g.coarse_region_similarity = torch.tensor(data_dict['coarse_region_similarity'], dtype=torch.float32)
g.region_adjacency = torch.tensor(data_dict['region_adjacency'], dtype=torch.long)

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

os.makedirs("./data", exist_ok=True)

# gowalla.pt is a pickled torch_geometric `Data` object written by this notebook,
# not a plain tensor/state dict. Newer PyTorch releases default `torch.load` to
# weights_only=True, which refuses such objects, so request full unpickling
# explicitly. Only do this for files you produced yourself.
g = torch.load("./gowalla.pt", map_location="cpu", weights_only=False)

# Flatten the graph into plain NumPy arrays under the keys written to the pickle.
data_dict = {
    "node_features": g.x.detach().cpu().numpy(),
    "edge_index": g.edge_index.detach().cpu().numpy(),
    "edge_weight": g.edge_weight.detach().cpu().numpy(),
    "region_id": g.region_id.detach().cpu().numpy(),
    "region_area": g.region_area.detach().cpu().numpy(),
    "coarse_region_similarity": g.coarse_region_similarity.detach().cpu().numpy(),
    "region_adjacency": g.region_adjacency.detach().cpu().numpy(),
}

with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Sanity check: every region-level array in data_dict must imply the same number of regions.
R_from_id = int(np.max(data_dict["region_id"])) + 1
R_area    = len(data_dict["region_area"])
# NOTE(review): max index + 1 undercounts if the highest-numbered region never
# appears in region_adjacency — confirm that cannot happen for this data.
R_adj     = int(data_dict["region_adjacency"].max()) + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

# Report all four counts on failure so the misaligned array is obvious.
assert R_from_id == R_area == R_adj == R_sim, (
    "Desalinhado: "
    f"region_id={R_from_id}, region_area={R_area}, "
    f"region_adjacency={R_adj}, coarse_region_similarity={R_sim}"
)


In [None]:
%pip uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
# Launch the HGI training script in a subprocess on the GPU for city "gowalla"
# (64-dim output, alpha 0.5, 4 attention heads, 300 epochs).
# NOTE(review): the save name says "h3" although the preprocessing cells pass h3=False — confirm the label.
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 300 --device cuda --save_name gowalla_h3


# Texas


Census: https://catalog.data.gov/dataset/tiger-line-shapefile-2021-state-texas-census-tracts

In [None]:
# State processed by the cells below; interpolated into check-in file names,
# output tensor names and (lower-cased) the embeddings CSV path.
ESTADO = "Texas"

In [None]:
# Root of the benchmark checkout on the mounted Google Drive.
# It ends with "/", so f-strings that append "/..." produce a double slash in the path.
diretorio_principal = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/"

In [None]:
# Download the Texas census-tract shapefile and extract it into /content/.
# The helper returns None without raising when a request fails or no .zip link is found.
baixar_shapefile_estado("texas")

In [None]:
import os
import geopandas as gpd

# Pick the most recently modified shapefile in /content (the one just extracted).
arquivos = [os.path.join("/content", f) for f in os.listdir("/content") if f.endswith(".shp")]
if not arquivos:
    # The download helper fails silently (returns None), so without this guard
    # the cell would die with an opaque "max() arg is an empty sequence".
    raise FileNotFoundError(
        "Nenhum arquivo .shp encontrado em /content. "
        "Verifique se baixar_shapefile_estado concluiu o download."
    )
arquivo = max(arquivos, key=os.path.getmtime)

# Reproject to WGS84 (EPSG:4326) and draw the tract outlines.
tl = gpd.read_file(arquivo).to_crs("EPSG:4326")
tl.plot(edgecolor="black")


In [None]:
tl[["GEOID","geometry"]] ##TODO: pass this file to HGI (as CSV)

## BORO -> GEOID

In [None]:
import geopandas as gpd
from shapely import wkt


def _geometry_to_wkt(geom):
    """Return the WKT text of a single geometry."""
    return geom.wkt


# Keep only the tract id and its polygon, serialise the polygons as WKT text
# and write the table next to the benchmark code.
boroughs = tl[["GEOID", "geometry"]].copy()
boroughs["geometry"] = boroughs["geometry"].apply(_geometry_to_wkt)

boroughs_csv_path = f"{diretorio_principal}/boroughs_area.csv"
boroughs.to_csv(boroughs_csv_path, index=False)


In [None]:
# Display the GEOID / WKT-geometry table built in the previous cell.
boroughs

In [None]:
# Delete every dist-packages entry matching "~orch*" — presumably leftovers of an
# interrupted torch uninstall — then empty pip's download cache.
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!pip cache purge


In [None]:
# Upgrade the packaging tools, remove any installed torch / PyG packages, then
# install the geospatial and scientific dependencies (unpinned, bypassing the pip cache).
%pip -q install -U pip setuptools wheel
%pip -q uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib
%pip -q install --no-cache-dir geopandas shapely libpysal h3 h3ronpy pyarrow scipy scikit-learn


In [None]:
# Install CPU-only PyTorch pinned to the 2.4.0 family, then the PyG extension
# wheels from the index built for that exact torch version (torch-2.4.0+cpu).
%pip -q install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
%pip -q install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.4.0+cpu.html


In [None]:
# Make the benchmark root the working directory; later cells use relative paths
# such as "pois_gowalla.csv", "boroughs_area.csv" and "estados/...".
%cd {diretorio_principal}

## POI Embedding

In [None]:
# Google Drive folders that each hold one checkins_<state>.csv variant.
FOLDER_ID_CRUS = "1cV807NNGn4gSDX-7fkJ83rlr0nRo4y89"
FOLDER_ID_SEPARATED = "1XUWhd59YDe8dSrTb6eZlLvhVcLSGpZ7n"

filename = f"checkins_{ESTADO}.csv"

path_crus = f"estados/crus/{filename}"
path_sep  = f"estados/separated/{filename}"

# Fetch the same file name from both folders into their local destinations.
for label, folder_id, dest_path in (
    ("crus", FOLDER_ID_CRUS, path_crus),
    ("separated", FOLDER_ID_SEPARATED, path_sep),
):
    print(f"Baixando ({label})...")
    download_from_folder_by_name(drive_service, folder_id, filename, dest_path)

print("Concluído:", path_crus, "e", path_sep)


In [None]:
import pandas as pd, numpy as np, ast
import geopandas as gpd
from shapely.geometry import Point


# Input check-in files and the output POI table.
# NOTE(review): the constant names look swapped relative to the folders
# ("NAO_CRU" = "not raw" points at estados/crus, "CRU" = "raw" at estados/separated).
# The code below reads `category` from the former and `spot_categories` from the
# latter — confirm which folder actually holds which schema.
CHECKIN_NAO_CRU = f"estados/crus/checkins_{ESTADO}.csv"
CHECKIN_CRU     = f"estados/separated/checkins_{ESTADO}.csv"
OUT_POIS        = "pois_gowalla.csv"

df_labeled = pd.read_csv(CHECKIN_NAO_CRU)
df_raw     = pd.read_csv(CHECKIN_CRU)

# Coordinate column names: prefer "lng"/"lat", otherwise fall back to
# "longitude"/"latitude". For df_raw the fallback is assumed to exist; for
# df_labeled the name is None when neither spelling is present.
lon_col_raw = "lng" if "lng" in df_raw.columns else "longitude"
lat_col_raw = "lat" if "lat" in df_raw.columns else "latitude"
lon_col_lab = "lng" if "lng" in df_labeled.columns else ("longitude" if "longitude" in df_labeled.columns else None)
lat_col_lab = "lat" if "lat" in df_labeled.columns else ("latitude" if "latitude" in df_labeled.columns else None)

def parse_names(cell):
    """Extract the "name" values from a stringified list of category dicts.

    `cell` is evaluated as a Python literal. When the result is a list, the
    "name" value of every dict element that has that key is returned in order.
    Anything that cannot be parsed, or does not parse to a list, yields [].
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: unparseable cells count as "no categories".
        return []
    if not isinstance(parsed, list):
        return []
    names = []
    for entry in parsed:
        if isinstance(entry, dict) and "name" in entry:
            names.append(entry.get("name"))
    return names

# Parse each spot_categories cell into a list of category names; missing cells
# are replaced by the literal "[]" first, so they parse to an empty list.
df_raw["__cat_names"] = df_raw["spot_categories"].fillna("[]").apply(parse_names)

def first_or_none(lst):
    """Return the first element of a non-empty list; None for anything else."""
    if not isinstance(lst, list) or len(lst) == 0:
        return None
    return lst[0]

# The first parsed category name of each check-in is its fine-grained class.
df_raw["__fclass_name"] = df_raw["__cat_names"].apply(first_or_none)

# Per place: most frequent fine-grained class name (first mode wins on ties).
fclass_by_place = (df_raw.dropna(subset=["__fclass_name"])
                          .groupby("placeid")["__fclass_name"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Per place: most frequent coarse category, taken from the labeled file.
cat_by_place = (df_labeled.dropna(subset=["category"])
                          .groupby("placeid")["category"]
                          .agg(lambda s: s.mode().iat[0] if not s.mode().empty else s.iloc[0]))

# Per place: mean coordinates over all of its check-ins.
coords_raw = (df_raw.groupby("placeid")[[lat_col_raw, lon_col_raw]]
                    .mean()
                    .rename(columns={lat_col_raw: "latitude", lon_col_raw: "longitude"}))

# Fall back to the labeled file's coordinates only when the raw file produced no rows at all.
if coords_raw.empty and lon_col_lab and lat_col_lab and (lon_col_lab in df_labeled.columns) and (lat_col_lab in df_labeled.columns):
    coords_raw = (df_labeled.groupby("placeid")[[lat_col_lab, lon_col_lab]]
                           .mean()
                           .rename(columns={lat_col_lab: "latitude", lon_col_lab: "longitude"}))

coords_raw = coords_raw.dropna()

# One candidate POI per place id that has usable coordinates.
pois = pd.DataFrame({"feature_id": coords_raw.index})
pois["feature_id"] = pois["feature_id"].astype(int)

# Look up both labels by place id; places absent from a lookup get NaN.
pois["fclass_name"]   = fclass_by_place.reindex(pois["feature_id"]).values
pois["category_name"] = cat_by_place.reindex(pois["feature_id"]).values

# Keep only POIs that have both a fine-grained class and a coarse category.
pois = pois.dropna(subset=["fclass_name", "category_name"]).reset_index(drop=True)

# Build point geometries (x = longitude, y = latitude) in WGS84 for the surviving POIs.
gdf = gpd.GeoDataFrame(
    pois,
    geometry=gpd.points_from_xy(coords_raw.loc[pois["feature_id"], "longitude"].values,
                                coords_raw.loc[pois["feature_id"], "latitude"].values),
    crs="EPSG:4326"
)
# Replace the geometry objects by their WKT text so the column serialises to CSV.
gdf["geometry"] = gdf.geometry.apply(lambda p: p.wkt)

# Integer vocabularies: ids are assigned in order of first appearance.
fclass_vocab = {n:i for i,n in enumerate(pd.Series(gdf["fclass_name"]).dropna().unique())}
cat_vocab    = {n:i for i,n in enumerate(pd.Series(gdf["category_name"]).dropna().unique())}

# Encode the names; anything missing from a vocabulary maps to -1.
gdf["fclass"]   = gdf["fclass_name"].map(lambda n: fclass_vocab.get(n, -1)).astype(int)
gdf["category"] = gdf["category_name"].map(lambda n: cat_vocab.get(n, -1)).astype(int)

# Drop rows that could not be encoded.
gdf = gdf[(gdf["fclass"]>=0) & (gdf["category"]>=0)].reset_index(drop=True)

# Final POI table written to OUT_POIS.
pois_out = gdf[["feature_id", "category", "fclass", "geometry"]].copy()
pois_out.to_csv(OUT_POIS, index=False)



In [None]:
# @title
# Write a single-region boroughs_area.csv: the bounding box of all points in `df`,
# padded by a 0.01 buffer (degrees, given the EPSG:4326 coordinates).
# NOTE(review): `df` is not defined anywhere in the Texas section, so this cell
# depends on kernel state left by earlier cells — confirm which frame is intended.
# NOTE(review): with the working directory set to the benchmark root, this appears
# to overwrite the census-tract boroughs_area.csv written earlier (and drops the
# GEOID column) — confirm this cell is still meant to run.
import geopandas as gpd
from shapely.geometry import box

lat_min, lat_max = df["latitude"].min(), df["latitude"].max()
lon_min, lon_max = df["longitude"].min(), df["longitude"].max()
area = box(lon_min, lat_min, lon_max, lat_max).buffer(0.01)

boroughs = gpd.GeoDataFrame(geometry=[area], crs="EPSG:4326")

# Serialise the polygon as WKT text before writing the CSV.
boroughs["geometry"] = boroughs["geometry"].apply(lambda g: g.wkt)
boroughs.to_csv("boroughs_area.csv", index=False)


In [None]:
import sys
# Directory of the poi-encoder baseline inside the benchmark checkout.
module_dir = f'{diretorio_principal}/region-embedding/baselines/poi-encoder'

# Prepend it to the import path; presumably this is what makes the later
# `POIEmbedding` and `model` imports resolve — TODO confirm.
sys.path.insert(0, module_dir)

In [None]:
from POIEmbedding import PreProcess

# Run the poi-encoder preprocessing on the POI table and region file, with H3 disabled.
PreProcess("pois_gowalla.csv", "boroughs_area.csv", h3=False).run() ##TODO H3 is False


In [None]:
import pandas as pd
import numpy as np
import torch
from shapely import wkt

def build_node_features_from_location_encoder(pois_csv_path: str,
                                              loc_embed_pt_path: str,
                                              placeid_col: str = "feature_id"):
    """Build a node-feature matrix for the POIs of a CSV from saved embeddings.

    Args:
        pois_csv_path: CSV with one row per POI. It must contain ``placeid_col``
            and a ``geometry`` column; textual geometries are parsed as WKT.
        loc_embed_pt_path: File written with ``torch.save`` holding a dict with
            "embeddings" (2-D tensor) and "placeids" (one id per embedding row).
        placeid_col: Column of the CSV holding the place id. Ids are compared
            as strings on both sides.

    Returns:
        ``(X, df_pois)`` where ``X`` is a float32 array of shape
        ``(len(df_pois), embedding_dim)`` whose row i is the embedding of the
        i-th POI of the CSV, and ``df_pois`` is the loaded POI table.

    Raises:
        KeyError: if a POI of the CSV has no embedding in the saved blob.
    """
    df_pois = pd.read_csv(pois_csv_path)
    geometry = df_pois["geometry"]
    # Text columns are `object` in classic pandas but a dedicated string dtype
    # when string inference is enabled; the WKT must be parsed in both cases.
    if pd.api.types.is_object_dtype(geometry) or pd.api.types.is_string_dtype(geometry):
        df_pois["geometry"] = geometry.apply(wkt.loads)

    blob = torch.load(loc_embed_pt_path, map_location="cpu")
    E = blob["embeddings"].detach().cpu().numpy()
    placeid2idx = {str(pid): i for i, pid in enumerate(blob["placeids"])}

    # Resolve every POI to its embedding row first, failing on the first gap.
    ids = df_pois[placeid_col].astype(str).tolist()
    rows = []
    for pid in ids:
        idx = placeid2idx.get(pid)
        if idx is None:
            raise KeyError(f"placeid {pid} não encontrado em {loc_embed_pt_path}. "
                           f"Gere embeddings para todos os POIs.")
        rows.append(idx)

    D = E.shape[1]
    X = np.zeros((len(ids), D), dtype=np.float32)
    if rows:
        # One gather instead of a per-row Python copy; the assignment casts to float32.
        X[:] = E[rows]

    return X, df_pois


In [None]:
# @title
from POIEmbedding import POI2Vec
# Train POI2Vec with its default settings and save its walks — presumably the
# walks that the next cell loads again via read_walks(); TODO confirm.
p = POI2Vec()
p.train()
p.save_walks()


In [None]:
# @title
from POIEmbedding import POI2Vec
from model import POISet, EmbeddingModel
import torch, torch.utils.data as tud

# Load the previously saved walks and derive the global second-class walks.
poi2vec = POI2Vec()
poi2vec.read_walks()
poi2vec.get_global_second_class_walks()

# Distinct (category, fclass) pairs observed among the POIs.
hierarchy_rows = poi2vec.pois[["category","fclass"]].to_numpy()
second_class_hierarchy_pairs = list(set(map(tuple, hierarchy_rows)))

dataset = POISet(
    poi2vec.second_class_number,
    poi2vec.second_class_walks,
    poi2vec.global_second_class_walks,
    k=5,
)
model = EmbeddingModel(
    vocab_size=poi2vec.second_class_number,
    embed_size=64,
    second_class_hierarchy_pairs=second_class_hierarchy_pairs,
    le_lambda=1e-8,
)

loader = tud.DataLoader(dataset, batch_size=2048, shuffle=True)
opt = torch.optim.Adam(model.parameters(), lr=5e-2)

# Five passes over the shuffled batches, one optimiser step per batch.
for e in range(5):
    for i, batch in enumerate(loader):
        inp, pos, neg = batch
        opt.zero_grad()
        loss, _ = model(inp.long(), pos.long(), neg.long())
        loss.backward()
        opt.step()

# Persist a copy of the input-embedding weights under a state-specific key and file name.
weights_key = f"in_embed_{ESTADO}.weight"
torch.save({weights_key: model.clone_input_embedding()}, f"poi-encoder-gowalla-h3_{ESTADO}.tensor")


In [None]:
# Show the file name under which the trained input-embedding weights were saved for this state.
print(f"poi-encoder-gowalla-h3_{ESTADO}.tensor")

## HGI

In [None]:
import sys
# Directory of the HGI preprocessing sources inside the benchmark checkout.
module_dir = f'{diretorio_principal}/region-embedding/baselines/HGI/preprocess'

# Prepend it to the import path; presumably this is what lets the later
# `from main import Preprocess` resolve to the HGI module — TODO confirm.
sys.path.insert(0, module_dir)

In [None]:
import pandas as pd
import numpy as np
import torch

# Convert the per-state POI embedding CSV into a torch blob holding the
# embedding matrix plus the place ids (as strings) in the same row order.
csv_path = f"/content/drive/MyDrive/MTL_POI_Novo/data/output/{ESTADO.lower()}/embeddings-poi-encoder.csv"
pt_path = f"/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt"

# Rows are ordered by placeid so that `placeids[i]` labels row i of `E`.
out_df = (
    pd.read_csv(csv_path)
    .sort_values("placeid")
    .reset_index(drop=True)
)

placeids = out_df["placeid"].astype(str).tolist()

# Embedding dimensions are the columns whose header is purely numeric.
emb_cols = [col for col in out_df.columns if col.isnumeric()]
E = out_df[emb_cols].to_numpy(dtype=np.float32)

payload = {
    "embeddings": torch.from_numpy(E),
    "placeids": placeids,
}
torch.save(payload, pt_path)

print("Arquivo salvo com sucesso!")
print(E.shape, "embeddings salvos")


In [None]:
import torch, numpy as np, pandas as pd
from torch_geometric.data import Data
from main import Preprocess

POIS = "pois_gowalla.csv"
REGS = "boroughs_area.csv"

# Graph structure and region metadata come from the HGI preprocessing step.
data_dict = Preprocess(POIS, REGS, emb_filename=None, h3=False).get_data_torch()

# Pre-computed POI embeddings, keyed by place id (compared as strings).
loc_pt_path = "/content/drive/MyDrive/region-embedding-benchmark-main/region-embedding-benchmark-main/poi_embeddings_encoder.pt"
blob = torch.load(loc_pt_path, map_location="cpu")
E = blob["embeddings"].detach().cpu().numpy()
placeids = list(map(str, blob["placeids"]))
placeid2idx = dict(zip(placeids, range(len(placeids))))
D = E.shape[1]

# poi_index.csv gives the node order; row i of X is the embedding of its i-th feature_id.
order = pd.read_csv("poi_index.csv")
order["feature_id"] = order["feature_id"].astype(str)

X = np.zeros((len(order), D), dtype=np.float32)
for i, pid in enumerate(order["feature_id"]):
    if pid not in placeid2idx:
        raise KeyError(f"placeid {pid} não encontrado em {loc_pt_path}. Gere embeddings para todos os POIs.")
    X[i] = E[placeid2idx[pid]]

# Every edge endpoint must be a valid row of X.
ei = np.asarray(data_dict['edge_index'])
assert ei.max() < X.shape[0], "edge_index referencia nó >= len(X) — ordem quebrou"

g = Data(
    x=torch.tensor(X, dtype=torch.float32),
    edge_index=torch.tensor(data_dict['edge_index'], dtype=torch.long),
    edge_weight=torch.tensor(data_dict['edge_weight'], dtype=torch.float32),
)
# Attach the region-level tensors, each with the dtype it is stored as.
for attr_name, attr_dtype in (
    ('region_id', torch.long),
    ('region_area', torch.float32),
    ('coarse_region_similarity', torch.float32),
    ('region_adjacency', torch.long),
):
    setattr(g, attr_name, torch.tensor(data_dict[attr_name], dtype=attr_dtype))

torch.save(g, "gowalla.pt")


In [None]:
import os, pickle as pkl, torch
from torch_geometric.data import Data

os.makedirs("./data", exist_ok=True)

# gowalla.pt is a pickled torch_geometric `Data` object written by this notebook,
# not a plain tensor/state dict. Newer PyTorch releases default `torch.load` to
# weights_only=True, which refuses such objects, so request full unpickling
# explicitly. Only do this for files you produced yourself.
g = torch.load("./gowalla.pt", map_location="cpu", weights_only=False)

# Flatten the graph into plain NumPy arrays under the keys written to the pickle.
data_dict = {
    "node_features": g.x.detach().cpu().numpy(),
    "edge_index": g.edge_index.detach().cpu().numpy(),
    "edge_weight": g.edge_weight.detach().cpu().numpy(),
    "region_id": g.region_id.detach().cpu().numpy(),
    "region_area": g.region_area.detach().cpu().numpy(),
    "coarse_region_similarity": g.coarse_region_similarity.detach().cpu().numpy(),
    "region_adjacency": g.region_adjacency.detach().cpu().numpy(),
}

with open("./data/gowalla_hgi_data.pkl", "wb") as f:
    pkl.dump(data_dict, f)



In [None]:
import numpy as np

# Sanity check: every region-level array in data_dict must imply the same number of regions.
R_from_id = int(np.max(data_dict["region_id"])) + 1
R_area    = len(data_dict["region_area"])
# NOTE(review): max index + 1 undercounts if the highest-numbered region never
# appears in region_adjacency — confirm that cannot happen for this data.
R_adj     = int(data_dict["region_adjacency"].max()) + 1
R_sim     = data_dict["coarse_region_similarity"].shape[0]

print("R from id :", R_from_id)
print("R area    :", R_area)
print("R adj     :", R_adj)
print("R sim     :", R_sim)

# Report all four counts on failure so the misaligned array is obvious.
assert R_from_id == R_area == R_adj == R_sim, (
    "Desalinhado: "
    f"region_id={R_from_id}, region_area={R_area}, "
    f"region_adjacency={R_adj}, coarse_region_similarity={R_sim}"
)


In [None]:
%pip uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --no-cache-dir pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
  -f https://data.pyg.org/whl/torch-2.4.0+cu121.html


In [None]:
# Launch the HGI training script in a subprocess on the GPU for city "gowalla"
# (64-dim output, alpha 0.5, 4 attention heads, 300 epochs).
# NOTE(review): the save name says "h3" although the preprocessing cells pass h3=False — confirm the label.
!python {diretorio_principal}/region-embedding/baselines/HGI/train.py --city gowalla --dim 64 --alpha 0.5 --attention_head 4 --epoch 300 --device cuda --save_name gowalla_h3
