In [2]:
import torch
import pandas as pd
import geopandas as gpd
import os
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from shapely import wkt
from tqdm import tqdm
from geopy.distance import great_circle
from sklearn.preprocessing import OneHotEncoder

from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset

from tensorboardX import SummaryWriter
from datetime import datetime

tqdm.pandas()



In [3]:
class MigrationDataset(InMemoryDataset):
    def __init__(self, root, transform=None):
        super(MigrationDataset, self).__init__(root, transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        files = []
        for file in os.listdir(self.root):
            if file.endswith(".geojson") or file.endswith(".csv"):
                files.append(file)
        return files
        
    @property
    def processed_file_names(self):
        return ['migration_dataset']

    def download(self):
        pass
    
    def process(self):
        
        # read files in specified folder
        cities = gpd.read_file(os.path.join(self.root, "cities_aggregated.geojson")).set_index("city")
        responses = gpd.read_file(os.path.join(self.root, "responses_clustered.csv"))


        # extract cities features 
        cities_features = cities[[
            "population",
            "city_category", 
            "harsh_climate", 
            "ueqi_score", 
            "residential", 
            "street_networks", 
            "greens_spaces", 
            "public_and_business_infrastructure", 
            "social_and_leisure_infrastructure",
            "citywide_space"
            ]]
        
        # encode categorical features
        one_hot = OneHotEncoder()
        encoded_category = one_hot.fit_transform(np.expand_dims(cities["city_category"].to_numpy(), 1)).toarray()
        encoded_category_names = one_hot.get_feature_names_out(["category"])
        cities_features.loc[:, encoded_category_names] = encoded_category
        cities_features = cities_features.drop(["city_category"], axis=1)
        cities_features["harsh_climate"] = cities_features["harsh_climate"].astype(int)

        # form distance matrix
        DM = cities["geometry"].progress_apply(
            lambda p1: cities["geometry"].apply(
                lambda p2: great_circle(p1.coords[0], p2.coords[0]).km
                ))

        # form origin-destination matrix

        responses_counts = responses.groupby(["cluster_center_cv", "cluster_center_vacancy"])["id_candidate"].count()
        responses_cities = responses_counts.index.get_level_values(0).drop_duplicates()
        OD = pd.DataFrame(None, index=DM.columns, columns=DM.columns)
        OD = OD.progress_apply(
            lambda city: city.fillna(responses_counts[city.name]).fillna(0) 
            if city.name in responses_cities else city.fillna(0)
            )
        
        # transform data
        
        cities_num = len(OD)
        edge_index = [[], []]
        for i in range(cities_num):
            edge_index[0].extend([i for j in range(cities_num)])
            edge_index[1].extend([j for j in range(cities_num)])

        edge_index = torch.tensor(edge_index)
        y = torch.tensor(np.concatenate((OD.to_numpy())), dtype=torch.float32)
        edge_attr = torch.tensor(np.concatenate((DM.to_numpy())), dtype=torch.float32)
        x = torch.tensor(cities_features.to_numpy(), dtype=torch.float32)

        # exclude diagonal
        non_diagonal = edge_attr > 0
        edge_attr = edge_attr[non_diagonal]
        edge_index = edge_index[:, non_diagonal]
        y = y[non_diagonal]
        
        # create torch object          
        graph = Data(x=x,edge_index=edge_index, y=y, edge_attr=edge_attr)
        
        data_list = []
        data_list.append(graph)
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

Data can be found here: https://disk.yandex.ru/client/disk/%D0%A2%D1%80%D1%83%D0%B4%D0%BE%D0%B2%D1%8B%D0%B5%20%D1%80%D0%B5%D1%81%D1%83%D1%80%D1%81%D1%8B/%D0%A4%D0%B0%D0%B9%D0%BB%D1%8B/ML_experiments/data

In [4]:
dataset = MigrationDataset("/var/essdata/IDU/other/mm_22/industrial-location/ml/data")

Processing...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cities_features.loc[:, encoded_category_names] = encoded_category
100%|██████████| 1106/1106 [01:20<00:00, 13.74it/s]
100%|██████████| 1106/1106 [00:01<00:00, 1063.62it/s]
Done!


In [5]:
dataset.data

Data(x=[1106, 14], edge_index=[2, 1222130], edge_attr=[1222130], y=[1222130])

In [42]:
class FNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        super(FNN, self).__init__()

        self.num_layers = num_layers

        self.lins = nn.ModuleList()
        self.lins.append(nn.Linear(input_dim, hidden_dim))
        for _ in range(self.num_layers - 2):
            self.lins.append(nn.Linear(hidden_dim, hidden_dim))
        self.lins.append(nn.Linear(hidden_dim, 1))

        self.norm = nn.ModuleList()
        for l in range(self.num_layers):
            self.norm.append(nn.LayerNorm(hidden_dim))

        self.dropout = dropout

    def forward(self, data):

        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr.unsqueeze(-1)

        x_s = x[edge_index[0]]
        x_d = x[edge_index[1]]
        y = torch.cat((x_s, x_d, edge_weight), axis=1)
        # y = F.normalize(y)

        for i in range(self.num_layers - 1):
            y = self.lins[i](y) 
            y = nn.functional.leaky_relu(y)
            y = F.dropout(y, p=self.dropout, training=self.training)
            y = self.norm[i](y)

        y = self.lins[-1](y)
        y = torch.relu(y).squeeze()
        
        return y

In [51]:
def r2_loss(output, target):
    target_mean = torch.mean(target)
    ss_tot = torch.sum((target - target_mean) ** 2)
    ss_res = torch.sum((target - output) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2

def train_func(dataset, model, epochs, writer):

    optimize = torch.optim.Adam(list(model.parameters()),  lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimize, factor=0.9, min_lr=0.0001)

    # train
    for epoch in range(epochs + 1):

        optimize.zero_grad()
        model.train()
        y_hat = model(dataset)

        loss = F.mse_loss(y_hat, dataset.y)
        r2 = r2_loss(y_hat, dataset.y)

        loss.backward()
        optimize.step()
        scheduler.step(loss)

        t_metrics = {"train_loss": loss, "train_r2": r2}  
        for name, v_metric in t_metrics.items(): writer.add_scalar(name, v_metric, epoch)

        if epoch % 10 == 0:    
            print(
                "Epoch {}. TRAIN: loss {:.4f}, r2: {:.4f}. lr: {:.4f} ".format(
                    epoch, t_metrics["train_loss"], t_metrics["train_r2"], optimize.param_groups[0]["lr"]
                    )
                ) 

    return model


In [55]:
input_dim = dataset.data.x.shape[1] * 2 + 1
hidden_dim = 128
num_layers = 3
dropout = 0.2

model_v1 = FNN(input_dim, hidden_dim, num_layers, dropout)

In [57]:
datetime_now = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = "./logs/" + datetime_now
writer = SummaryWriter(logdir)

trained_model = train_func(dataset.data, model_v1, 5000, writer)

Epoch 0. TRAIN: loss 393.5331, r2: 0.0009. lr: 0.0010 
Epoch 10. TRAIN: loss 393.4045, r2: 0.0012. lr: 0.0010 
Epoch 20. TRAIN: loss 393.2929, r2: 0.0015. lr: 0.0010 
Epoch 30. TRAIN: loss 393.1610, r2: 0.0018. lr: 0.0010 
Epoch 40. TRAIN: loss 393.0468, r2: 0.0021. lr: 0.0010 
Epoch 50. TRAIN: loss 392.8981, r2: 0.0025. lr: 0.0010 
Epoch 60. TRAIN: loss 392.7978, r2: 0.0027. lr: 0.0010 
Epoch 70. TRAIN: loss 392.6240, r2: 0.0032. lr: 0.0010 
Epoch 80. TRAIN: loss 392.3603, r2: 0.0039. lr: 0.0010 
Epoch 90. TRAIN: loss 392.0068, r2: 0.0047. lr: 0.0010 
Epoch 100. TRAIN: loss 391.6028, r2: 0.0058. lr: 0.0010 
Epoch 110. TRAIN: loss 390.8111, r2: 0.0078. lr: 0.0010 
Epoch 120. TRAIN: loss 390.0086, r2: 0.0098. lr: 0.0010 
Epoch 130. TRAIN: loss 388.6362, r2: 0.0133. lr: 0.0010 
Epoch 140. TRAIN: loss 387.3689, r2: 0.0165. lr: 0.0010 
Epoch 150. TRAIN: loss 385.2827, r2: 0.0218. lr: 0.0010 
Epoch 160. TRAIN: loss 383.3600, r2: 0.0267. lr: 0.0010 
Epoch 170. TRAIN: loss 381.9162, r2: 0.030