In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import networkx as nx

import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.transforms import RandomNodeSplit

In [2]:
class GCN(torch.nn.Module):
    """Stack of GCNConv + BatchNorm1d blocks followed by a GATConv output head.

    Args:
        input_dim: Number of input channels of the first GCNConv layer.
        hidden_dim: Number of channels of every hidden layer.
        num_layers: ``num_layers - 1`` GCNConv layers (and as many BatchNorm1d
            layers) are created, plus one GATConv output layer.
        dropout: Dropout probability applied after each hidden block.
        output_dim: Number of output channels of the GATConv head.
        return_embeds: When true, ``forward`` returns the hidden representation
            and skips the output head and the sigmoid.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout, output_dim=1, return_embeds=False):
        super().__init__()
        hidden_count = num_layers - 1
        # The first layer maps input_dim -> hidden_dim, the rest hidden_dim -> hidden_dim.
        self.convs = torch.nn.ModuleList([
            GCNConv(input_dim if i == 0 else hidden_dim, hidden_dim)
            for i in range(hidden_count)
        ])
        self.out_conv = GATConv(hidden_dim, output_dim, heads=1)
        self.bns = torch.nn.ModuleList([torch.nn.BatchNorm1d(hidden_dim) for _ in range(hidden_count)])
        self.sigmoid = nn.Sigmoid()
        self.dropout = dropout
        self.return_embeds = return_embeds

    def reset_parameters(self):
        """Re-initialise every learnable sub-module, including the output head."""
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()
        # The output head was previously left out, so it kept its old weights
        # across resets.
        self.out_conv.reset_parameters()

    def forward(self, x, adj_t):
        """Run the hidden blocks and, unless ``return_embeds`` is set, the head.

        Args:
            x: Node feature tensor passed to the first convolution.
            adj_t: Graph connectivity forwarded unchanged to every convolution.

        Returns:
            The hidden representation when ``self.return_embeds`` is true,
            otherwise the sigmoid of the GATConv head output.
        """
        out = x
        # NOTE(review): len(self.convs) is already ``num_layers - 1``, so the
        # loop below applies only the first ``len(self.convs) - 1`` blocks and
        # the last GCNConv/BatchNorm1d pair is never used. This looks like an
        # off-by-one, but it is kept as-is because saved checkpoints may have
        # been trained with exactly this forward pass - confirm before changing.
        num_layers = len(self.convs)
        for i in range(num_layers - 1):
            out = self.convs[i](out, adj_t)
            out = self.bns[i](out)
            out = F.relu(out)
            out = F.dropout(out, p=self.dropout, training=self.training)

        if not self.return_embeds:
            out = self.out_conv(out, adj_t)
            out = self.sigmoid(out)

        return out

In [3]:
# Positional arguments map to input_dim=200, hidden_dim=128, num_layers=3,
# dropout=0.2; load_state_dict requires them to match the saved checkpoint.
model = GCN(200, 128, 3, 0.2)
# map_location="cpu" lets a checkpoint that was saved from a GPU run load on a
# CPU-only host; the values are copied into the freshly built model either way.
model.load_state_dict(torch.load("best_model_1.pth", map_location="cpu"))
# eval() puts BatchNorm/dropout into inference mode and returns the module,
# so the cell also displays the architecture.
model.eval()

GCN(
  (convs): ModuleList(
    (0): GCNConv(200, 128)
    (1): GCNConv(128, 128)
  )
  (out_conv): GATConv(128, 1, heads=1)
  (bns): ModuleList(
    (0): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (sigmoid): Sigmoid()
)

In [4]:
import numpy as np
# Lookup array indexed by node position; later cells index it with node
# indices to fill the CLIENT_ID column of the exported feature tables.
idx_to_client = np.load('idx_to_client.npy')

In [5]:
# Display the lookup array for a quick visual check.
idx_to_client

array([1000100, 1000121, 1000131, ..., 8388605, 8388606, 8388607])

In [6]:
# Directory that holds the exported graph arrays.
GRAPH_DIR = '/home/docker_current/datasets/graph'
# Integer code substituted for the string label 'test' before casting to int.
TEST_LABEL = 2


def _load_graph_array(file_name):
    """Load one ``.npy`` file from ``GRAPH_DIR``.

    SECURITY: ``allow_pickle=True`` unpickles object arrays, which can execute
    arbitrary code. Only use it on files from a trusted source.
    """
    return np.load(f'{GRAPH_DIR}/{file_name}', allow_pickle=True)


x_all = _load_graph_array('nodes.npy')
# The 'lables' spelling is kept because it is the file name this notebook reads.
y_all = _load_graph_array('lables.npy')
date = _load_graph_array('dates.npy').astype(int)
edges = _load_graph_array('edges.npy')
weights = _load_graph_array('weights.npy')

# Replace the string marker with an integer code so the whole array can be
# cast to int.
y_all[y_all == 'test'] = TEST_LABEL
y_all = y_all.astype(int)

x = torch.tensor(x_all).float()
y = torch.tensor(y_all)
edges = torch.tensor(edges)

date = torch.tensor(date)
weights = torch.tensor(weights)

# `weights` is stored as a custom attribute on the Data object; `date` is not
# attached to it.
data = Data(x=x, edge_index=edges, y=y, weights=weights)

In [7]:
# Embedding mode: forward() returns the hidden representation and skips the
# output head.
model.return_embeds = True
# Inference only: no_grad avoids building an autograd graph over the whole
# graph, which the previous .detach() merely discarded afterwards. The result
# is the same tensor of values with requires_grad=False.
with torch.no_grad():
    embeds = model(data.x, data.edge_index)

In [9]:
from sklearn.model_selection import train_test_split

# Build the mask once, as a NumPy array, instead of evaluating the same torch
# expression twice and relying on implicit tensor -> ndarray conversion when
# indexing. Nodes labelled -1 or 2 are excluded (2 is the code assigned to
# 'test' labels when the data was loaded).
labels_np = data.y.cpu().numpy()
labelled_mask = (labels_np != -1) & (labels_np != 2)

idx = np.arange(len(data.x))[labelled_mask]
# Stratified 80/20 split of the remaining node indices, with a fixed seed.
train_idx, val_idx = train_test_split(
    idx,
    test_size=0.2,
    stratify=labels_np[labelled_mask],
    random_state=42,
)

data.train_idx = torch.tensor(train_idx)
data.val_idx = torch.tensor(val_idx)

In [13]:
# Pull the embedding rows and labels for each split out as NumPy arrays.
train_rows = data.train_idx
val_rows = data.val_idx

X_train, y_train = (t[train_rows].cpu().numpy() for t in (embeds, data.y))
X_val, y_val = (t[val_rows].cpu().numpy() for t in (embeds, data.y))

In [14]:
# Sanity check: row counts of the two splits and the embedding width.
X_train.shape, X_val.shape

((101944, 128), (25486, 128))

In [15]:
# Re-join the two splits, train rows first, keeping features, labels and node
# indices aligned row by row.
X = np.concatenate([X_train, X_val], axis=0)
y = np.concatenate([y_train, y_val])
seq_idx = np.concatenate([train_idx, val_idx])

In [16]:
# Sanity check: all three arrays must have the same number of rows.
seq_idx.shape, y.shape, X.shape

((127430,), (127430,), (127430, 128))

In [17]:
import pandas as pd
# One row per labelled node: the embedding columns followed by the client id
# looked up from the node index.
labelled_client_ids = idx_to_client[seq_idx]
new_feat_data = pd.DataFrame(X).assign(CLIENT_ID=labelled_client_ids)
new_feat_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,CLIENT_ID
0,0.217884,0.385625,0.000000,0.323903,0.512439,0.000000,0.173049,0.200288,0.000000,0.516379,...,0.419873,0.000000,0.283245,0.000000,0.026198,0.362399,0.000000,0.356571,0.000000,1534508
1,0.392155,0.191459,0.264613,0.490732,0.318962,0.207046,0.491928,0.448652,0.285401,0.017647,...,0.509867,0.301533,0.450824,0.327464,0.271524,0.515282,0.000000,0.421111,0.120646,1731420
2,0.546014,0.336280,0.000000,0.314020,0.000000,0.339958,0.028114,0.301742,0.000000,0.272717,...,0.264353,0.000000,0.284783,0.000000,0.112499,0.355740,0.000000,0.393993,0.000000,969573
3,0.440921,0.111825,0.096268,0.460961,0.196523,0.053137,0.449275,0.449609,0.069790,0.000000,...,0.537326,0.089619,0.386800,0.203013,0.094064,0.523410,0.000000,0.388394,0.000000,1445377
4,0.403699,0.000000,0.000000,0.263136,0.355017,0.000000,0.000000,0.381226,0.000000,0.000000,...,0.363528,0.000000,0.000000,0.000000,0.000000,0.339976,0.000000,0.446807,0.000000,1540716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127425,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.342216,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.458094,0.000000,0.793250,58597
127426,0.230501,0.255480,0.000000,0.546097,1.098971,0.108305,0.029350,0.332598,0.000000,0.000000,...,0.373306,0.000000,0.424288,0.000000,0.000000,0.430083,0.018732,0.025373,0.000000,1549483
127427,0.463780,0.000000,0.078355,0.465118,0.134754,0.000000,0.430563,0.461291,0.078295,0.000000,...,0.502730,0.015206,0.387764,0.118504,0.171899,0.486032,0.000000,0.435273,0.000000,1638883
127428,0.444100,0.000000,0.000000,0.445084,0.000000,0.000000,0.297551,0.657651,0.000000,0.000000,...,0.426647,0.000000,0.215592,0.000000,0.000000,0.372828,0.000000,0.087852,0.000000,1603216


In [18]:
# Range of client ids in the labelled table. The vectorised Series reductions
# replace the builtin min()/max(), which iterate the array element by element.
client_id_col = new_feat_data['CLIENT_ID']
client_id_col.min(), client_id_col.max()

(7150, 1770038)

In [19]:
# Embeddings of the nodes whose label code is 2 (the code assigned to 'test').
is_test_node = data.y == 2
test_embeds = embeds[is_test_node].cpu().numpy()
test_embeds.shape

(31858, 128)

In [20]:
# Read both tab-separated tables and inner-join them; no join keys are given,
# so pandas joins on the columns the two frames have in common.
features_df = pd.read_csv('/home/docker_current/datasets/VK/FINAL_FEATURES_TRAINTEST.tsv', sep='\t')
targets_dates_df = pd.read_csv('/home/docker_current/datasets/VK/FINAL_TARGETS_DATES_TRAINTEST.tsv', sep='\t')
traintest_df = targets_dates_df.merge(features_df, how='inner')
# Drop the two source frames once merged.
del targets_dates_df, features_df

# Client ids of the rows marked as test.
is_test_row = traintest_df['TARGET'] == 'test'
test_id = traintest_df.loc[is_test_row, 'CLIENT_ID'].values

In [28]:
# Positions of the nodes with label code 2, in ascending order - the same
# order in which the boolean mask `data.y == 2` selected rows for test_embeds,
# so the two stay aligned row by row.
test_idx = np.where(data.y == 2)[0]

In [29]:
# Same layout as the labelled table: embedding columns plus the client id
# looked up from each test node's index.
test_client_ids = idx_to_client[test_idx]
test_features_df = pd.DataFrame(test_embeds).assign(CLIENT_ID=test_client_ids)
test_features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,CLIENT_ID
0,0.000000,0.000000,0.000000,0.199516,0.000000,0.000000,0.416514,0.611709,0.000000,0.000000,...,0.471383,0.000000,0.228237,0.000000,0.023097,0.205599,0.000000,0.403484,0.000000,1000121
1,0.000000,0.091477,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.647987,...,0.000000,0.000000,0.000000,0.883155,0.000000,0.000000,2.839014,0.000000,0.541635,100013
2,0.356484,0.212728,0.054907,0.364588,0.000000,0.128383,0.489786,0.229836,0.166434,0.015713,...,0.470482,0.154295,0.376930,0.241890,0.119272,0.424511,0.000000,0.388609,0.000000,1001420
3,1.363400,0.000000,0.000000,0.051902,0.000000,0.000000,0.000000,0.468849,0.000000,0.000000,...,0.412347,0.000000,0.000000,0.338456,0.000000,0.000000,0.000000,0.136588,0.000000,1001693
4,0.000000,0.000000,0.000000,0.000000,0.532344,0.000000,0.000000,0.000000,0.000000,0.285709,...,0.000000,0.000000,0.000000,0.282031,0.000000,0.000000,2.069549,0.000000,0.010304,100198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31853,1.370202,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.772140,0.000000,0.000000,1.968021,0.000000,0.000000,0.000000,0.000000,0.000000,1602155
31854,0.392547,0.192341,0.268089,0.492769,0.322955,0.210823,0.496445,0.451485,0.292918,0.017743,...,0.509743,0.305900,0.456037,0.333279,0.275263,0.518133,0.000000,0.423737,0.122730,1602526
31855,0.392547,0.192341,0.268089,0.492769,0.322955,0.210823,0.496445,0.451485,0.292918,0.017743,...,0.509743,0.305900,0.456037,0.333279,0.275263,0.518133,0.000000,0.423737,0.122730,1602561
31856,0.176861,0.000000,0.416378,0.467534,0.012447,0.305624,0.233324,0.448859,0.015269,0.000000,...,0.517360,0.000000,0.712943,0.000000,0.274945,0.790878,0.000000,0.432779,0.346384,1488997


In [30]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# gives the same result: labelled rows first, then test rows, with each
# frame's original index kept.
GNN_feat_df = pd.concat([new_feat_data, test_features_df])

In [31]:
# Persist the combined table; index=False leaves the row index out of the file.
GNN_feat_df.to_csv('GNN_feat_1.csv', index=False)

In [32]:
# Display the combined table for a final visual check.
GNN_feat_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,CLIENT_ID
0,0.217884,0.385625,0.000000,0.323903,0.512439,0.000000,0.173049,0.200288,0.000000,0.516379,...,0.419873,0.000000,0.283245,0.000000,0.026198,0.362399,0.0,0.356571,0.000000,1534508
1,0.392155,0.191459,0.264613,0.490732,0.318962,0.207046,0.491928,0.448652,0.285401,0.017647,...,0.509867,0.301533,0.450824,0.327464,0.271524,0.515282,0.0,0.421111,0.120646,1731420
2,0.546014,0.336280,0.000000,0.314020,0.000000,0.339958,0.028114,0.301742,0.000000,0.272717,...,0.264353,0.000000,0.284783,0.000000,0.112499,0.355740,0.0,0.393993,0.000000,969573
3,0.440921,0.111825,0.096268,0.460961,0.196523,0.053137,0.449275,0.449609,0.069790,0.000000,...,0.537326,0.089619,0.386800,0.203013,0.094064,0.523410,0.0,0.388394,0.000000,1445377
4,0.403699,0.000000,0.000000,0.263136,0.355017,0.000000,0.000000,0.381226,0.000000,0.000000,...,0.363528,0.000000,0.000000,0.000000,0.000000,0.339976,0.0,0.446807,0.000000,1540716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31853,1.370202,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.772140,0.000000,0.000000,1.968021,0.000000,0.000000,0.0,0.000000,0.000000,1602155
31854,0.392547,0.192341,0.268089,0.492769,0.322955,0.210823,0.496445,0.451485,0.292918,0.017743,...,0.509743,0.305900,0.456037,0.333279,0.275263,0.518133,0.0,0.423737,0.122730,1602526
31855,0.392547,0.192341,0.268089,0.492769,0.322955,0.210823,0.496445,0.451485,0.292918,0.017743,...,0.509743,0.305900,0.456037,0.333279,0.275263,0.518133,0.0,0.423737,0.122730,1602561
31856,0.176861,0.000000,0.416378,0.467534,0.012447,0.305624,0.233324,0.448859,0.015269,0.000000,...,0.517360,0.000000,0.712943,0.000000,0.274945,0.790878,0.0,0.432779,0.346384,1488997
