In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from collections import defaultdict

from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn

import torch_geometric
from torch_geometric.data import Data

sns.set(font_scale=1.3)

In [2]:
targets = pd.read_csv('./data/FINAL_TARGETS_DATES_TRAINTEST.tsv', sep='\t')
compressed = pd.read_csv('./data/compressed.csv').drop(columns=['Unnamed: 0'])
features = pd.read_csv('./data/FINAL_FEATURES_TRAINTEST.tsv', sep='\t')[compressed.columns[compressed.columns != 'FRIEND_ID']]
all_sequences = pd.read_csv('./data/FINAL_ALL_SEQUENCES_TRAINTEST.tsv', sep='\t')

users_data = targets.merge(features, on='CLIENT_ID', how='inner')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(binary=True)
data = vec.fit_transform(all_sequences.SEQUENCE)

In [None]:
vals = []
new_rows = []
new_cols = []
for i in tqdm(range(data.shape[0] - 1)):
    if (i + 1) % 1000 == 0:''
        print(len(np.concatenate(vals)))

    res = data[i].multiply(data)[i + 1:].sum(axis=1)
    j = np.where(res > 40)[0]
    vals.append(res[j])
    new_rows.append([i] * len(j))
    new_cols.append(j + i + 1)

In [None]:
new_rows_stacked = np.concatenate(new_rows)
new_cols_stacked = np.concatenate(new_cols)
weights = np.concatenate(vals)

In [None]:
edges = np.stack((new_rows_stacked, new_cols_stacked), axis=0)
edges.shape

In [None]:
np.save('edges.npy', edges)
np.save('weights.npy', weights)

In [3]:
friends = compressed.iloc[:, 1:].rename(columns={'FRIEND_ID': 'CLIENT_ID'})

In [4]:
users_data = users_data.append(friends)

In [5]:
users_data['CLIENT_ID'] = users_data['CLIENT_ID'].astype(int)

In [6]:
rest_id = list(set(users_data['CLIENT_ID'].values).difference(set(all_sequences['CLIENT_ID'].values)))
idx_to_client = list(all_sequences['CLIENT_ID'].values) + rest_id
client_to_idx = {v: i for i, v in enumerate(idx_to_client)}

In [10]:
import pickle

with open('idx_to_client.pickle', 'wb') as handle:
    pickle.dump(idx_to_client, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('idx_to_client.pickle', 'rb') as handle:
    idx_to_client = pickle.load(handle)

In [14]:
np.save('idx_to_client.npy', idx_to_client)

In [29]:
users_data['CLIENT_ID'] = users_data['CLIENT_ID'].apply(lambda x: client_to_idx[x])
users_data['TARGET'] = users_data['TARGET'].fillna(-1)
users_data['RETRO_DT'] = users_data['RETRO_DT'].fillna(-1)

In [30]:
users_data = users_data.sort_values(by='CLIENT_ID')

In [31]:
x = users_data.drop(columns=['CLIENT_ID', 'TARGET', 'RETRO_DT']).values
y = users_data['TARGET'].values
date = users_data['RETRO_DT'].values

In [32]:
new_edges = compressed[['CLIENT_ID', 'FRIEND_ID']].copy()
new_edges['CLIENT_ID'] = new_edges['CLIENT_ID'].apply(lambda x: client_to_idx[x])
new_edges['FRIEND_ID'] = new_edges['FRIEND_ID'].apply(lambda x: client_to_idx[x])
new_edges = new_edges.values.T

In [33]:
old_edges = np.load('./edges.npy').astype(int)
old_weights = np.load('./weights.npy')[:, 0]

In [34]:
edges = np.concatenate((old_edges, new_edges), axis=1)
edges = np.concatenate((edges, edges[::-1]), axis=1)
weights = np.concatenate((old_weights, np.ones(edges.shape[1] - old_edges.shape[1])), axis=0)
weights = np.concatenate((weights, weights), axis=0)

In [35]:
users_data.shape

(1466558, 203)

In [36]:
np.save('./graph/nodes.npy', x)
np.save('./graph/lables.npy', y)
np.save('./graph/dates.npy', date)
np.save('./graph/edges.npy', edges)
np.save('./graph/weights.npy', weights)

In [37]:
x = np.load('./graph/nodes.npy', allow_pickle=True)
y = np.load('./graph/lables.npy', allow_pickle=True)
date = np.load('./graph/dates.npy', allow_pickle=True)
edges = np.load('./graph/edges.npy', allow_pickle=True)
weights = np.load('./graph/weights.npy', allow_pickle=True)

In [38]:
x.shape, y.shape, date.shape, edges.shape, weights.shape

((1466558, 200), (1466558,), (1466558,), (2, 3014878), (6029756,))