In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from tqdm import tqdm
import pickle as pkl
import pandas as pd
import random
import sys
import json
import os
from sklearn.metrics import f1_score

In [2]:
random.seed(0)
np.random.seed(0)

In [163]:
config = {
    #embedding computation
    'cleora_n_iter': 15, # 5, 10, 25, 50
    'cleora_dim': 4096, # 512, 1024, 2048, 4096
    
    #dataset preparation
    'train_test_split': 0.2,
    
    #training classification
    'input_embeddings': [
                    'output/emb__cluster_id__StarNode.out',
                    'output/emb__CliqueNode__CliqueNode.out',
                   ],
    'batch_size': 256,
    'test_batch_size': 1000,
    'epochs': [20],
}

# DATASET Deezer Europe

In [164]:
df_cleora = pd.read_csv("./deezer_europe/deezer_europe_edges.csv")

In [165]:
df_cleora.head()

Unnamed: 0,node_1,node_2
0,0,14270
1,0,16976
2,0,12029
3,0,3001
4,0,14581


In [166]:
train_cleora, test_cleora = train_test_split(df_cleora, test_size=config['train_test_split'])

In [167]:
deezer_europe_cleora_input_clique_filename = "deezer_europe_cleora_input_clique.txt"
deezer_europe_cleora_input_star_filename = "deezer_europe_cleora_input_star.txt"
output_dir = 'output'

In [168]:
with open(deezer_europe_cleora_input_clique_filename, "w") as f_cleora_clique, open(deezer_europe_cleora_input_star_filename, "w") as f_cleora_star:
    grouped_train = train_cleora.groupby('node_1')
    for n, (name, group) in enumerate(grouped_train):
        group_list = group['node_2'].tolist()
        group_elems = list(map(str, group_list))
        f_cleora_clique.write("{} {}\n".format(name, ' '.join(group_elems)))
        f_cleora_star.write("{}\t{}\n".format(n, name))
        for elem in group_elems:
            f_cleora_star.write("{}\t{}\n".format(n, elem))

In [169]:
df = pd.read_csv("./deezer_europe/deezer_europe_target.csv")

In [170]:
df

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,1
4,4,0
...,...,...
28276,28276,0
28277,28277,1
28278,28278,0
28279,28279,1


In [171]:
classes = df['target'].unique()
class_ids = list(range(0, len(classes)))
class_dict = {k:v for k,v in zip(classes, class_ids)}
df['target'] = [class_dict[item] for item in df['target']] 

In [172]:
train_filename = "deezer_europe_classification_train.txt"
test_filename = "deezer_europe_classification_test.txt"

In [173]:
train, test = train_test_split(df, test_size=config['train_test_split'])

In [174]:
with open(train_filename, "w") as f_train:
    for index, row in train.iterrows():
        f_train.write("{} {}\n".format(row['id'], row['target']))

In [175]:
with open(test_filename, "w") as f_test:
    for index, row in test.iterrows():
        f_test.write("{} {}\n".format(row['id'], row['target']))

# Cleora training

In [176]:
import subprocess


def columns2output_filename(output_dir, columns):
    columns_split = columns.split()
    if len(columns_split) == 1 and 'reflexive' in columns:
        column_name = columns.split('::')[-1]
        return os.path.join(output_dir, f'emb__{column_name}__{column_name}.out')

    column_names = [i.split('::')[-1] for i in columns_split]
    return os.path.join(output_dir, 'emb__' + '__'.join(column_names) + '.out')


def train_cleora(dim, n_iter, columns, input_filename, output_dir):
    command = ['./cleora-v1.2.3-x86_64-pc-windows-msvc',
                '--columns', columns,
                '--dimension', str(dim), 
                '-n', str(n_iter), 
                '--input', input_filename, 
                '-o', output_dir]
    subprocess.run(command, check=True)
    return columns2output_filename(output_dir, columns)

## Star expansion

In [177]:
%%time
cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], "transient::cluster_id StarNode", deezer_europe_cleora_input_star_filename, output_dir)

CPU times: total: 15.6 ms
Wall time: 17.4 s


## Clique expansion

In [178]:
%%time
cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], "complex::reflexive::CliqueNode", deezer_europe_cleora_input_clique_filename, output_dir)

CPU times: total: 15.6 ms
Wall time: 17.1 s


# Classification

In [179]:
def read_embeddings(input_file):
    df_full = pd.read_csv(input_file, delimiter = " ", skiprows=[0], header=None, 
                     index_col=0)
    df_full = df_full.drop([1], axis=1)

    return df_full

In [180]:
def read_train_test(embeddings):
    valid_idx = embeddings.index.to_numpy()
    
    train = np.loadtxt(train_filename, delimiter=" ", dtype=np.int64) 
    test = np.loadtxt(test_filename, delimiter=" ", dtype=np.int64)
    
    train = train[np.isin(train[:,0], valid_idx) & np.isin(train[:,1], valid_idx)]
    test = [t for t in test if (t[0] in valid_idx) and (t[1] in valid_idx)] 
     
    train = np.array(train)
    test = np.array(test)
    
    return train,test

In [181]:
batch_size = config['batch_size']
test_batch_size = config['test_batch_size']

In [182]:
for algo in config['input_embeddings']:
    embeddings = read_embeddings(algo)
    train,test = read_train_test(embeddings)
                                 
    y_train = train[:, 1]
    y_test = test[:, 1]

    clf = SGDClassifier(random_state=0, loss='log', alpha=0.0001)
    for e in tqdm(range(0, max(config['epochs']))):
        for idx in range(0,train.shape[0],batch_size):
            ex=train[idx:min(idx+batch_size,train.shape[0]),:]

            ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()
            ex_y = y_train[idx:min(idx+batch_size,train.shape[0])]
    
            clf.partial_fit(ex_emb_in, ex_y, classes=[0,1,2,3])
        
        if e+1 in config['epochs']:
            acc = 0.0
            y_pred = []
            for n, idx in enumerate(range(0,test.shape[0],test_batch_size)):
                ex=test[idx:min(idx+test_batch_size,train.shape[0]),:]
                ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()
                pred = clf.predict_proba(ex_emb_in)
    
                classes = np.argmax(pred, axis=1)
                y_pred.extend(classes)

            f1_micro = f1_score(y_test, y_pred, average='micro')
            f1_macro = f1_score(y_test, y_pred, average='macro')
            print('algo: {} epochs: {}, micro f1: {}, macro f1:{}'.format(algo, e+1, f1_micro, f1_macro))


100%|██████████| 20/20 [00:44<00:00,  2.21s/it]

algo: output/emb__cluster_id__StarNode.out epochs: 20, micro f1: 0.5694009216589861, macro f1:0.46429302133528033



100%|██████████| 20/20 [00:45<00:00,  2.28s/it]

algo: output/emb__CliqueNode__CliqueNode.out epochs: 20, micro f1: 0.55963133640553, macro f1:0.4349924303526836





## Results

| cleora_n_iter | cleora_dim | star micro f1 | star macro f1 | clique micro f1 | clique macro f1 |
|---------------|------------|---------------|---------------|-----------------|-----------------|
| 5             | 512        | 0.56          | 0.53          | 0.54            | 0.51            |
| 5             | 1024       | 0.55          | 0.53          | 0.55            | 0.52            |
| 10            | 1024       | 0.57          | 0.50          | **0.57**        | 0.48            |
| 10            | 2048       | 0.57          | 0.54          | **0.57**        | **0.53**        |
| 15            | 2048       | **0.58**      | **0.57**      | **0.57**        | 0.51            |
| 15            | 4096       | 0.57          | 0.46          | 0.56            | 0.43            |