In [2]:
import json
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
def get_metrics(y_pred, y_test):
    """Return accuracy and false-negative rate for a set of predictions.

    Any non-zero label is treated as "positive" for the false-negative rate,
    so multi-valued labels are collapsed to positive-vs-zero there, while
    accuracy still requires an exact label match.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_test : array-like
        Ground-truth labels, same shape as ``y_pred``.

    Returns
    -------
    tuple of (float, float)
        ``(accuracy, fn / (fn + tp))``. The second value is ``nan`` when the
        ground truth contains no positive sample (the ratio is undefined
        there; the previous implementation raised ``ZeroDivisionError``).
        Both values are ``nan`` for empty input.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``y_test`` do not have the same shape.
    """
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)
    if y_pred.shape != y_test.shape:
        raise ValueError(
            f"shape mismatch: y_pred {y_pred.shape} vs y_test {y_test.shape}"
        )
    if y_pred.size == 0:
        return float('nan'), float('nan')

    acc = float(np.mean(y_pred == y_test))

    # Only ground-truth positives contribute to tp / fn.
    is_positive = y_test != 0
    tp = int(np.count_nonzero(is_positive & (y_pred != 0)))
    fn = int(np.count_nonzero(is_positive & (y_pred == 0)))

    n_positive = fn + tp
    fnr = fn / n_positive if n_positive else float('nan')
    return acc, fnr

In [121]:
# node/edge level
# Each line of the JSONL file is one graph: a mapping whose values are mappings
# whose values are per-edge feature dicts carrying a 'label' key.
# The result is one flat sample per edge across all graphs:
#   edges[k]       -> the edge's feature values (every key except 'label', in stored order)
#   labels[k]      -> the edge's 'label' value
#   graph_index[k] -> zero-based index of the graph (file line) the edge came from
# The context manager closes the file handle, and 'label' is read without pop()
# so the parsed `data` is left unmodified.
with open('./data/graphs/graphs.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

edges = []
labels = []
graph_index = []
for i, graph in enumerate(data):
    for dicts in graph.values():
        for features in dicts.values():
            labels.append(features['label'])
            edges.append([v for k, v in features.items() if k != 'label'])
            graph_index.append(i)

In [122]:
# Fraction of edges whose label is non-zero (the positive rate).
np.mean(np.array(labels) != 0)

0.02311320297271349

In [5]:
# Edge-level hold-out split, stratified on the edge labels so both sides keep
# the same label proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    edges,
    labels,
    random_state=1,
    stratify=labels,
)
# Disabled: further split of the hold-out set into validation / test halves.
# X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, stratify=y_test, random_state=1, train_size=0.5)

In [36]:
# MLP baseline on the edge-level training split: early stopping enabled with
# n_iter_no_change=5, at most max_iter=300 iterations, fixed seed.
# NOTE: each classifier cell in this section rebinds `clf`, so the evaluation
# cell scores whichever of them was executed last.
clf = MLPClassifier(
    early_stopping=True,
    n_iter_no_change=5,
    max_iter=300,
    random_state=1,
)
clf.fit(X=X_train, y=y_train)


In [6]:
# Decision-tree baseline on the edge-level training split (fixed seed).
# NOTE: rebinds `clf`; the evaluation cell uses the last classifier that was fit.
clf = DecisionTreeClassifier(
    random_state=1,
)
clf.fit(X=X_train, y=y_train)

In [10]:
# SVM baseline on the edge-level training split.
# max_iter=300 puts a hard cap on solver iterations.
# NOTE(review): presumably the score recorded after this cell comes from this
# model; such a low accuracy would be consistent with a solver stopped before
# convergence on unscaled features — confirm before relying on it.
# NOTE: rebinds `clf`; the evaluation cell uses the last classifier that was fit.
clf = SVC(
    max_iter=300,
    random_state=1,
)
clf.fit(X=X_train, y=y_train)



In [11]:
# Score the most recently fitted `clf` on the held-out edges; the cell displays
# the (accuracy, fn / (fn + tp)) pair returned by get_metrics.
y_pred = clf.predict(X=X_test)
get_metrics(y_pred=y_pred, y_test=y_test)

(0.06436117253061277, 0.07468531468531468)

In [123]:
#graph level
# Reload the graphs and keep the edges grouped per graph:
#   graphs[i]       -> list of edge feature lists for graph i ('label' excluded)
#   graph_labels[i] -> list of the 'label' values of those edges, same order
#   graph_index[k]  -> zero-based graph index of the k-th edge overall
# The context manager closes the file handle, and 'label' is read without pop()
# so the parsed `data` is left unmodified.
# NOTE: `edges` and `labels` are rebound per graph here, so after this cell they
# no longer hold the flat edge-level lists built earlier in the notebook.
with open('./data/graphs/graphs.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

graphs = []
graph_labels = []

graph_index = []
for i, graph in enumerate(data):
    edges = []
    labels = []
    for dicts in graph.values():
        for features in dicts.values():
            labels.append(features['label'])
            edges.append([v for k, v in features.items() if k != 'label'])
            graph_index.append(i)
    graphs.append(edges)
    graph_labels.append(labels)

In [130]:
# Fraction of graphs whose edge labels sum to a non-zero value
# (i.e. graphs counted as positive).
g_l = list(map(sum, graph_labels))
np.mean(np.array(g_l) != 0)

0.3756

In [13]:
# Split whole graphs, so every edge of a graph lands on the same side.
X_train, X_test, y_train, y_test = train_test_split(
    graphs,
    graph_labels,
    random_state=1,
)
# Training is per edge: collapse the list of graphs into flat edge / label lists.
# X_test and y_test stay grouped per graph for the graph-level scoring.
X_train = [edge for graph_edges in X_train for edge in graph_edges]
y_train = [label for graph_edge_labels in y_train for label in graph_edge_labels]

In [36]:
# MLP trained on the flattened edges of the training graphs: early stopping
# enabled with n_iter_no_change=5, at most max_iter=300 iterations, fixed seed.
# NOTE: rebinds `clf`; the graph-level evaluation uses the last classifier fit.
clf = MLPClassifier(
    early_stopping=True,
    n_iter_no_change=5,
    max_iter=300,
    random_state=1,
)
clf.fit(X=X_train, y=y_train)

In [39]:
# Decision tree trained on the flattened edges of the training graphs.
# NOTE: rebinds `clf`; the graph-level evaluation uses the last classifier fit.
clf = DecisionTreeClassifier(
    random_state=1,
)
clf.fit(X=X_train, y=y_train)

In [17]:
# SVM trained on the flattened edges of the training graphs.
# max_iter=300 puts a hard cap on solver iterations, so the fit may stop
# before convergence.
# NOTE: rebinds `clf`; the graph-level evaluation uses the last classifier fit.
clf = SVC(
    max_iter=300,
    random_state=1,
)
clf.fit(X=X_train, y=y_train)



In [40]:
# Graph-level aggregation: a graph is predicted positive when the sum of its
# per-edge predictions is greater than zero, and is truly positive when the sum
# of its edge labels is greater than zero.
preds, trues = [], []
for graph_edges, graph_edge_labels in zip(X_test, y_test):
    edge_preds = clf.predict(graph_edges)
    predicted_positive = np.sum(edge_preds) > 0
    preds.append(predicted_positive.astype(int))

    truly_positive = np.sum(np.array(graph_edge_labels)) > 0
    trues.append(truly_positive.astype(int))

In [41]:
# Graph-level (accuracy, fn / (fn + tp)) for the aggregated predictions.
get_metrics(y_pred=preds, y_test=trues)

(0.6752, 0.3311965811965812)

## Dynamic graphs

In [131]:
# Load the dynamic-graph dump. The `with` block already closes the file on
# exit, so no explicit close() is needed; JSON is decoded as UTF-8 regardless
# of the platform's default encoding.
with open('./data/graphs/rnn_g.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [132]:
# Pull the two parallel top-level entries out of the loaded JSON.
# NOTE: this rebinds `labels`, which earlier cells used for edge labels.
clusters = data['clusters']
labels = data['labels']

In [133]:
# Node-level samples for the dynamic graphs: walk every sequence and every step
# in it, flatten the step's nested cluster ids and labels, and keep the
# (cluster id, label) pairs whose cluster id is not -1.
# NOTE(review): -1 presumably marks padding / unassigned entries — confirm.
all_clusters = []
all_labels = []
for seq_clusters, seq_labels in zip(clusters, labels):
    for step_clusters, step_labels in zip(seq_clusters, seq_labels):
        flat_ids = (cid for row in step_clusters for cid in row)
        flat_labs = (lab for row in step_labels for lab in row)
        for cluster_id, node_label in zip(flat_ids, flat_labs):
            if cluster_id != -1:
                all_clusters.append(cluster_id)
                all_labels.append(node_label)

In [134]:
# Convert the collected samples to arrays and one-hot encode the cluster ids:
# row k of the identity matrix is the one-hot vector for cluster id k, so
# indexing it with the id array yields a (n_samples, max_id + 1) float matrix.
all_clusters = np.array(all_clusters)
all_labels = np.array(all_labels)
all_clusters_one_hot = np.eye(all_clusters.max() + 1)[all_clusters]

In [136]:
# Fraction of kept samples whose label is non-zero (the positive rate).
np.mean(np.array(all_labels) != 0)

0.015251283498333901

In [52]:
# Hold-out split of the one-hot encoded samples (fixed seed, not stratified).
X_train, X_test, y_train, y_test = train_test_split(
    all_clusters_one_hot,
    all_labels,
    random_state=1,
)


In [50]:
# MLP on the one-hot cluster features: early stopping enabled with
# n_iter_no_change=5, at most max_iter=300 iterations, fixed seed.
# NOTE: rebinds `clf`; the evaluation cell uses the last classifier that was fit.
clf = MLPClassifier(
    early_stopping=True,
    n_iter_no_change=5,
    max_iter=300,
    random_state=1,
)
clf.fit(X=X_train, y=y_train)

In [58]:
# Decision tree on the one-hot cluster features (fixed seed).
# NOTE: rebinds `clf`; the evaluation cell uses the last classifier that was fit.
clf = DecisionTreeClassifier(
    random_state=1,
)
clf.fit(X=X_train, y=y_train)

In [60]:
# SVM on the one-hot cluster features.
# max_iter=300 puts a hard cap on solver iterations, so the fit may stop
# before convergence.
# NOTE: rebinds `clf`; the evaluation cell uses the last classifier that was fit.
clf = SVC(
    max_iter=300,
    random_state=1,
)
clf.fit(X=X_train, y=y_train)



In [61]:
# Score the most recently fitted `clf` on the held-out one-hot samples; the
# cell displays the (accuracy, fn / (fn + tp)) pair returned by get_metrics.
y_pred = clf.predict(X=X_test)
get_metrics(y_pred=y_pred, y_test=y_test)

(0.7616783880215315, 0.1119484623799684)

In [137]:
# graph level
# Same filtering as the node-level pass (entries with cluster id -1 are
# dropped), but the kept ids / labels are grouped per top-level sequence:
#   all_clusters[i] -> kept cluster ids of sequence i, over all of its steps
#   all_labels[i]   -> the matching labels, same order
# NOTE: this rebinds `all_clusters` / `all_labels` from flat collections to
# lists of lists.
all_clusters = []
all_labels = []

for seq_clusters, seq_labels in zip(clusters, labels):
    kept_ids = []
    kept_labels = []
    for step_clusters, step_labels in zip(seq_clusters, seq_labels):
        flat_ids = (cid for row in step_clusters for cid in row)
        flat_labs = (lab for row in step_labels for lab in row)
        for cluster_id, node_label in zip(flat_ids, flat_labs):
            if cluster_id != -1:
                kept_ids.append(cluster_id)
                kept_labels.append(node_label)
    all_clusters.append(kept_ids)
    all_labels.append(kept_labels)

In [140]:
# Fraction of sequences whose kept labels sum to a non-zero value
# (i.e. sequences counted as positive).
g_l = list(map(sum, all_labels))
np.mean(np.array(g_l) != 0)

0.997

In [110]:
# Split whole sequences, then flatten only the training side into per-sample
# arrays; X_test / y_test stay grouped per sequence for graph-level scoring.
X_train, X_test, y_train, y_test = train_test_split(
    all_clusters,
    all_labels,
    random_state=1,
)
X_train = np.array([cid for seq_ids in X_train for cid in seq_ids])
y_train = np.array([lab for seq_labs in y_train for lab in seq_labs])
# One-hot encode the training ids: row k of the identity matrix is the vector
# for cluster id k. The width is the largest id seen in the training split + 1.
X_train_one_hot = np.eye(X_train.max() + 1)[X_train]

In [135]:
# MLP on the one-hot training samples: early stopping enabled with
# n_iter_no_change=5, at most max_iter=300 iterations, fixed seed.
# NOTE: rebinds `clf`; the graph-level evaluation uses the last classifier fit.
clf = MLPClassifier(
    early_stopping=True,
    n_iter_no_change=5,
    max_iter=300,
    random_state=1,
)
clf.fit(X=X_train_one_hot, y=y_train)

In [112]:
# Decision tree on the one-hot training samples (fixed seed).
# NOTE: rebinds `clf`; the graph-level evaluation uses the last classifier fit.
clf = DecisionTreeClassifier(
    random_state=1,
)
clf.fit(X=X_train_one_hot, y=y_train)

In [117]:
# SVM on the one-hot training samples.
# max_iter=300 puts a hard cap on solver iterations, so the fit may stop
# before convergence.
# NOTE: rebinds `clf`; the graph-level evaluation uses the last classifier fit.
clf = SVC(
    max_iter=300,
    random_state=1,
)
clf.fit(X=X_train_one_hot, y=y_train)



In [119]:
# Graph-level evaluation on the held-out sequences: a sequence is predicted
# positive when the sum of its per-sample predictions is greater than zero, and
# is truly positive when the sum of its labels is greater than zero.
preds = []
trues = []
# The test encoding must have the same width as the matrix the classifier was
# trained on; this used to be a hard-coded 4.
n_features = X_train_one_hot.shape[1]
feature_ids = np.arange(n_features)
for g, l in zip(X_test, y_test):
    g = np.array(g)
    if g.size == 0:
        # No samples left in this sequence, so nothing can be predicted positive.
        g_pred = np.int64(0)
    else:
        # Comparison-based one-hot: an id outside the training range becomes an
        # all-zero row instead of raising IndexError.
        g_one_hot = (g[:, None] == feature_ids).astype(float)
        g_pred = clf.predict(g_one_hot)
        g_pred = (np.sum(g_pred) > 0).astype(int)
    preds.append(g_pred)

    g_true = (np.sum(np.array(l)) > 0).astype(int)
    trues.append(g_true)

In [120]:
# Graph-level (accuracy, fn / (fn + tp)) for the aggregated predictions.
get_metrics(y_pred=preds, y_test=trues)

(0.99704, 0.0)