In [34]:
import os
import networkx as nx
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [18]:
# Parse the tab-separated weighted edge list once. The second graph is an
# independent copy, so the edge removals applied to each graph in the following
# cells do not affect the other one.
gld_graph_positive_weights = nx.read_weighted_edgelist('data/gld.edgelist', delimiter="\t")
gld_graph_negative_weights = gld_graph_positive_weights.copy()

In [19]:
# Show the one-line summary of each graph (negative first, then positive).
print(gld_graph_negative_weights, gld_graph_positive_weights, sep="\n")

Graph with 2031 nodes and 178880 edges
Graph with 2031 nodes and 178880 edges


In [20]:
# Keep only strictly positive edges in the "positive" graph: collect every edge
# whose weight is <= 0 first, then remove them in one call. Nodes are never
# removed, so the node count stays the same.
print("Before:", len(gld_graph_positive_weights), gld_graph_positive_weights.number_of_edges())
non_positive_edges = [
    (source, target)
    for source, target, attrs in gld_graph_positive_weights.edges(data=True)
    if attrs['weight'] <= 0.0
]
gld_graph_positive_weights.remove_edges_from(non_positive_edges)
print("After:", len(gld_graph_positive_weights), gld_graph_positive_weights.number_of_edges())

Before: 2031 178880
After: 2031 5676


In [21]:
# Keep only edges with weight <= 0 in the "negative" graph: collect every edge
# whose weight is strictly positive first, then remove them in one call. Nodes
# are never removed, so the node count stays the same.
print("Before:", len(gld_graph_negative_weights), gld_graph_negative_weights.number_of_edges())
positive_edges = [
    (source, target)
    for source, target, attrs in gld_graph_negative_weights.edges(data=True)
    if attrs['weight'] > 0.0
]
gld_graph_negative_weights.remove_edges_from(positive_edges)
print("After:", len(gld_graph_negative_weights), gld_graph_negative_weights.number_of_edges())

Before: 2031 178880
After: 2031 173204


In [50]:
# Load the same edge list as a headerless, tab-separated table; the columns get
# the default integer labels 0, 1 and 2.
df = pd.read_csv(
    'data/gld.edgelist',
    sep='\t',
    header=None,
)
print(df)

           0     1  2
0         G1   L39  1
1         G1   L41  1
2         G1   L85  1
3         G1  L185  1
4         G1  L205  1
...      ...   ... ..
178875  L810  D217  0
178876  L811  D217  0
178877  L812  D217  0
178878  L813  D217  0
178879  L814  D217  0

[178880 rows x 3 columns]


In [52]:
# Split the rows on the value in column 2: strictly positive rows are the
# positive samples, everything <= 0 is a negative sample. Both frames get a
# fresh 0..n-1 index.
positive_samples = df.loc[df[2].gt(0)].reset_index(drop=True)
negative_samples = df.loc[df[2].le(0)].reset_index(drop=True)
print(positive_samples)

         0     1  2
0       G1   L39  1
1       G1   L41  1
2       G1   L85  1
3       G1  L185  1
4       G1  L205  1
...    ...   ... ..
5671   L71  D217  1
5672   L73  D217  1
5673   L80  D217  1
5674   L98  D217  1
5675  L113  D217  1

[5676 rows x 3 columns]


In [56]:
# Balance the two classes by drawing the same number of rows from each: the
# size of the smaller class. Sampling is seeded so the draw is repeatable.
min_count = min(map(len, (positive_samples, negative_samples)))
positive_samples_balanced = positive_samples.sample(
    n=min_count, random_state=12, ignore_index=True
)
negative_samples_balanced = negative_samples.sample(
    n=min_count, random_state=12, ignore_index=True
)
print(positive_samples_balanced, negative_samples_balanced, sep="\n")

         0     1  2
0      L34   D62  1
1      G22   L71  1
2      L69   D45  1
3     G128  L299  1
4     L289   D46  1
...    ...   ... ..
5671  L323   D52  1
5672  G547  L113  1
5673  L687   D46  1
5674  L145   D44  1
5675  G618   L71  1

[5676 rows x 3 columns]
         0     1  2
0     L287   D40  0
1     L780  D193  0
2     L362  D143  0
3      L45  D134  0
4     L660  D188  0
...    ...   ... ..
5671  L691  D182  0
5672   L43   D28  0
5673  L496  D142  0
5674  L734   D39  0
5675   L51  D120  0

[5676 rows x 3 columns]


In [61]:
# Stack the balanced positive and negative rows, then shuffle the whole frame
# with a fixed seed and renumber the rows 0..n-1.
balanced_dataset = (
    pd.concat([positive_samples_balanced, negative_samples_balanced])
    .sample(frac=1, random_state=12, ignore_index=True)
)
print(balanced_dataset)

          0     1  2
0      G105  L181  1
1      L366  D170  0
2      L679  D160  0
3      G152   L71  1
4      L687  D121  0
...     ...   ... ..
11347  L149  D129  1
11348  L508   D30  0
11349   L81   D94  1
11350  L180   D27  0
11351  L755  D182  0

[11352 rows x 3 columns]


In [64]:
# Features are the first two columns; the target is column 2, kept as a
# one-column DataFrame.
x = balanced_dataset.loc[:, [0, 1]]
y = balanced_dataset.loc[:, [2]]
print(x, y, sep="\n")

          0     1
0      G105  L181
1      L366  D170
2      L679  D160
3      G152   L71
4      L687  D121
...     ...   ...
11347  L149  D129
11348  L508   D30
11349   L81   D94
11350  L180   D27
11351  L755  D182

[11352 rows x 2 columns]
       2
0      1
1      0
2      0
3      1
4      0
...   ..
11347  1
11348  0
11349  1
11350  0
11351  0

[11352 rows x 1 columns]


In [73]:
# Stratified 80/20 train/test split on the label column, with each resulting
# frame re-indexed from 0. The split is seeded with the same random_state (12)
# used for sampling and shuffling above, so re-running the notebook produces
# the same train/test files instead of a different partition every time.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(x, y, test_size=0.2, stratify=y, random_state=12)
)

In [78]:
# Persist the four splits as headerless, index-free CSV files.
# Create the output directory first so the export also works on a fresh
# checkout where 'data/balanced data' does not exist yet.
os.makedirs('data/balanced data', exist_ok=True)
X_train.to_csv('data/balanced data/x_train_gld.txt', sep=',', header=False, index=False)
y_train.to_csv('data/balanced data/y_train_gld.txt', header=False, index=False)
X_test.to_csv('data/balanced data/x_test_gld.txt', sep=',', header=False, index=False)
y_test.to_csv('data/balanced data/y_test_gld.txt', header=False, index=False)