In [1]:
import os
import networkx as nx
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Parse the tab-separated weighted edge list twice so that the two graphs are
# independent objects; each one is then pruned in place below.
gmd_graph_positive_weights = nx.read_weighted_edgelist('data/gmd.edgelist', delimiter="\t")
gmd_graph_negative_weights = nx.read_weighted_edgelist('data/gmd.edgelist', delimiter="\t")

# Positive graph: discard every edge whose weight is <= 0 (nodes are kept).
print("Before:", gmd_graph_positive_weights.number_of_nodes(), gmd_graph_positive_weights.number_of_edges())
non_positive_edges = [
    (u, v)
    for u, v, weight in gmd_graph_positive_weights.edges(data='weight')
    if weight <= 0.0
]
gmd_graph_positive_weights.remove_edges_from(non_positive_edges)
print("After:", gmd_graph_positive_weights.number_of_nodes(), gmd_graph_positive_weights.number_of_edges())

# Negative graph: discard every edge whose weight is > 0 (nodes are kept).
print("Before:", gmd_graph_negative_weights.number_of_nodes(), gmd_graph_negative_weights.number_of_edges())
positive_edges = [
    (u, v)
    for u, v, weight in gmd_graph_negative_weights.edges(data='weight')
    if weight > 0.0
]
gmd_graph_negative_weights.remove_edges_from(positive_edges)
print("After:", gmd_graph_negative_weights.number_of_nodes(), gmd_graph_negative_weights.number_of_edges())

Before: 3693 595954
After: 3693 85836
Before: 3693 595954
After: 3693 510118


In [3]:
# Load the same edge list as a headerless table; pandas labels the three
# columns 0, 1 and 2 (two node identifiers followed by the edge weight).
df = pd.read_csv('data/gmd.edgelist', sep='\t', header=None, index_col=None)
print(df)

            0     1  2
0          G1    M1  1
1          G1    M2  1
2          G1    M3  1
3          G1    M4  1
4          G1    M5  1
...       ...   ... ..
595949  M2472  D217  0
595950  M2473  D217  0
595951  M2474  D217  0
595952  M2475  D217  0
595953  M2476  D217  0

[595954 rows x 3 columns]


In [4]:
# Split the rows on the sign of column 2: strictly positive values form the
# positive class, values <= 0 form the negative class. Each part gets a fresh
# 0..n-1 index.
positive_samples = df.loc[df[2].gt(0)].reset_index(drop=True)
negative_samples = df.loc[df[2].le(0)].reset_index(drop=True)
print(positive_samples)
print(negative_samples)

          0     1  2
0        G1    M1  1
1        G1    M2  1
2        G1    M3  1
3        G1    M4  1
4        G1    M5  1
...     ...   ... ..
85831  M403  D215  1
85832  M636  D216  1
85833    M2  D217  1
85834   M11  D217  1
85835  M780  D217  1

[85836 rows x 3 columns]
            0     1  2
0          M1    D1  0
1          M2    D1  0
2          M3    D1  0
3          M4    D1  0
4          M5    D1  0
...       ...   ... ..
510113  M2472  D217  0
510114  M2473  D217  0
510115  M2474  D217  0
510116  M2475  D217  0
510117  M2476  D217  0

[510118 rows x 3 columns]


In [5]:
# Down-sample both classes to the size of the smaller one so the resulting
# dataset is balanced. The fixed seed keeps the draw repeatable.
min_count = min(positive_samples.shape[0], negative_samples.shape[0])
sampling_options = dict(n=min_count, random_state=12, ignore_index=True)
positive_samples_balanced = positive_samples.sample(**sampling_options)
negative_samples_balanced = negative_samples.sample(**sampling_options)
print(positive_samples_balanced)
print(negative_samples_balanced)

           0      1  2
0      M2013    D27  1
1       G602  M1269  1
2       G631   M635  1
3       G414   M369  1
4       G910   M871  1
...      ...    ... ..
85831  M2121    D77  1
85832   G559   M737  1
85833   G601  M1366  1
85834   G330   M468  1
85835  M1482   D110  1

[85836 rows x 3 columns]
           0     1  2
0       M554   D94  0
1      M1896  D214  0
2       M895   D86  0
3       M851  D159  0
4       M685   D82  0
...      ...   ... ..
85831  M1519   D16  0
85832  M2319  D175  0
85833  M2043  D132  0
85834   M470  D181  0
85835   M646  D217  0

[85836 rows x 3 columns]


In [6]:
# Stack the two balanced classes, shuffle all rows with a fixed seed, and
# renumber the index from zero.
balanced_dataset = (
    pd.concat([positive_samples_balanced, negative_samples_balanced])
    .sample(frac=1, random_state=12)
    .reset_index(drop=True)
)
print(balanced_dataset)

            0      1  2
0       M1455    D71  0
1        M746   D165  0
2        G246   M410  1
3        G597   M118  1
4        G448   M410  1
...       ...    ... ..
171667   M330    D69  1
171668    M42    D11  0
171669   G464   M115  1
171670   G449  M1909  1
171671  M1738   D131  0

[171672 rows x 3 columns]


In [7]:
# Features are the two node-identifier columns; the label is column 2.
# Both are kept as DataFrames (list selectors), not Series.
x = balanced_dataset.loc[:, [0, 1]]
y = balanced_dataset.loc[:, [2]]
print(x)
print(y)

            0      1
0       M1455    D71
1        M746   D165
2        G246   M410
3        G597   M118
4        G448   M410
...       ...    ...
171667   M330    D69
171668    M42    D11
171669   G464   M115
171670   G449  M1909
171671  M1738   D131

[171672 rows x 2 columns]
        2
0       0
1       0
2       1
3       1
4       1
...    ..
171667  1
171668  0
171669  1
171670  1
171671  0

[171672 rows x 1 columns]


In [8]:
# Stratified 80/20 train/test split. The seed is pinned (12, the same value
# used by the sampling and shuffling steps) so that re-running the notebook
# yields the same partition; without it every run produced different splits.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=12
)

# Renumber each part from zero; the shuffled original row labels are dropped.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in (X_train, X_test, y_train, y_test)
)

In [9]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails here with OSError.
os.makedirs('data/balanced data', exist_ok=True)

# Write features and labels as headerless, index-free comma-separated text
# (',' is also the default separator used for the label files).
X_train.to_csv('data/balanced data/x_train_gmd.txt', sep=',', header=False, index=False)
y_train.to_csv('data/balanced data/y_train_gmd.txt', header=False, index=False)
X_test.to_csv('data/balanced data/x_test_gmd.txt', sep=',', header=False, index=False)
y_test.to_csv('data/balanced data/y_test_gmd.txt', header=False, index=False)