In [1]:
import os
import networkx as nx
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [39]:
# Load the same weighted edge list twice so each copy can be pruned independently:
# one graph keeps only edges with weight >= 0.4, the other only edges with weight < 0.4.
gg_dd_graph_positive_weights = nx.read_weighted_edgelist('data/gg_dd_edges.edgelist', delimiter="\t")
gg_dd_graph_negative_weights = nx.read_weighted_edgelist('data/gg_dd_edges.edgelist', delimiter="\t")

# Positive graph: drop every edge whose weight is below the 0.4 cut-off.
print("Before:", gg_dd_graph_positive_weights.number_of_nodes(), gg_dd_graph_positive_weights.number_of_edges())
gg_dd_graph_positive_weights.remove_edges_from(
    # Materialise the list first so the graph is not mutated while being iterated.
    [(u, v) for u, v, weight in gg_dd_graph_positive_weights.edges(data='weight') if weight < 0.4]
)
print("After:", gg_dd_graph_positive_weights.number_of_nodes(), gg_dd_graph_positive_weights.number_of_edges())

# Negative graph: drop every edge whose weight reaches the 0.4 cut-off.
print("Before:", gg_dd_graph_negative_weights.number_of_nodes(), gg_dd_graph_negative_weights.number_of_edges())
gg_dd_graph_negative_weights.remove_edges_from(
    [(u, v) for u, v, weight in gg_dd_graph_negative_weights.edges(data='weight') if weight >= 0.4]
)
print("After:", gg_dd_graph_negative_weights.number_of_nodes(), gg_dd_graph_negative_weights.number_of_edges())

Before: 1217 675456
After: 1217 271086
Before: 1217 675456
After: 1217 404370


In [40]:
# Read the tab-separated edge list as a plain table; the file has no header row,
# so pandas labels the columns 0, 1 and 2.
df = pd.read_csv('data/gg_dd_edges.edgelist', sep='\t', header=None)
print(df)

            0     1         2
0          G1    D1  1.000000
1          G1    D2  1.000000
2          G1    D3  0.000000
3          G1    D4  0.000000
4          G1    D5  1.000000
...       ...   ...       ...
1132690  D217  D211  0.054521
1132691  D217  D212  0.026878
1132692  D217  D214  0.028294
1132693  D217  D216  0.041172
1132694  D217  D217  1.000000

[1132695 rows x 3 columns]


In [43]:
# Split rows on column 2: values >= 0.4 form the positive set,
# values < 0.4 the negative set. Each subset gets a fresh 0..n-1 index.
positive_samples, negative_samples = (
    df.loc[row_mask].reset_index(drop=True)
    for row_mask in (df[2] >= 0.4, df[2] < 0.4)
)
print(positive_samples, negative_samples, sep="\n")

           0     1         2
0         G1    D1  1.000000
1         G1    D2  1.000000
2         G1    D5  1.000000
3         G1    D6  1.000000
4         G1    D7  1.000000
...      ...   ...       ...
535254  D217   D31  0.838709
535255  D217   D60  0.564516
535256  D217  D119  0.402632
535257  D217  D196  0.458801
535258  D217  D217  1.000000

[535259 rows x 3 columns]
           0     1         2
0         G1    D3  0.000000
1         G1    D4  0.000000
2         G1    D9  0.000000
3         G1   D14  0.000000
4         G1   D15  0.000000
...      ...   ...       ...
597431  D217  D210  0.184015
597432  D217  D211  0.054521
597433  D217  D212  0.026878
597434  D217  D214  0.028294
597435  D217  D216  0.041172

[597436 rows x 3 columns]


In [44]:
# Down-sample both sets to the size of the smaller one, using the same seed for each draw.
min_count = min(map(len, (positive_samples, negative_samples)))
positive_samples_balanced, negative_samples_balanced = (
    frame.sample(n=min_count, random_state=21, ignore_index=True)
    for frame in (positive_samples, negative_samples)
)

# Stack the two halves, shuffle all rows with a fixed seed, then renumber the index.
balanced_dataset = (
    pd.concat([positive_samples_balanced, negative_samples_balanced])
    .sample(frac=1, random_state=21)
    .reset_index(drop=True)
)

# x keeps columns 0 and 1, y keeps column 2; both stay DataFrames.
x, y = balanced_dataset[[0, 1]], balanced_dataset[[2]]
print(x, y, sep="\n")

            0     1
0        G199  G724
1        G432  G294
2         D16   D49
3        G695   D40
4        G247  G350
...       ...   ...
1070513   G30  G746
1070514  G723  D139
1070515  G863  G848
1070516  G416  G764
1070517   G91   G83

[1070518 rows x 2 columns]
                2
0        0.237600
1        0.254833
2        0.024567
3        0.000000
4        0.378773
...           ...
1070513  0.417500
1070514  0.000000
1070515  0.538182
1070516  0.131000
1070517  0.421467

[1070518 rows x 1 columns]


In [46]:
# Diagnostic: among the values of column 2 below the 0.4 cut-off, count how many
# distinct values occur exactly once.
#
# Rows are selected with .loc so that y_threshold contains only the matching
# values. Masking the DataFrame with a DataFrame (y[y <= 0.4]) keeps every row and
# fills the non-matching ones with NaN, which np.unique would then also count.
# The comparison is strict (< 0.4) to match the negative-set rule used when the
# samples were split, where a value of exactly 0.4 is treated as positive.
y_threshold = y.loc[y[2] < 0.4, 2]
unique_classes, class_counts = np.unique(y_threshold, return_counts=True)

# Find classes with fewer than 2 samples
classes_with_less_than_2_samples = unique_classes[class_counts < 2]

print("Classes with less than 2 samples:", classes_with_less_than_2_samples.shape)

Classes with less than 2 samples: (1864,)


In [47]:
# Sanity check: print the (rows, columns) shape of x and of y.
print(x.shape, y.shape)

(1070518, 2) (1070518, 1)


In [36]:
# Treat each distinct value in y as a class and count its occurrences.
unique_classes, class_counts = np.unique(y, return_counts=True)

# Counts are positive integers, so "fewer than 2" means "exactly 1".
classes_with_less_than_2_samples = unique_classes[class_counts == 1]

# Keep only the rows whose y value is not one of those singleton classes.
valid_indices = np.isin(y, classes_with_less_than_2_samples, invert=True)
X_filtered, y_filtered = x[valid_indices], y[valid_indices]
print(X_filtered.shape, y_filtered.shape)

(1068161, 2) (1068161, 1)


In [49]:
# Treat each distinct value in y as a class and count its occurrences.
unique_classes, class_counts = np.unique(y, return_counts=True)

# Find classes with fewer than 2 samples
classes_with_less_than_2_samples = unique_classes[class_counts < 2]

# Filter out samples corresponding to these classes
valid_indices = ~np.isin(y, classes_with_less_than_2_samples)
X_filtered = x[valid_indices]
y_filtered = y[valid_indices]
print(X_filtered.shape, y_filtered.shape)

# 80/20 split, stratified on y_filtered. The split is seeded (21, the seed used for
# every other sampling step in this notebook) so that a re-run reproduces the same
# train/test files; without a seed each run yields a different partition.
# NOTE(review): stratify=y_filtered stratifies on every distinct float weight; if the
# goal is to preserve the positive/negative ratio, stratifying on (y_filtered[2] >= 0.4)
# would do that without first discarding singleton weights — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y_filtered,
    test_size=0.2,
    stratify=y_filtered,
    random_state=21,
)

# Renumber each part from 0 so the shuffled original row labels are discarded.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

(1068654, 2) (1068654, 1)


In [50]:
# Write the four split parts as header-less, index-less comma-separated text files.
# Create the target directory first so the export also works on a fresh checkout
# where 'data/balanced data' does not exist yet.
os.makedirs('data/balanced data', exist_ok=True)
X_train.to_csv('data/balanced data/x_train_gg_dd_0.4th.txt', sep=',', header=False, index=False)
y_train.to_csv('data/balanced data/y_train_gg_dd_0.4th.txt', header=False, index=False)
X_test.to_csv('data/balanced data/x_test_gg_dd_0.4th.txt', sep=',', header=False, index=False)
y_test.to_csv('data/balanced data/y_test_gg_dd_0.4th.txt', header=False, index=False)