In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import os
import scipy.sparse as sparse
import copy

## DrugNet

In [2]:
# --- DrugNet pre-processing -------------------------------------------------
# Reads the raw adjacency and attribute CSVs, symmetrises the graph, removes
# nodes without any edge plus a fixed list of node positions, and writes the
# cleaned graph/attribute matrices next to the raw data.
path1 = "data/DrugNet/CSV/"
path0 = "data/DrugNet/"
path2 = "results/New/DrugNet/"

# Row 0 and column 0 are dropped from both files after parsing
# (presumably header row / id column -- confirm against the CSVs).
A0 = np.genfromtxt(path1 + 'DRUGNET.csv', delimiter=',')
A0 = A0[1:, 1:]
A0 = np.maximum(A0, A0.T)  # make the network undirected

F0 = np.genfromtxt(path1 + 'DRUGATTR.csv', delimiter=',').astype(np.int64)
F0 = F0[1:, 1:]

# Keep only nodes whose combined row + column sum is non-zero.
s = A0.sum(axis=1) + A0.sum(axis=0)
nze = np.flatnonzero(s != 0)

A = A0[np.ix_(nze, nze)]
F = F0[nze]
nn = len(A)

# Positions (within the already reduced matrix) of nodes to drop.
# NOTE(review): hard-coded list, described upstream as "unlinked nodes";
# how it was derived is not visible in this notebook.
sm = np.array([105,151,51,135,145,147,35,176,181,158,166,114,117,11,73,98,120,126,192])
inter = np.setdiff1d(np.arange(nn), sm)

# Restrict the graph and the attributes to the retained positions.
A = A[np.ix_(inter, inter)]
F = F[inter]

for out_name, matrix in (('DrugNetgraph.csv', A), ('DrugNetfeature.csv', F)):
    np.savetxt(path0 + out_name, matrix, fmt='%d', delimiter=',')

## NBA

In [4]:
# --- NBA pre-processing -----------------------------------------------------
# Input / output locations.
path1 = "data/NBA/"
path2 = "results/New/NBA/"

# Step 1 -- node table. Column 0 is used as the node id, column 37 as the
# protected attribute and column 1 as the class label.
data = np.genfromtxt(path1 + 'nba.csv', delimiter=',', skip_header=1, dtype=np.int64)
att = data[:, [0, 37]]
lab = data[:, [0, 1]]

# id -> value lookup tables (a later duplicate id overrides an earlier one).
att_dict = dict(zip(att[:, 0], att[:, 1]))
lab_dict = dict(zip(lab[:, 0], lab[:, 1]))

# Step 2 -- edge list: one tab-separated pair of node ids per line.
E = np.genfromtxt(path1 + 'nba_relationship.txt', delimiter='\t', dtype=np.int64)
N = np.unique(E)          # sorted ids of every node that appears in an edge
n = N.shape[0]
node2idx = dict(zip(N, range(n)))  # node id -> row/column position

# Step 3 -- symmetric 0/1 adjacency matrix.
indices1 = np.array(list(map(node2idx.__getitem__, E[:, 0])))
indices2 = np.array(list(map(node2idx.__getitem__, E[:, 1])))
A = np.zeros((n, n), dtype=np.int64)
A[indices1, indices2] = 1
A[indices2, indices1] = 1

# Step 4 -- per-node attribute (F) and label (L), aligned with N.
# Nodes missing from nba.csv fall back to 0.
F = np.array([att_dict.get(node, 0) for node in N], dtype=np.int64)
L = np.array([lab_dict.get(node, 0) for node in N], dtype=np.int64)

# Persist graph, attribute and raw label.
for out_name, values in (("NBAgraph.csv", A), ("NBAfeature.csv", F), ("NBAlabel.csv", L)):
    np.savetxt(path2 + out_name, values, fmt='%d', delimiter=',')

# Step 5 -- binary label: 1 for the most frequent label value, 0 otherwise.
unique_labels, counts = np.unique(L, return_counts=True)
majority_label = unique_labels[counts.argmax()]
binary_labels = (L == majority_label).astype(int)
np.savetxt(path2 + "NBAlabel_binary.csv", binary_labels, fmt='%d', delimiter=',')

print("NBA pre-processing complete.")
print("Majority label is:", majority_label)

## School

In [5]:
# --- School: metadata + Facebook known-pairs network -------------------------
path1 = "data/School/"
path2 = "results/New/School/"

# Metadata table: column 1 is dropped, the strings 'F'/'M'/'Unknown' are
# encoded as 1/2/0, and column 0 becomes the index (used below as node id).
df = pd.read_csv(path1+'metadata_2013.txt', delimiter='\t', header=None)
df = df.drop(columns=[1])
df = df.replace(['F','M', 'Unknown'], [1, 2, 0])
df = df.set_index(0)

print(df.loc[34].values[0])

# Edge list rows are "id1 id2 value", space separated.
E = np.genfromtxt(path1+'Facebook-known-pairs_data_2013.csv', delimiter=' ').astype(np.int32)
e = E.shape[0]

# Sorted unique node ids taken from every column except the last one.
N = np.unique(E[:, :-1])
n = N.shape[0]

A = np.zeros((n, n), dtype=np.int32)
F = np.zeros(n, dtype=np.int32)

# N is sorted, unique and contains every endpoint, so searchsorted returns the
# exact position of each id. This replaces the per-edge
# `int(np.where(N == id)[0])`, which scanned N for every edge and relied on
# the deprecated conversion of a 1-element array to a Python scalar.
src = np.searchsorted(N, E[:, 0])
dst = np.searchsorted(N, E[:, 1])

# One metadata lookup per node (every node in N is an endpoint of some edge,
# so this fills exactly the entries the per-edge lookup used to fill).
for k, node in enumerate(N):
    F[k] = df.loc[node].values[0]

# Kept as an ordered loop on purpose: if a pair occurs more than once, the
# last occurrence determines both A[l1, l2] and A[l2, l1].
for i in range(e):
    l1, l2 = src[i], dst[i]
    A[l1, l2] = E[i, 2]
    A[l2, l1] = E[i, 2]

# Drop the node at position 5.
# NOTE(review): hard-coded position; the reason is not visible here.
sm = np.array([5])
inter = np.setdiff1d(np.arange(n), sm)

A = A[inter, :]
A = A[:, inter]
F = F[inter]
np.savetxt(path1+'facebook.csv', A, fmt='%d', delimiter=',')
np.savetxt(path1+'school_attrib.csv', F, fmt='%d', delimiter=',')

0


In [6]:
# --- School: contact-diaries network -----------------------------------------
# Uses `path1` and the metadata frame `df` defined by the preceding School cell.
# Edge list rows are "id1 id2 value", space separated.
E = np.genfromtxt(path1+'Contact-diaries-network_data_2013.csv', delimiter=' ').astype(np.int32)
e = E.shape[0]

# Sorted unique node ids taken from every column except the last one.
N = np.unique(E[:, :-1])
n = N.shape[0]

A = np.zeros((n, n), dtype=np.int32)
F = np.zeros(n, dtype=np.int32)

# N is sorted, unique and contains every endpoint, so searchsorted returns the
# exact position of each id. This replaces the per-edge
# `int(np.where(N == id)[0])`, which scanned N for every edge and relied on
# the deprecated conversion of a 1-element array to a Python scalar.
src = np.searchsorted(N, E[:, 0])
dst = np.searchsorted(N, E[:, 1])

# One metadata lookup per node instead of two per edge.
for k, node in enumerate(N):
    F[k] = df.loc[node].values[0]

# Ordered loop kept so that, for repeated pairs, the last occurrence sets both
# symmetric entries (same result as the original edge-by-edge fill).
for i in range(e):
    l1, l2 = src[i], dst[i]
    A[l1, l2] = E[i, 2]
    A[l2, l1] = E[i, 2]

# Binarise: any positive value becomes an (unweighted) edge.
A[A>0] = 1
np.savetxt(path1+'diaries.csv', A, fmt='%d', delimiter=',')

In [7]:
# --- School: friendship network ----------------------------------------------
# Uses `path1` and the metadata frame `df` defined by the preceding School cell.
# Edge list rows are "id1 id2", space separated.
E = np.genfromtxt(path1+'Friendship-network_data_2013.csv', delimiter=' ').astype(np.int32)
e = E.shape[0]

# Sorted unique node ids over the whole edge list.
N = np.unique(E)
n = N.shape[0]

A = np.zeros((n, n), dtype=np.int32)
F = np.zeros(n, dtype=np.int32)

# N is sorted, unique and contains every endpoint, so searchsorted returns the
# exact position of each id. This replaces the per-edge
# `int(np.where(N == id)[0])`, which scanned N for every edge and relied on
# the deprecated conversion of a 1-element array to a Python scalar.
src = np.searchsorted(N, E[:, 0])
dst = np.searchsorted(N, E[:, 1])

# One metadata lookup per node instead of two per edge.
for k, node in enumerate(N):
    F[k] = df.loc[node].values[0]

# Every write is the constant 1, so the symmetric fill can be done in bulk;
# repeated pairs cannot change the outcome.
A[src, dst] = 1
A[dst, src] = 1

# Drop a fixed set of node positions.
# NOTE(review): hard-coded positions; the reason is not visible here.
nn = A.shape[0]
sm = np.array([5, 68, 126, 130, 24, 79, 125])

inter = np.setdiff1d(np.arange(nn), sm)

A = A[inter, :]
A = A[:, inter]
F = F[inter]

np.savetxt(path1+'Friendship.csv', A, fmt='%d', delimiter=',')

## LastFM

## Pokec

In [None]:
# --- Pokec pre-processing ----------------------------------------------------
path1 = "data/Pokec/"
path2 = "results/New/Pokec/"

# Parse the node table once (it was previously parsed twice, once per column
# pair). Column 0 is the node id, column 6 the label, column 5 the age.
region_job = np.genfromtxt(path1+'region_job.csv', delimiter=',', skip_header=1)
lab = region_job[:, [0, 6]].astype(np.int32)
att = region_job[:, [0, 5]].astype(np.int32)

# Edge list: one tab-separated pair of node ids per line.
E = np.genfromtxt(path1+'region_job_relationship.txt', delimiter='\t').astype(np.int32)
e = E.shape[0]

# Sorted unique ids of every node that appears in an edge.
N = np.unique(E)
n = N.shape[0]

# Age groups -- '1': Age[0, 18], '2': Age[19, 25], '3': Age[26, 35], '4': Age[36+]
# Similar to ECAI-2023: https://arxiv.org/pdf/2307.12065.pdf
# np.digitize bins every age in a single pass, so the result does not depend
# on the order in which the groups are written (the former in-place updates
# were only correct because all group codes are <= 18).
att[:, 1] = np.digitize(att[:, 1], [19, 26, 36]) + 1

# Join node ids against the attribute table in one vectorised step instead of
# scanning the whole table with np.where for both endpoints of every edge.
missing = np.setdiff1d(N, att[:, 0])
if missing.size:
    # The per-edge version failed here with an opaque shape error.
    raise ValueError(
        "%d node id(s) in region_job_relationship.txt have no row in "
        "region_job.csv (first: %d)" % (missing.size, missing[0])
    )
order = np.argsort(att[:, 0], kind='stable')
# If an id were repeated in the table, its first row (file order) is used.
node_rows = order[np.searchsorted(att[order, 0], N)]

F = att[node_rows, 1]        # age group per node, aligned with N
label = lab[node_rows, 1]    # label per node, aligned with N

# Symmetric 0/1 adjacency. N is sorted and contains every endpoint, so
# searchsorted returns each endpoint's exact position.
# NOTE(review): A is dense (n x n int32); for large n this needs a lot of
# memory, but the dense CSV written below requires it.
src = np.searchsorted(N, E[:, 0])
dst = np.searchsorted(N, E[:, 1])
A = np.zeros((n, n), dtype=np.int32)
A[src, dst] = 1
A[dst, src] = 1

Pokec_sp = sparse.csc_matrix(A)

sparse.save_npz(path1+"pre_processed/sparse_Pokec_graph_region_A.npz", Pokec_sp)
np.savetxt(path1+'pre_processed/Pokecgraph_reg1.csv', A, fmt='%d', delimiter=',')
np.savetxt(path1+'pre_processed/Pokecfeature_reg1.csv', F, fmt='%d', delimiter=',')

# Balance = size of the smallest age group divided by the size of the largest.
all_in_one = np.ones(F.shape[0])
uniqe_vals, count = np.unique(F, return_counts=True)
Pokec_balance = min(count)/max(count)

print("Dataset balance = ", Pokec_balance)