# GraphSAINT Analysis

In [1]:
from imports import *
from linkpred import *
from dataset import *

import matplotlib.pyplot as plt
import matplotlib
import matplotlib.style as style 
style.use('seaborn-paper')
%matplotlib inline

fontsize = 12
plt.rcParams.update({
    'font.size': fontsize, 
    'axes.labelsize': fontsize, 
    'legend.fontsize': fontsize,
    'xtick.labelsize': fontsize,
    'ytick.labelsize': fontsize,
    'axes.titlesize': fontsize
                    })



In [2]:
## set random seeds
# Seed Python's `random`, NumPy and PyTorch with one shared value.
seed_value = 123
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)

<torch._C.Generator at 0x7f718867af70>

In [3]:
# Report the installed PyTorch version, then pick the GPU when CUDA is
# available and the CPU otherwise.
print(torch.__version__)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('device =', device)

1.5.1
device = cuda


## Load the Dataset

In [4]:
# Read the per-node attribute table from disk and preview its first rows.
node_attributes_path = 'data/NDSSL data/raw/node_attributes.csv'
node_attributes = pd.read_csv(node_attributes_path)
node_attributes.head(n=5)

Unnamed: 0,person_id,household_id,age,gender,worker,relationship,household_income,household_size,zipcode,household_vehicles,household_workers
0,2509159,2201175,42,1,1,1,13,3,97201,3,2
1,2509160,2201175,43,2,1,1,13,3,97201,3,2
2,2509161,2201175,17,1,2,2,13,3,97201,3,2
3,2509162,2201176,41,1,1,1,11,2,97201,3,1
4,2509163,2201176,11,1,2,2,11,2,97201,3,1


In [None]:
# Sweep the validation/test split ratio p and, for every split, record per-batch
# statistics of the GraphSAINT random-walk subgraphs sampled from the training graph.
# NOTE(review): the sweep starts at p = 0.5, i.e. val_ratio + test_ratio = 1.0, and
# the first printed training fraction is 0.00 -- confirm this end point is intended.
plist = np.linspace(0.5, 0.4, 20)
density = []     # one list of per-batch densities for each value of p
edge_ratio = []  # one list of per-batch edge/node ratios for each value of p

walk_length = 30
batch_size = 1000
num_steps = 10
sample_coverage = 20

# Node attribute columns passed to the one-hot encoder; constant across the sweep,
# so the list is built once instead of on every iteration.
data_columns = ['age', 'gender', 'worker', 'relationship', 'household_income', 'household_size', 'zipcode', 'household_vehicles', 'household_workers']

for p in plist:
    print('fraction of data in training set: %.2f' %(1.0 - 2*p))

    # Reloaded on every iteration, as in the original; presumably so that each
    # split starts from an unmodified graph -- TODO confirm.
    dataset = load_dataset(dataset_name='NDSSL')
    data = dataset[0]
    data = train_test_split_big(data, val_ratio=p, test_ratio=p)

    data.x = dataframe2onehot(node_attributes[data_columns], node_attributes)

    # Training graph: all node features, but only the positive training edges.
    train_data = Data(x=data.x, edge_index=data.train_pos_edge_index, y=data.y)
    col = train_data.edge_index[1]
    train_data.edge_attr = 1. / degree(col, train_data.num_nodes)[col]  # Norm by in-degree.

    train_loader = GraphSAINTRandomWalkSampler(train_data, batch_size=batch_size, walk_length=walk_length,
                                         num_steps=num_steps, sample_coverage=sample_coverage,
                                         save_dir=None)
    density_tmp = []
    edge_ratio_tmp = []
    for batch in train_loader:
        n = batch.x.shape[0]           # rows of the batch feature matrix (nodes)
        m = batch.edge_index.shape[1]  # columns of the batch edge index (edges)

        # NOTE(review): if edge_index stores both directions of each undirected
        # edge, the factor 2 double counts -- verify against the data format.
        density_tmp.append((2*m/(n*(n-1))))
        edge_ratio_tmp.append((m/n))

    density.append(density_tmp)
    # Fix: the original appended edge_ratio_tmp to itself here, which left
    # edge_ratio empty after the sweep.
    edge_ratio.append(edge_ratio_tmp)

fraction of data in training set: 0.00


Compute GraphSAINT normalization:   0%|          | 0/32026600 [00:00<?, ?it/s]

In [None]:
# Side-by-side histograms comparing train and test values: the first array on
# the left axis, the second on the right.
# NOTE(review): density_train, density_test, edge_ratio_train and edge_ratio_test
# are not defined in any cell visible in this notebook -- confirm where they
# come from before running this cell on a fresh kernel.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
density_ax, edge_ratio_ax = ax

density_ax.hist(density_train, label='train')
density_ax.hist(density_test, label='test')
density_ax.legend()

edge_ratio_ax.hist(edge_ratio_train, label='train')
edge_ratio_ax.hist(edge_ratio_test, label='test')
edge_ratio_ax.legend()

plt.show()