# push flow

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import time
from tqdm import tqdm
from torch_sparse import spspmm, transpose
import pickle
from torch_scatter import segment_coo
from neighboring.ppr_power_iteration import *
from neighboring.pernode_ppr_neighbor import construct_sparse

In [3]:
# Read the pickled (train, val, test) node-index splits for papers100M.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('/nfs/students/qian/papers100m_splits.pkl', 'rb') as splits_file:
    splits = pickle.load(splits_file)
train_indices, val_indices, test_indices = splits

In [4]:
# Sizes of the train / val / test splits, in that order.
tuple(len(split) for split in (train_indices, val_indices, test_indices))

(1207179, 125265, 214338)

In [5]:
# Stack the three splits (train, then val, then test) into one index tensor.
prime_index_array = np.concatenate((train_indices, val_indices, test_indices))
adj_mask = torch.from_numpy(prime_index_array)

In [6]:
# Load the stored papers100M adjacency from disk.
adj_t = torch.load('/nfs/students/qian/data/papers100m/adj.pt')
# adj = torch.load('reddit_adj.pt')

In [None]:
from neighboring.ppr_power_iteration import ppr_power_iter

In [None]:
# Run the library routine `ppr_power_iter` on the train/val/test nodes.
# The concatenated indices get their own name so that the torch tensor
# `adj_mask` built earlier in the notebook is not rebound to a NumPy array
# (the push-flow loop further down calls `.to(device)` on slices of it).
prime_nodes = np.concatenate((train_indices, val_indices, test_indices))
neighbors, pprmat = ppr_power_iter(adj_t,
                                   'papers100M',
                                   topk=32,
                                   alpha=0.25,
                                   thresh=1e-4,
                                   adj_mask=torch.from_numpy(prime_nodes))

In [9]:
# Inspect the loaded adjacency.
adj_t

SparseTensor(row=tensor([        0,         0,         1,  ..., 111059955, 111059955,
                           111059955]),
             col=tensor([        0, 102309412,         1,  ...,  59629722,  95195371,
                           111059955]),
             val=tensor([0.5000, 0.0801, 0.1111,  ..., 0.1491, 0.2357, 0.3333]),
             size=(111059956, 111059956), nnz=3339184668, density=0.00%)

In [10]:
# Per-dataset settings for the chunked PPR computation.
# Every entry keeps the key order: chunksize, alpha, iters, top_percent, thresh.
prams = {
    'arxiv': dict(chunksize=10000, alpha=0.05, iters=3,
                  top_percent=[1, 0.01, 0.25], thresh=0.001),
    'products': dict(chunksize=10000, alpha=0.05, iters=3,
                     top_percent=[1, 0.01, 0.15], thresh=2e-4),
    'reddit': dict(chunksize=8000, alpha=0.05, iters=3,
                   top_percent=[0.1, 0.01, 0.15], thresh=1e-5),
    'papers100M': dict(chunksize=8000, alpha=0.25, iters=2,
                       top_percent=[0.1, 0.01, 0.15], thresh=2e-4),
}

$r_0 = \alpha \, e$

$p_0 = 0$

$p_k = p_{k-1} + r_{k-1}$

$r_k = (1 - \alpha) \, A \, r_{k-1}$

In [11]:
# Pull the papers100M settings out by key instead of relying on the
# insertion order of the config dict.
config = prams['papers100M']
chunksize = config['chunksize']
alpha = config['alpha']
iters = config['iters']
top_percent = config['top_percent']
thresh = config['thresh']

topk = 32

# Per-chunk results; consumed by construct_sparse further down.
neighbor_list = []
weights_list = []

In [66]:
# index1 = torch.stack((adj.storage._row, adj.storage._col), 0).to(device)
# value1 = adj.storage._value.to(device) * (1 - alpha)
# adj_sizes = adj.size()
# Restrict the adjacency to the rows indexed by adj_mask.
# NOTE(review): when adj_mask is None, prime_adj is never assigned here and
# later cells that read it would fail - confirm the intended fallback.
if adj_mask is not None:
    prime_adj = adj_t[adj_mask, :]
# prime_adj = down_sample_adj(prime_adj, topk)

In [54]:
# Device string used by the sparse computations in the cells below.
device = 'cuda'
# NOTE(review): alpha is also read from prams['papers100M'] (same value, 0.25).
alpha = 0.25

In [29]:
def adj_add(src, other):
    """Add two sparse matrices entry by entry.

    The COO entries of both operands are concatenated and duplicate
    coordinates are merged with ``coalesce(reduce='sum')``. The result spans
    the larger of the two sizes along each of the two dimensions. Values are
    carried over only when both operands have them; otherwise the result is
    built with ``value=None``.
    """
    rows_a, cols_a, vals_a = src.coo()
    rows_b, cols_b, vals_b = other.coo()

    both_valued = vals_a is not None and vals_b is not None
    merged_vals = torch.cat([vals_a, vals_b], dim=0) if both_valued else None

    out_shape = tuple(max(src.size(d), other.size(d)) for d in (0, 1))

    summed = SparseTensor(row=torch.cat([rows_a, rows_b], dim=0),
                          col=torch.cat([cols_a, cols_b], dim=0),
                          value=merged_vals,
                          sparse_sizes=out_shape)
    return summed.coalesce(reduce='sum')

In [30]:
# Quick check: add a sparse matrix to itself.
# NOTE(review): `ad` is not defined in any cell shown in this notebook -
# presumably left over from a deleted cell; confirm before re-running.
adj_add(src=ad, other=ad)

SparseTensor(row=tensor([      688,      2069,      2729,  ..., 111048021, 111053015,
                           111059919], device='cuda:0'),
             col=tensor([ 113, 3443, 1740,  ..., 6553, 7822,  726], device='cuda:0'),
             val=tensor([0.0745, 0.0212, 0.0571,  ..., 0.0367, 0.0118, 0.1348], device='cuda:0'),
             size=(111059954, 8000), nnz=139949, density=0.00%)

In [46]:
def spspmm_sum(src: SparseTensor, other: SparseTensor) -> SparseTensor:
    """Multiply two sparse matrices with ``torch.ops.torch_sparse.spspmm_sum``.

    Both operands are read in CSR form and the inner dimensions must match.
    Half-precision values are promoted to float32 for the kernel call; the
    result values are then cast to the dtype of ``src``'s values, or of
    ``other``'s when ``src`` has none.
    """
    assert src.sparse_size(1) == other.sparse_size(0)

    def _as_float_if_half(vals):
        if vals is not None and vals.dtype == torch.half:
            return vals.to(torch.float)
        return vals

    ptr_lhs, idx_lhs, val_lhs = src.csr()
    ptr_rhs, idx_rhs, val_rhs = other.csr()

    # Remember which tensor dictates the output dtype before any promotion.
    dtype_source = val_rhs if val_lhs is None else val_lhs

    n_rows = src.sparse_size(0)
    n_cols = other.sparse_size(1)
    ptr_out, idx_out, val_out = torch.ops.torch_sparse.spspmm_sum(
        ptr_lhs, idx_lhs, _as_float_if_half(val_lhs),
        ptr_rhs, idx_rhs, _as_float_if_half(val_rhs), n_cols)

    if val_out is not None and dtype_source is not None:
        val_out = val_out.to(dtype_source.dtype)

    return SparseTensor(row=None, rowptr=ptr_out, col=idx_out, value=val_out,
                        sparse_sizes=(n_rows, n_cols), is_sorted=True)

In [42]:
from torch_sparse import SparseTensor, cat

In [70]:
# Chunked push-flow PPR over the prime nodes, following the recurrence
#   r_0 = alpha * e,  p_0 = 0,  p_k = p_{k-1} + r_{k-1},  r_k = (1 - alpha) * A * r_{k-1}
col_parts = chunk_csr_col(prime_adj, chunksize=chunksize, device='cuda', return_adj=True)

for i, col_adj in enumerate(col_parts):

    ## push flow
    with torch.no_grad():
        # Node ids of the roots handled by this chunk. Column c of `p` holds
        # the scores of roots[c]; the chunk yielded by chunk_csr_col is assumed
        # to use the same column order (it is added to `p` below) - TODO confirm.
        roots = torch.as_tensor(adj_mask[i * chunksize: (i + 1) * chunksize]).to(device)
        root_cols = torch.arange(len(roots), device=roots.device)

        # p_1 = p_0 + r_0 = alpha * e
        p_val = torch.full(roots.size(), alpha, device=device, dtype=torch.float32)
        p = SparseTensor(row=roots, col=root_cols, value=p_val)

        # r_1 = (1 - alpha) * A * r_0. The adjacency values are not pre-scaled
        # by (1 - alpha) in this notebook, so the factor is applied here.
        col_adj = col_adj.set_value_(col_adj.storage._value * (alpha * (1 - alpha)), layout='csr')

        for it in range(1, iters + 1):
            # p_k = p_{k-1} + r_{k-1}
            p = adj_add(p, col_adj)

            if it < iters:
                # r_k = (1 - alpha) * A * r_{k-1}, one slab of rows of A at a time.
                # NOTE(review): a recorded run of this cell stopped after printing
                # j = 45 with "RuntimeError: Trying to create tensor with negative
                # dimension" - presumably a 32-bit size overflow somewhere in this
                # inner loop; the slab size may need to shrink. Confirm.
                row_parts = chunk_csr_col(adj_t, chunksize=1000000, device='cuda', axis='row', return_adj=True)
                new_col_adj_list = []
                for j, row_adj in enumerate(row_parts):
                    print(j)
                    new_col_adj_list.append(spspmm_sum(row_adj, col_adj))

                col_adj = cat(new_col_adj_list, dim=0)
                new_col_adj_list = None
                sizes = (col_adj.sparse_size(0), col_adj.sparse_size(1))
                row, col, value = col_adj.coo()
                value = value * (1 - alpha)

                # Drop residuals below the threshold.
                keep = value >= thresh
                # NOTE(review): the previous code indexed this per-entry mask with
                # node ids (`mask2[mask] = True`). The presumable intent - always
                # keep each root's own entry - is implemented here; confirm.
                keep |= (row == roots[col])

                # Rebuild the residual from the *filtered* entries and keep its shape.
                col_adj = SparseTensor(row=row[keep], col=col[keep], value=value[keep],
                                       sparse_sizes=sizes)

    col_adj = None
    # TODO: the top-k extraction below is disabled and still refers to the old
    # p_idx / p_val representation, so neighbor_list / weights_list stay empty.
#     index_ascending, sorting_indices = torch.sort(p_idx[1])  # col should be sorted
#     ppr_scores_ascending = p_val[sorting_indices]
#     mask = get_topk_neighbors_mask(size1, index_ascending, ppr_scores_ascending, topk)

#     (row, col), val = transpose(p_idx, p_val, adj.size()[0], size1)
#     split_idx = ((row[1:] > row[:-1]).nonzero().squeeze() + 1).cpu().numpy()
#     mask_splits = np.array_split(mask.cpu().numpy(), split_idx)
#     col_splits = np.array_split(col.cpu().numpy(), split_idx)
#     val_splits = np.array_split(val.cpu().numpy(), split_idx)

#     neighbor_list += [c[m] for c, m in zip(col_splits, mask_splits)]
#     weights_list += [v[m] for v, m in zip(val_splits, mask_splits)]

#     p_idx, p_val = None, None

    torch.cuda.empty_cache()

    print(f'max_memory = {torch.cuda.max_memory_allocated()}, '
                     f'memory = {torch.cuda.memory_allocated()}, '
                     f'max_memory_reserve = {torch.cuda.max_memory_reserved()}, '
                     f'memory_reserve = {torch.cuda.memory_reserved()}')

# The former `index1` / `value1` moves to CPU were dropped: those tensors are
# only created in commented-out code, so the lines raised NameError.

torch.cuda.reset_peak_memory_stats()

# `adj` is not defined in this notebook; use the shape of prime_adj instead.
pprmat = construct_sparse(neighbor_list, weights_list, prime_adj.sizes()).tocsr()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45


RuntimeError: Trying to create tensor with negative dimension -1565362272: [-1565362272]

In [None]:
# Length of every entry in neighbor_list.
lens = list(map(len, neighbor_list))

In [None]:
# Mean length of the neighbor_list entries.
np.mean(lens)

In [None]:
# Build a matrix from the collected neighbour/weight lists, using the sizes of
# prime_adj, and convert it with .tocsr().
pprmat = construct_sparse(neighbor_list, weights_list, prime_adj.sizes()).tocsr()

In [None]:
# Inspect the assembled matrix.
pprmat

In [None]:
# Rows: the first len(train_indices) rows of pprmat (the train block, since the
# splits were concatenated train-first); columns: the train node ids.
# The length is computed instead of hard-coding the split size.
train_mat = pprmat[:len(train_indices), :][:, train_indices]

In [None]:
# Inspect the train sub-matrix.
train_mat

In [None]:
# Rows: the block of pprmat that follows the train rows and spans
# len(val_indices) rows; columns: the validation node ids.
val_start = len(train_indices)
val_stop = val_start + len(val_indices)
val_mat = pprmat[val_start:val_stop, :][:, val_indices]

In [None]:
# Inspect the validation sub-matrix.
val_mat

In [None]:
from batching.loader_prime_orient_ppr import ppr_fixed_loader as prime_ppr_loader

In [None]:
# Object array of the neighbour lists, cut to the first len(train_indices) entries.
n_train = len(train_indices)
neighbors = np.array(neighbor_list, dtype=object)[:n_train]

In [None]:
# Build the loader from the train sub-matrix, train node ids and neighbour lists.
# NOTE(review): 5000 is the fourth positional argument of ppr_fixed_loader -
# presumably a batch size; confirm against its signature.
loader = prime_ppr_loader(train_mat, 
                         train_indices, 
                         neighbors, 
                         5000)

In [None]:
# Walk the loader once: print both lengths of every batch and total the first.
ps = 0
for prime_batch, neighbor_batch in loader:
    n_primes, n_neighbors = len(prime_batch), len(neighbor_batch)
    print(n_primes, n_neighbors)
    ps += n_primes

In [None]:
# Total of the first-element lengths over all batches.
ps

In [None]:
# NOTE(review): indexC and valueC are not defined in any cell shown in this
# notebook - presumably leftovers from a deleted cell; confirm or remove.
indexC, valueC