In [1]:
import torch
import numpy as np
from scipy.stats import spearmanr

In [2]:
# Generate a batch of random sequential data that
train_input = torch.randn(100, 500, 64)
train_grad_pre_activation = torch.randn(100, 500, 64)

test_input = torch.randn(20, 500, 64)
test_grad_pre_activation = torch.randn(20, 500, 64)

precondition = True

In [3]:
# threshold the input and grad_pre_activation
threshold = 1e-4

train_input_thrd = torch.where(torch.abs(train_input) > threshold, train_input, torch.zeros_like(train_input))
train_grad_pre_activation_thrd = torch.where(torch.abs(train_grad_pre_activation) > threshold, train_grad_pre_activation, torch.zeros_like(train_grad_pre_activation))

test_input_thrd = torch.where(torch.abs(test_input) > threshold, test_input, torch.zeros_like(test_input))
test_grad_pre_activation_thrd = torch.where(torch.abs(test_grad_pre_activation) > threshold, test_grad_pre_activation, torch.zeros_like(test_grad_pre_activation))

train_grad_from_thrd = torch.einsum('ijk,ijl->ikl', train_grad_pre_activation_thrd, train_input_thrd).reshape(train_input.shape[0], -1)
test_grad_from_thrd = torch.einsum('ijk,ijl->ikl', test_grad_pre_activation_thrd, test_input_thrd).reshape(test_input.shape[0], -1)

if precondition:
    # Compute empirical covariance of the training data
    train_grad_cov_from_thrd = torch.matmul(train_grad_from_thrd.T, train_grad_from_thrd) / train_grad_from_thrd.shape[0]
    train_grad_cov_from_thrd_inv = torch.linalg.inv(train_grad_cov_from_thrd)

    # Precondition the training data
    train_grad_from_thrd_preconditioned = torch.matmul(train_grad_from_thrd, train_grad_cov_from_thrd_inv)

    # Compute the inner products between the preconditioned training grad and the original test grad
    score_from_thrd = torch.matmul(test_grad_from_thrd, train_grad_from_thrd_preconditioned.T)
else:
    # Compute the inner products between the original training grad and the original test grad
    score_from_thrd = torch.matmul(test_grad_from_thrd, train_grad_from_thrd.T)

In [4]:
# threshold the gradient
threshold = 1e-4

train_grad = torch.einsum('ijk,ijl->ikl', train_grad_pre_activation, train_input).reshape(train_input.shape[0], -1)
test_grad = torch.einsum('ijk,ijl->ikl', test_grad_pre_activation, test_input).reshape(test_input.shape[0], -1)

train_grad_thrd = torch.where(torch.abs(train_grad) > threshold, train_grad, torch.zeros_like(train_grad))
test_grad_thrd = torch.where(torch.abs(test_grad) > threshold, test_grad, torch.zeros_like(test_grad))

if precondition:
    # Compute empirical covariance of the training data
    train_grad_cov_thrd = torch.matmul(train_grad_thrd.T, train_grad_thrd) / train_grad_thrd.shape[0]
    train_grad_cov_thrd_inv = torch.linalg.inv(train_grad_cov_thrd)

    # Precondition the training data
    train_grad_thrd_preconditioned = torch.matmul(train_grad_thrd, train_grad_cov_thrd_inv)

    # Compute the inner products between the preconditioned training grad and the original test grad
    score_thrd = torch.matmul(test_grad_thrd, train_grad_thrd_preconditioned.T)
else:
    # Compute the inner products between the original training grad and the original test grad
    score_thrd = torch.matmul(test_grad_thrd, train_grad_thrd.T)

In [5]:
def cor(score1, score2):
    score1 = score1.detach().cpu()
    score2 = score2.detach().cpu()

    # Calculate correlations
    res = 0
    counter = 0
    for i in range(score1.shape[1]):
        tmp = spearmanr(
            np.array([score1[k][i] for k in range(len(score1))]),
            np.array([score2[k][i] for k in range(len(score2))])
        ).statistic
        if not np.isnan(tmp):
            res += tmp
            counter += 1

    return res/counter if counter > 0 else 0

print("Correlation between the two scores:", cor(score_from_thrd, score_thrd))

# inspect sparsity
print("Sparsity of the input for thresholding in activation-level:", torch.sum(train_input_thrd == 0) / train_input_thrd.numel())
print("Sparsity of the training grad for thresholding in activation-level:", torch.sum(train_grad_from_thrd == 0) / train_grad_from_thrd.numel())
print("Sparsity of the training grad for thresholding in gradient-level:", torch.sum(train_grad_thrd == 0) / train_grad_thrd.numel())

Correlation between the two scores: 0.01001503759398496
Sparsity of the input for thresholding in activation-level: tensor(8.4375e-05)
Sparsity of the training grad for thresholding in activation-level: tensor(0.)
Sparsity of the training grad for thresholding in gradient-level: tensor(2.4414e-06)


In [6]:
diff = score_from_thrd - score_thrd
print(diff)

tensor([[  5906.5635,   1171.7708,   6660.6597,  ..., -16374.9180,
          17444.0156,  12115.5742],
        [    40.4005,  20297.1367,  -5061.6709,  ..., -18491.5566,
         -14674.6328,  18215.2559],
        [ 10681.6045,  10351.2324,  -2357.1326,  ...,   6023.1509,
           2103.6333,  -3822.5930],
        ...,
        [ -9349.6045,  12001.2402,  10609.6855,  ...,   1023.2998,
           1928.4727,  20213.6289],
        [ -9814.1885,  12448.2676,  28042.7676,  ..., -17129.0312,
         -23087.0508,  16753.0430],
        [ -6885.1055, -13528.7432,  -7512.0830,  ...,  -2297.2737,
          17141.1172,  -4016.5667]])


In [10]:
if precondition:
    # investigate the condition number for two different covariance matrices
    train_grad_cov_from_thrd_cond = torch.linalg.cond(train_grad_cov_from_thrd)
    train_grad_cov_thrd_cond = torch.linalg.cond(train_grad_cov_thrd)
    print("Condition number of covariance matrix for thresholding in activation-level: ", train_grad_cov_from_thrd_cond)
    print("Condition number of covariance matrix for thresholding in gradient-level: ", train_grad_cov_thrd_cond)

    # investigate the condition number for two different covariance matrices
    train_grad_cov_from_thrd_inv_cond = torch.linalg.cond(train_grad_cov_from_thrd_inv)
    train_grad_cov_thrd_inv_cond = torch.linalg.cond(train_grad_cov_thrd_inv)
    print("Condition number of covariance matrix inverse for thresholding in activation-level: ", train_grad_cov_from_thrd_inv_cond)
    print("Condition number of covariance matrix inverse for thresholding in gradient-level: ", train_grad_cov_thrd_inv_cond)

    # investigate the difference of two covariance matrices
    diff = torch.linalg.norm(train_grad_cov_from_thrd - train_grad_cov_thrd)
    print("Difference of covariance matrices: ", diff)

    # investigate the difference of two covariance matrices' inverses
    diff = torch.linalg.norm(train_grad_cov_from_thrd_inv - train_grad_cov_thrd_inv)
    print("Difference of covariance matrices' inverses: ", diff)

    # investigate the sparsity of two covariance matrices
    sparsity_from_thrd = torch.sum(train_grad_cov_from_thrd == 0) / train_grad_cov_from_thrd.numel()
    sparsity_thrd = torch.sum(train_grad_cov_thrd == 0) / train_grad_cov_thrd.numel()
    print("Sparsity of covariance matrix for thresholding in activation-level: ", sparsity_from_thrd)
    print("Sparsity of covariance matrix for thresholding in gradient-level: ", sparsity_thrd)

Condition number of covariance matrix for thresholding in activation-level:  tensor(4.1528e+12)
Condition number of covariance matrix for thresholding in gradient-level:  tensor(3.3611e+11)
Condition number of covariance matrix inverse for thresholding in activation-level:  tensor(1.4817e+11)
Condition number of covariance matrix inverse for thresholding in gradient-level:  tensor(6.7773e+10)
Difference of covariance matrices:  tensor(0.2116)
Difference of covariance matrices' inverses:  tensor(57798912.)
Sparsity of covariance matrix for thresholding in activation-level:  tensor(0.)
Sparsity of covariance matrix for thresholding in gradient-level:  tensor(0.)
