# scratch-tester
3.31.22

In [1]:
import unittest
import pytest
import pandas as pd 
import numpy as np
import torch
from scipy.stats import ranksums

import sys
sys.path.append('../ms_imputer/')

from models.linear import GradNMFImputer
import util_functions

  from .autonotebook import tqdm as notebook_tqdm


#### Set configs

In [2]:
# --- Simulated matrix settings ---
# Seeded generator shared by the notebook so simulated data is repeatable.
rng = np.random.default_rng(seed=42)
# Layout is (n_rows, n_cols, rank).
matrix_shape = (12, 10, 3)

# --- Training hyper-parameters (passed to GradNMFImputer below) ---
n_factors = 4
tolerance = 1e-4
batch_size = 100
max_iters = 400
learning_rate = 5e-2
PXD = "tester"

# --- Error thresholds used by the assertion cells ---
train_err_tol = 1e-8
test_err_tol = 0.1

#### Define some helper functions

In [3]:
def simulate_matrix(matrix_shape, random_state=None):
    """ 
    Init simulated matrix of known size and rank

    The matrix is the product of two factors drawn from a uniform
    distribution on [0, 1): W (n_rows x rank) and H (rank x n_cols).

    Parameters
    ----------
    matrix_shape: tuple, (x,y,z) where x=n_rows, y=n_cols
                  and z=rank
    random_state: np.random.Generator, optional. Generator used for
                  the uniform draws. When omitted, the notebook-level
                  `rng` is used, which matches the previous behavior.
    Returns
    -------
    X: np.ndarray, the simulated matrix, shape (n_rows, n_cols)

    Raises
    ------
    AssertionError: if the numerical rank of X differs from the
                    requested rank
    """
    n_rows = matrix_shape[0]
    n_cols = matrix_shape[1]
    rank = matrix_shape[2]

    # Only fall back to the shared notebook generator when the caller
    # did not supply one; W is drawn before H to keep the draw order.
    generator = rng if random_state is None else random_state

    W = generator.uniform(size=(n_rows, rank))
    H = generator.uniform(size=(rank, n_cols))
    X = W @ H

    observed_rank = np.linalg.matrix_rank(X)
    assert observed_rank == rank, (
        f"simulated matrix has rank {observed_rank}, expected {rank}"
    )

    return X

#### Set up

In [4]:
# Simulate the ground-truth matrix and keep a CSV copy next to the notebook.
matrix = simulate_matrix(matrix_shape)
pd.DataFrame(data=matrix).to_csv("simulated.csv", index=False)

# Partition into train / validation / test views of the same matrix.
# NOTE(review): presumably held-out entries are masked with NaN (the later
# cells count NaNs per split) -- confirm against util_functions.split.
train, val, test = util_functions.split(
    matrix, val_frac=0.1, test_frac=0.1, min_present=1
)

#### Fit

In [5]:
# Build the imputer from the config-cell hyper-parameters; the same batch
# size is used for both training and evaluation.
nmf_model = GradNMFImputer(
    n_rows=train.shape[0],
    n_cols=train.shape[1],
    n_factors=n_factors,
    stopping_tol=tolerance,
    train_batch_size=batch_size,
    eval_batch_size=batch_size,
    n_epochs=max_iters,
    loss_func="MSE",
    optimizer=torch.optim.Adam,
    optimizer_kwargs=dict(lr=learning_rate),
)

# Fit on the training split (validation split passed alongside) and keep
# the reconstructed matrix for the assertion cells below.
recon = nmf_model.fit_transform(train, val)

100%|██████████| 400/400 [00:00<00:00, 421.72epoch/s]


#### Run some tests

In [61]:
# Reconstruction error of `recon` against each split.
train_err = util_functions.mse_func_np(train, recon)
val_err = util_functions.mse_func_np(val, recon)
test_err = util_functions.mse_func_np(test, recon)

# make sure error tolerances of predictions for all three sets are reasonable
# (validation is held out too, so it shares the looser test tolerance)
assert train_err < train_err_tol, f"train MSE too high: {train_err}"
assert val_err < test_err_tol, f"validation MSE too high: {val_err}"
assert test_err < test_err_tol, f"test MSE too high: {test_err}"

# make sure shape of reconstructed matrix is correct
assert recon.shape == train.shape

# make sure model isn't imputing a bunch of zeros
# (the previous version counted NaNs here, which never looked at zeros)
imputed_zero_cts = np.count_nonzero(recon == 0)
assert imputed_zero_cts < train.size * 0.1, (
    f"{imputed_zero_cts} zeros in the reconstruction"
)

# make sure model isn't leaving a bunch of NaNs in the reconstruction
imputed_nan_cts = np.count_nonzero(np.isnan(recon))
assert imputed_nan_cts < train.size * 0.1, (
    f"{imputed_nan_cts} NaNs in the reconstruction"
)

# make sure model isn't imputing extreme values
assert recon.max() < matrix.max() * 2
assert recon.min() > matrix.min() / 2

# make sure all predictions are positive
assert np.count_nonzero(recon < 0) == 0

# make sure recon matrix isn't just the input matrix 
assert not np.array_equal(matrix, recon)

n_epochs = nmf_model.history.shape[0] - 1

# make sure that validation and training loss have decreased: a one-sided
# rank-sum test checks that the first 15 recorded losses are stochastically
# greater than the last 15.
# NOTE(review): when the history has fewer than 30 rows the two windows
# overlap, which weakens the test -- consider requiring n_epochs >= 30.
if n_epochs > 15:
    early_val = np.array(nmf_model.history["Validation MSE"][0:15])
    late_val = np.array(nmf_model.history["Validation MSE"][-15:])

    val_wilcoxon_p = ranksums(early_val, late_val, alternative="greater").pvalue

    assert val_wilcoxon_p < 0.05, (
        f"validation loss did not decrease (p={val_wilcoxon_p})"
    )

    early_train = np.array(nmf_model.history["Train MSE"][0:15])
    late_train = np.array(nmf_model.history["Train MSE"][-15:])

    train_wilcoxon_p = ranksums(early_train, late_train, alternative="greater").pvalue

    assert train_wilcoxon_p < 0.05, (
        f"training loss did not decrease (p={train_wilcoxon_p})"
    )

In [92]:
# Number of NaN entries in each split.
train_nans, val_nans, test_nans = (
    np.count_nonzero(np.isnan(part)) for part in (train, val, test)
)

# Fraction of entries in each split that are not NaN.
train_frac, val_frac, test_frac = (
    (part.size - n_missing) / part.size
    for part, n_missing in zip((train, val, test), (train_nans, val_nans, test_nans))
)

# The training split should keep most of the entries; the two held-out
# splits should each keep a small, non-empty share.
assert 0.7 < train_frac < 0.9
assert 0.0 < val_frac < 0.25
assert 0.0 < test_frac < 0.25

# Validation and test shares should be roughly the same size.
assert np.isclose(val_frac, test_frac, atol=0.1)

***

In [110]:
# Scratch: draw a fresh pair of factors by hand so W and H can be
# inspected in the cells below (same recipe as simulate_matrix).
W = rng.uniform(size=(matrix_shape[0], matrix_shape[2]))  # n_rows x rank
H = rng.uniform(size=(matrix_shape[2], matrix_shape[1]))  # rank x n_cols
X = np.matmul(W, H)

assert matrix_shape[2] == np.linalg.matrix_rank(X)

X

array([[0.24427432, 0.7308385 , 0.49049565, 0.67648185, 0.49200386,
        0.37537419, 0.18842389, 0.71352224, 0.43766731, 0.33919418],
       [0.21626156, 0.71741705, 0.39502737, 0.69870411, 0.4937684 ,
        0.33232062, 0.17705923, 0.76886266, 0.39407602, 0.60576079],
       [0.30339291, 0.98684314, 0.70836877, 0.84099321, 0.58016597,
        0.40906874, 0.18937264, 0.95129726, 0.5355505 , 0.37865934],
       [0.22053357, 0.71799566, 0.29857245, 0.78019864, 0.56748027,
        0.38349717, 0.22242682, 0.82712338, 0.41265393, 0.86008498],
       [0.44403447, 1.27013807, 0.60000602, 1.39899408, 1.06404218,
        0.8116424 , 0.46110182, 1.38490508, 0.82508262, 1.23670631],
       [0.3712009 , 0.93478566, 0.46731922, 1.07111933, 0.85514286,
        0.72042453, 0.40817006, 0.97556867, 0.68917356, 0.7656962 ],
       [0.09458834, 0.37453185, 0.06366545, 0.45025723, 0.31678732,
        0.17536767, 0.11579239, 0.50147834, 0.18617782, 0.73138256],
       [0.21445184, 0.7493904 , 0.4090463

In [113]:
# Scratch: view the row factors scaled up by 1e4.
1e4 * W

array([[5592.07160745, 3039.50098063,  308.17834568],
       [4367.17389232, 2145.8467282 , 4085.28643725],
       [8534.03073268, 2339.39485865,  583.02741689],
       [2813.83892022, 2935.93757767, 6619.16514727],
       [5570.32152341, 7838.98209106, 6643.13540327],
       [4063.8686144 , 8140.20384666, 1669.72919908],
       [ 227.12073134,  900.47860776, 7223.59350596],
       [4618.77230251, 1612.71779034, 5010.44775103],
       [1523.12102713, 6963.20375078, 4461.56275574],
       [3810.21226096, 3015.12089148, 6302.82593119],
       [3618.12610553,  876.49919316, 1180.05902121],
       [9618.9766455 , 9085.80690708, 6997.07133811]])

In [114]:
# Scratch: view the column factors scaled up by 1e4.
1e4 * H

array([[2658.6996146 , 9691.76377348, 7787.50903966, 7168.90189159,
        4493.61502144, 2722.41561845,  963.90962153, 9026.02396544,
        4557.76289834, 2023.63364795],
       [3059.56624151, 5792.19568942, 1767.72782939, 8566.14284092,
        7585.19529835, 7194.62955951, 4320.93039775, 6273.08840702,
        5840.97968913, 6498.46601555],
       [ 844.4432114 , 4158.07402171,  416.14173862, 4939.90819245,
        3298.61212333, 1445.24188866, 1034.02967723, 5876.44572178,
        1705.92968537, 9251.20118377]])

In [115]:
# Scratch: single standard-normal draw. Uses the notebook's seeded `rng`
# instead of the unseeded legacy global state so a fresh run is repeatable.
rng.normal(loc=0.0, scale=1.0, size=None)

1.9887575322546194

In [118]:
# Scratch: ten normal draws centered on 1e7 with standard deviation 1e3.
# Uses the notebook's seeded `rng` instead of the unseeded legacy global
# state so a fresh run is repeatable.
rng.normal(loc=1e7, scale=1e3, size=10)

array([ 9998093.58166751, 10000972.86996854, 10000212.93663829,
       10001967.2250023 , 10001809.25294179, 10000841.21079496,
       10001305.84873391,  9999864.46167278, 10001087.41600888,
       10001913.42551018])