# test scaler scratch
4.1.22

In [11]:
import unittest
import pytest
import pandas as pd 
import numpy as np
import torch

import sys
sys.path.append('../ms_imputer/')

from models.linear import GradNMFImputer
from models.scalers import StandardScaler
import util_functions

#### Set configs

In [12]:
# --- simulated matrix settings ---
rng = np.random.default_rng(42)  # NumPy Generator seeded with 42
matrix_shape = (12, 10, 3)  # (n_rows, n_cols, rank)

# --- training hyperparameters ---
n_factors = 4
tolerance = 1e-4
batch_size = 100
max_iters = 400
learning_rate = 5e-2
PXD = "tester"

# --- error assessment thresholds ---
train_err_tol = 1e-8
test_err_tol = 0.1

#### Some helper funcs

In [13]:
def simulate_matrix_realistic(matrix_shape, rng=None):
    """
    Init a simulated, non-negative matrix of known size and
    (approximate) rank.

    The matrix is built as X = W @ H, where the entries of W and H are
    the absolute values of normal draws. The values of quants_mean and
    quants_std were derived from a real peptide quants matrix, and
    should allow us to generate a matrix that more accurately simulates
    a real peptide quants dataset.

    Because X is the product of an (n_rows, rank) factor and a
    (rank, n_cols) factor, its rank is at most `rank`. Exact equality
    with `rank` is not asserted here, as that check won't necessarily
    pass.

    Parameters
    ----------
    matrix_shape: tuple, (x,y,z) where x=n_rows, y=n_cols
                    and z=rank
    rng : optional, a np.random.Generator (or any object exposing a
            compatible `normal(loc, scale, size)` method) to draw
            from. When None, the global `np.random` state is used.
    Returns
    -------
    X : np.ndarray, the simulated matrix, of shape (n_rows, n_cols)

    Raises
    ------
    ValueError / TypeError if matrix_shape cannot be unpacked into
    exactly three values.
    """
    quants_mean = 102161962.5
    quants_std = 978349975.6

    # Honor the caller's requested shape instead of a hard-coded one.
    n_rows, n_cols, rank = matrix_shape

    # Fall back to the legacy global random state when no generator is given.
    sampler = np.random if rng is None else rng

    # W is drawn before H so the draw order is fixed for a given generator.
    W = np.abs(sampler.normal(loc=quants_mean, scale=quants_std, size=(n_rows, rank)))
    H = np.abs(sampler.normal(loc=quants_mean, scale=quants_std, size=(rank, n_cols)))

    return W @ H

#### Set up simulated matrix

In [20]:
# Build the simulated matrix from the configured shape.
matrix = simulate_matrix_realistic(matrix_shape)

# Partition the matrix into train / validation / test parts
# via the project's util_functions.split helper.
train, val, test = util_functions.split(
    matrix, val_frac=0.1, test_frac=0.1, min_present=2
)

# Wrap each partition in a torch tensor.
train_tens, val_tens, test_tens = map(torch.tensor, (train, val, test))

#### Set up scaler

In [45]:
# Scale the training tensor with the project's StandardScaler.
scaler = StandardScaler()
std_scaled = scaler.fit_transform(train_tens)

# Reference computation: divide by the NaN-aware standard deviation
# taken over all entries of the training matrix.
scale_factor = np.nanstd(train)
manual_scaled = torch.tensor(train / scale_factor)

# Keep only the non-NaN entries of each result.
std_scaled_nonmissing = std_scaled[torch.isnan(std_scaled).logical_not()]
manual_scaled_nonmissing = manual_scaled[torch.isnan(manual_scaled).logical_not()]

# The two scalings must agree elementwise within an absolute tolerance of 0.1.
assert np.allclose(std_scaled_nonmissing, manual_scaled_nonmissing, atol=0.1)

In [47]:
std_scaled_nonmissing

tensor([1.3999, 2.1038, 1.1946, 2.1723, 5.3817, 0.4495, 1.1685, 4.0401, 1.9148,
        0.7546, 5.0880, 1.1272, 1.6801, 0.8480, 0.6123, 0.6303, 1.4315, 0.2512,
        1.7962, 0.3307, 0.6018, 1.1150, 1.3057, 1.3453, 1.0597, 1.3224, 3.1385,
        0.6977, 1.0280, 2.2642, 2.0440, 1.5769, 1.2355, 2.6857, 0.9784, 3.1620,
        1.0839, 0.9546, 0.3114, 0.6949, 0.3451, 1.1183, 0.1415, 0.3949, 0.3951,
        1.5239, 1.5118, 1.2330, 1.4709, 0.7735, 1.1382, 2.4765, 0.8257, 0.3560,
        0.6086, 0.3535, 0.6992, 0.2715, 1.2039, 0.4156, 0.5560, 2.1905, 0.9536,
        1.6510, 0.7859, 1.4389, 1.2083, 2.0876, 0.3759, 0.7544, 0.8396, 1.2048,
        0.3748, 0.8839, 0.3114, 0.4715, 0.5564, 1.0964, 0.3994, 0.2991, 0.9331,
        1.2541, 0.7724, 3.2692, 0.1938, 3.3336, 2.5300, 1.9891, 1.0850, 1.4942,
        1.0742, 2.2772, 0.6999, 3.2513, 0.5502, 1.7449], dtype=torch.float64)