# scratch
4.1.22

In [30]:
import pandas as pd
import numpy as np
import torch
from scipy.stats import ranksums

import sys
sys.path.append('../ms_imputer/')

from models.linear import GradNMFImputer
from models.scalers import StandardScaler
import util_functions

In [2]:
# Path to the peptide quantification CSV on the local machine.
# NOTE(review): this is a user-specific location; the notebook will not run
# elsewhere without editing it.
data_path = "~/Desktop/maxquant-data/PXD006348_peptides.csv"

# Read the CSV into a DataFrame; the bare name on the last line renders it
# as the cell output.
peptide_quants = pd.read_csv(filepath_or_buffer=data_path)
peptide_quants

Unnamed: 0,Intensity Bruce01,Intensity Bruce02,Intensity Bruce03,Intensity Bruce04,Intensity Bruce05,Intensity Bruce06,Intensity Bruce07,Intensity Bruce08,Intensity Bruce09,Intensity Bruce10,...,Intensity Bruce15,Intensity Bruce16,Intensity Bruce17,Intensity Bruce18,Intensity Bruce19,Intensity Bruce20,Intensity Bruce21,Intensity Bruce22,Intensity Bruce23,Intensity Bruce24
0,0,0,0,0,0,0,0,0,0,0,...,1319600,0,0,0,0,0,0,0,0,0
1,0,0,0,0,779010,0,897060,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,6887500,0,0,0,4291500,0,1172700,0,...,0,0,0,0,0,0,0,532790,0,0
3,0,0,0,0,0,0,0,0,0,0,...,131700,0,0,0,0,0,0,0,0,0
4,0,0,5550900,0,0,0,1135700,910110,865540,0,...,14378000,2087800,0,0,1014700,1111400,765150,0,715930,1217600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10357,0,0,0,0,411550,0,0,0,0,0,...,0,0,0,0,847130,4724200,406120,652830,0,0
10358,0,0,0,0,0,0,1450700,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10359,0,0,0,0,0,645060,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10360,0,675330,0,0,0,5154200,0,0,0,0,...,0,0,0,0,0,1383800,0,0,0,0


In [3]:
# Keep only the first 60 rows so the fit below runs quickly, then display the
# resulting (rows, columns) shape.
peptide_quants_trim = peptide_quants.iloc[:60]
peptide_quants_trim.shape

(60, 24)

In [4]:
# Total number of entries (rows x columns) in the trimmed frame.
peptide_quants_trim.size

1440

In [33]:
# Write the trimmed frame to a CSV in the working directory, omitting the
# row index so only the data columns are saved.
peptide_quants_trim.to_csv(
    path_or_buf="peptide_quants_tester.csv",
    index=False,
)

In [5]:
# training params

# Model size
n_factors = 3  # for the initial test
lf = 4         # value handed to the imputer as `n_factors`

# Optimisation
learning_rate = 5e-2  # Adam step size (optimizer_kwargs["lr"])
batch_size = 100      # used for both the train and eval batch size
max_iters = 400       # upper bound on training epochs
tolerance = 1e-4      # stopping tolerance for the imputer / early-stopping check

# Data partitioning
min_present = 2  # for partition

In [6]:
# Partition the trimmed matrix into train / validation / test parts, holding
# out 10% each for validation and test.
# NOTE(review): no seed is passed here; if `split` is randomised the
# partitions (and every number below) will vary between runs -- confirm.
train, val, test = util_functions.split(
    peptide_quants_trim,
    min_present=min_present,
    test_frac=0.1,
    val_frac=0.1,
)

In [7]:
# Instantiate the imputer with dimensions taken from the training matrix and
# the hyperparameters defined in the params cell.
# NOTE(review): the factor count comes from `lf`, not `n_factors` -- confirm
# that is intended.
model = GradNMFImputer(
    n_rows=train.shape[0],
    n_cols=train.shape[1],
    n_factors=lf,
    stopping_tol=tolerance,
    train_batch_size=batch_size,
    eval_batch_size=batch_size,
    n_epochs=max_iters,
    loss_func="MSE",
    optimizer=torch.optim.Adam,
    optimizer_kwargs=dict(lr=learning_rate),
)

# Fit on the training part (validation part supplied alongside) and keep the
# reconstructed matrix for scoring.
recon_mat = model.fit_transform(train, val)

 38%|███▊      | 151/400 [00:00<00:00, 486.73epoch/s]


In [8]:
# Score the reconstruction against each partition in turn
# (train, then validation, then test).
train_err, val_err, test_err = (
    util_functions.mse_func_np(partition, recon_mat)
    for partition in (train, val, test)
)

# One error per line, in the same train / val / test order.
print(train_err, val_err, test_err, sep="\n")

0.00418483052507837
10914251091912.834
13620971547812.652


In [9]:
# Upper bounds the reconstruction errors must stay under for this smoke test.
train_err_tol = 1e-1
test_err_tol = 1e15
# Validation error is held to the same bound as test error; naming it makes
# that choice explicit instead of silently reusing `test_err_tol`.
val_err_tol = test_err_tol

# Each assert carries a message so a failure reports the offending value
# rather than a bare AssertionError.
assert train_err < train_err_tol, (
    f"train error {train_err} is not below tolerance {train_err_tol}"
)
assert val_err < val_err_tol, (
    f"validation error {val_err} is not below tolerance {val_err_tol}"
)
assert test_err < test_err_tol, (
    f"test error {test_err} is not below tolerance {test_err_tol}"
)

***

In [10]:
# Display the per-epoch record kept on the fitted model (epoch number plus
# train and validation MSE columns).
model.history

Unnamed: 0,epoch,Train MSE,Validation MSE,a,b,c
0,0,9.934776,10.858836,0.0,0.0,1.0
1,1,8.122000,8.825942,0.0,0.0,1.0
2,2,6.653830,7.173163,0.0,0.0,1.0
3,3,5.471106,5.828377,0.0,0.0,1.0
4,4,4.530153,4.748900,0.0,0.0,1.0
...,...,...,...,...,...,...
148,148,0.026129,0.169882,0.0,0.0,1.0
149,149,0.025954,0.170082,0.0,0.0,1.0
150,150,0.025781,0.170296,0.0,0.0,1.0
151,151,0.025611,0.170522,0.0,0.0,1.0


In [11]:
# Display which early-stopping criterion the model reports; the check in the
# next cell branches on this value.
model.early_stopping

'wilcoxon'

In [32]:
# Verify that the recorded validation losses are consistent with the
# early-stopping criterion the model reports having used.
if model.early_stopping == "wilcoxon":
    # Two 15-epoch windows of validation loss: the most recent epochs, and an
    # earlier window ending 20 epochs before the end of the history.
    window2 = np.array(model.history["Validation MSE"][-15:])
    window1 = np.array(model.history["Validation MSE"][-35:-20])

    # One-sided rank-sum test; index 1 of the result is the p-value for
    # "recent losses are greater than the earlier ones".
    wilcoxon_p = ranksums(window2, window1, alternative="greater")[1]

    assert wilcoxon_p < 0.05, (
        f"rank-sum p-value {wilcoxon_p} is not below 0.05; validation loss "
        "was not significantly increasing when training stopped"
    )

# The two modes are mutually exclusive, hence `elif`.
elif model.early_stopping == "standard":
    stopping_counter = 0
    best_loss = np.min(model.history["Validation MSE"])

    # Count how many consecutive trailing epochs sit within `tolerance`
    # (relative) of the best validation loss; any miss resets the streak.
    for val_loss in model.history["Validation MSE"][-10:]:
        tol = np.abs((best_loss - val_loss) / best_loss)

        if tol < tolerance:
            stopping_counter += 1
        else:
            stopping_counter = 0

    # assuming that patience == 10
    assert stopping_counter == 10, (
        f"only {stopping_counter} of the last 10 epochs were within "
        f"relative tolerance {tolerance} of the best validation loss"
    )

In [38]:
# Number of epochs recorded in the training history.
len(model.history["epoch"])

153