# runtime-sandbox
1.26.23

Taking a crack at this. 
Figured out how to get runtime and store as a variable. 
I think it makes the most sense to represent dataset size in terms of number of observations in the training set. Mb gets a bit screwy when you consider log vs linear peptide quants datasets: the same datset will require more Mbs to encode as linear vs log. And you definetly want to look at training set # of obs, as this is what the model actually imputes. And because the partition can be stochastic, and can potentially alter the number of observations by quite a bit. 

In [1]:
import pandas as pd
import numpy as np
import sys
import os
import time
import torch
from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt

# suppressing this CUDA initialization warning I always get
    # this could be dangerous
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# import my modules
sys.path.append('../../../../bin/')
from models.linear import GradNMFImputer
import util_functions
import intermediate_plots

# for missForest:
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri, r

# plotting templates
sns.set(context="talk", style="ticks") 
pal = sns.color_palette()

#### Configs

In [2]:
# partitioning params
val_frac = 0.3
test_frac = 0.0
# setting this to 0 ensures that no peptides will be filtered out
min_present = 0     # during partitioning
q_anchor=0.3  # these three for MNAR partition 
t_std=0.35
brnl_prob=0.4

# NMF model params
n_factors = 4                 # 4 is default
tolerance = 0.0001            # 0.0001 is default
max_epochs = 1000             # 1000 is default
learning_rate = 0.01          # 0.01 is default
batch_size = 64               # 64 is default
loss_func = "MSE"

# kNN params
k_neighbors = 4

# missForest impute params
n_trees = 100                 # 100 is default, according to mf manual
max_iters_mf = 10             # 10 is default, according to mf manual
r_seed = 36                   # the random seed for rpy2

# the random number generator
rng = np.random.default_rng(seed=18)

# the random state for the partition
split_rand_state = 18

#### Read in peptide quants dataset

In [3]:
full_path = "/net/noble/vol2/home/lincolnh/code/2021_ljharris_ms-impute/data/peptides-data/"
df = "PXD006109_peptides.csv"
pxd = "PXD006109"

quants_raw = pd.read_csv(full_path + df)
quants_raw[quants_raw == 0] = np.nan

quants = np.array(quants_raw)

#### Partition

In [4]:
# MCAR partition 
# train, val, test = util_functions.split(
#                                     quants, 
#                                     val_frac=val_frac,
#                                     test_frac=test_frac, 
#                                     min_present=min_present,
#                                     random_state=split_rand_state,
# )
# MNAR partition 
train, val = util_functions.MNAR_partition_thresholds_matrix(
                                    quants, 
                                    q_anchor=q_anchor, 
                                    t_std=t_std, 
                                    brnl_prob=brnl_prob, 
                                    min_pres=min_present,
                                    rand_state=split_rand_state,
)

#### Get the missingness fractions of each set

In [5]:
orig_mv_frac = np.count_nonzero(np.isnan(quants)) / quants.size
train_mv_frac = np.count_nonzero(np.isnan(train)) / train.size
val_mv_frac = np.count_nonzero(np.isnan(val)) / val.size

print("mv frac original: ", np.around(orig_mv_frac, decimals=3))
print("mv frac train: ", np.around(train_mv_frac, decimals=3))
print("mv frac validation: ", np.around(val_mv_frac, decimals=3))

mv frac original:  0.165
mv frac train:  0.382
mv frac validation:  0.783


#### Get the optimal number of batches for training (NMF)

In [6]:
if len(~np.isnan(train)) > 100:
    n_batches = int(np.floor(len(~np.isnan(train)) / batch_size))
    # setting the minimum n_batches to 100
    n_batches = max(n_batches, 100) 
else: 
    n_batches = 1

print(n_batches)

595


#### How many observations are in the training set? 

In [7]:
n_obs = np.count_nonzero(~np.isnan(train))
print(n_obs)

471302


***

## Time various imputation methods

#### NMF impute

In [8]:
#%%time

# get the start time, measured since the Unix epoch
nmf_start_sec = time.time()

# init model 
nmf_model = GradNMFImputer(
                n_rows = train.shape[0], 
                n_cols = train.shape[1], 
                n_factors=n_factors, 
                stopping_tol=tolerance,
                train_batch_size=n_batches, 
                eval_batch_size=n_batches,
                n_epochs=max_epochs, 
                loss_func=loss_func,
                optimizer=torch.optim.Adam,
                optimizer_kwargs={"lr": learning_rate},
                non_negative=True,
                rand_seed=rng.random(),
)
# fit and transform
nmf_recon = nmf_model.fit_transform(train, val)

# get the elapsed time
nmf_end_sec = time.time()
nmf_sec_elapsed = nmf_end_sec - nmf_start_sec

print("nmf runtime (sec): ", nmf_sec_elapsed)

  6%|â–‹         | 64/1000 [01:48<26:31,  1.70s/epoch]

early stopping triggered: standard criteria





nmf runtime (sec):  109.57062029838562


#### kNN impute

In [9]:
# # %%time

# # get the start time, measured since the Unix epoch
# knn_start_sec = time.time()

# knn_model = KNNImputer(n_neighbors=k_neighbors)
# knn_recon = knn_model.fit_transform(train)

# # get the elapsed time
# knn_end_sec = time.time()
# knn_sec_elapsed = knn_end_sec - knn_start_sec

# print("knn runtime (sec): ", knn_sec_elapsed)

#### Sample min impute

In [10]:
# %%time

# get the start time, measured since the Unix epoch
min_start_sec = time.time()

col_min = np.nanmin(train, axis=0)
nan_idx = np.where(np.isnan(train))
min_recon = train.copy()
# nan_idx[1] -> take index of column
min_recon[nan_idx] = np.take(col_min, nan_idx[1])

# get the elapsed time
min_end_sec = time.time()
min_sec_elapsed = min_end_sec - min_start_sec

print("min runtime (sec): ", min_sec_elapsed)

min runtime (sec):  0.02395343780517578


#### Gaussian random sample impute

In [11]:
# %%time

# get the start time, measured since the Unix epoch
std_start_sec = time.time()

# get the column mins
col_min = np.nanmin(train, axis=0)

# get the mean and std of the entire training matrix
train_mean = np.nanmean(train)
train_sd = np.nanstd(train)

# get the indicies of the MVs 
nan_idx = np.where(np.isnan(train))
std_recon = train.copy()

# how many total MVs? 
n_mv = len(nan_idx[0])

# fill in the MVs with random draws 
std_recon[nan_idx] = rng.normal(
                            loc=np.mean(col_min), 
                            scale=np.std(col_min), 
                            size=n_mv
)

# don't want negative values
std_recon = np.abs(std_recon)

# get the elapsed time
std_end_sec = time.time()
std_sec_elapsed = std_end_sec - std_start_sec

print("std runtime (sec): ", std_sec_elapsed)

std runtime (sec):  0.05980038642883301


#### missForest impute

In [12]:
# %%time

# get the start time, measured since the Unix epoch
# mf_start_sec = time.time()

# set_seed = r('set.seed')
# set_seed(r_seed)

# base = importr("base")
# doParallel = importr("doParallel")
# rngtools = importr("rngtools")
# missForest = importr("missForest")

# # activate automatic conversion of NumPy arrays
# numpy2ri.activate()

# # set up parallelization
#     # not totally sure how to set this
# doParallel.registerDoParallel(cores=12)

# # run missForest
# mf_recon, err = missForest.missForest(
#                             train, 
#                             maxiter=max_iters_mf,
#                             ntree=n_trees,
#                             parallelize="forests", 
#                             verbose=True,
# )
# mf_recon = np.array(mf_recon)

# get the elapsed time
# mf_end_sec = time.time()
# mf_sec_elapsed = mf_end_sec - mf_start_sec

# print("mf runtime (sec): ", mf_sec_elapsed)

***

## Record in a dataframe

#### Init

In [15]:
rt_df = pd.DataFrame(
    columns=[
        "dataset", 
        "n observations", 
        "NMF sec", 
        "kNN sec", 
        "min sec", 
        "std sec", 
        "mf sec"]
)

toadd = {
    "dataset" : pxd, 
    "n observations" : n_obs,
    "NMF sec" : nmf_sec_elapsed,
    "kNN sec" : 0.0,
    "min sec" : min_sec_elapsed,
    "std sec" : std_sec_elapsed,
    "mf sec" : 0.0,
}

rt_df = rt_df.append(toadd, ignore_index=True)
rt_df

Unnamed: 0,dataset,n observations,NMF sec,kNN sec,min sec,std sec,mf sec
0,PXD006109,471302,109.57062,0.0,0.023953,0.0598,0.0
