In [1]:
%config Completer.use_jedi = False

import numpy as np
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import time

#import torch
from hummingbird.ml import convert, load #https://github.com/microsoft/hummingbird
# hummingbird is useful when you have a trained model and need to speed up the prediction step;
# it does not support NMF

from utils import load_file
from utils import DataResize
from utils import DeltaF
from utils import NMF_CV, NMF_CV_loop

### User inputs

In [None]:
# data address: location of the .h5 file that the loading cell passes to load_file
# NOTE(review): absolute, machine-specific path -- edit before running on another machine
h5_add = 'I:/P6 Project/Analyses/Triple mutation/210127_P6_scn2a_triplet/h5/run3_LED17mA-1h56min small obj.h5'

### Data loading

In [None]:
# read the dataset stored under the given key of the file at `h5_add`;
# `working_data` is the name later cells read as the latest pipeline stage
data = load_file(
    add=h5_add,
    key_name='GroupHierarchy.Groups.Datasets',
)
working_data = data

### Preprocessing

In [None]:
# resize the working data with dim=(128, 128) and make the result
# the new working data (both names refer to the same object)
working_data = data_resized = DataResize(data=working_data, dim=(128, 128)).transform()
print(f'new data size: {working_data.shape}')

In [None]:
# feature selection

# (1) inspect the variance of the video: variance taken along axis 0,
#     drawn as an image with a colour scale next to it
variance_map = np.var(working_data, axis=0)
plt.imshow(variance_map)
plt.colorbar()

In [None]:
from sklearn.feature_selection import VarianceThreshold

# flatten the two trailing axes so every (p1, p2) position becomes one feature column
n, p1, p2 = working_data.shape
flat_data = working_data.reshape(n, p1 * p2)

# drop low-variance features using the user-given threshold;
# fit_transform fits the selector and applies it to the same matrix in one call
var_feature = VarianceThreshold(threshold=10000)
feature_selected = var_feature.fit_transform(flat_data)
working_data = feature_selected

print(f'selected feature shape for given threshold: {feature_selected.shape}')

In [None]:
# undo the feature selection, then restore the (n, p1, p2) tensor layout
# recorded by the feature-selection cell, in one chained call
data_rec = var_feature.inverse_transform(working_data).reshape(n, p1, p2)
working_data = data_rec

print(f'reconstructed data size: {data_rec.shape} \n\n\n')
print('HINT: PLEASE NOTICE THAT, X with columns of zeros inserted where features would have been removed by transform')

In [None]:
# if you want to get selected indices
# var_feature.get_support()

In [None]:
# calculate deltaF/F0 on the current working data
#
# `run_on_matrix` is fed a 2-D matrix here. The shape is validated without
# rebinding `n` / `p1`, because the reconstruction cell still needs the values
# set by the feature-selection cell. A 3-D tensor (e.g. the output of the
# reconstruction cell) is rejected with an explicit message.
# NOTE(review): prct=20 is presumably the percentile used for the F0 baseline
# -- confirm against utils.DeltaF.
if np.ndim(working_data) != 2:
    raise ValueError(
        f'DeltaF expects a 2d matrix, got data with shape {np.shape(working_data)}'
    )
data_deltaf = DeltaF(data=working_data, prct=20).run_on_matrix()
working_data = data_deltaf

In [None]:
# run NMF_CV normal: rank_range = 5, 10, ..., 75 and replicates = 6
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# documented replacement for measuring elapsed time
start_time = time.perf_counter()
train_error, test_error = NMF_CV_loop(data=data_deltaf, rank_range=np.arange(5, 80, 5), replicates=6)
print(f'execution time: {np.rint(time.perf_counter() - start_time)} seconds')

0it [00:00, ?it/s]

In [None]:
# run in parallel with shared memory
# time.clock() was removed in Python 3.8; time.perf_counter() replaces it
start_time = time.perf_counter()
# range(70, 75, 5) yields the single rank 70, so this submits 10 tasks (j = 0..9)
# NOTE(review): the loop index j is passed as `replicates` -- confirm whether
# NMF_CV expects a replicate count or a replicate index
nmf_cv_results = Parallel(n_jobs=8, verbose=1, require='sharedmem')(
    delayed(NMF_CV)(data=data_deltaf, rank=i, replicates=j)
    for j in range(10)
    for i in range(70, 75, 5)
)
print(f'execution time: {np.rint(time.perf_counter() - start_time)} seconds')

In [None]:
# run in parallel (process-based loky backend)
# time.clock() was removed in Python 3.8; time.perf_counter() replaces it
start_time = time.perf_counter()
# `data_deltaf` is the deltaF/F0 matrix computed above; the previous `fp` was
# never defined in this notebook and raised NameError on a fresh kernel
# range(70, 75, 5) yields the single rank 70, so this submits one task (j = 0)
# NOTE(review): the loop index j is passed as `replicates` -- confirm whether
# NMF_CV expects a replicate count or a replicate index
nmf_cv_results = Parallel(n_jobs=4, verbose=5, backend='loky')(
    delayed(NMF_CV)(data=data_deltaf, rank=i, replicates=j)
    for j in range(1)
    for i in range(70, 75, 5)
)
print(f'execution time: {np.rint(time.perf_counter() - start_time)} seconds')

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [6]:
# load the array stored in test_data.npy (path relative to the working directory)
# NOTE(review): `data_delta_f` is a different name from the `data_deltaf` computed
# above -- confirm which one the following cells are meant to use
data_delta_f = np.load("test_data.npy")
# last expression of the cell: displays the shape of the loaded array
data_delta_f.shape

(48000, 3130)