In [51]:
import gc
import joblib
import func.data as dt
import h5py
import hdf5plugin

import sklearn as sk
import pandas as pd
import numpy as np
import pathlib as pl

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error as mse

from sklearn import datasets
from sklearn.linear_model import Lasso

raw_data_dir = pl.Path("D:/Data/open-problems-multimodal/raw")
pro_data_dir = pl.Path("D:/Data/open-problems-multimodal/processed")
method_dir = pl.Path("D:/OneDrive/OneDrive - UW-Madison/Kris/Code/SingleCell-Integration/methods")

data_name = "multi"
method_name = "lasso"

In [52]:
train_input = dt.readH5pyFile(raw_data_dir / f"train_{data_name}_inputs.h5")
train_target = dt.readH5pyFile(raw_data_dir / f"train_{data_name}_targets.h5")

MemoryError: Unable to allocate 90.4 GiB for an array with shape (105942, 228942) and data type float32

In [None]:
train_input_val = train_input['block0_values']
train_target_val = train_target['block0_values']

In [None]:
print(train_input_val)
print(train_target_val)
print(train_input_val.shape)
print(train_target_val.shape)

[[0.        0.        0.        ... 0.        4.090185  0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 3.8473213 3.8473213 0.       ]
 ...
 [0.        0.        0.        ... 4.497696  0.        0.       ]
 [0.        0.        0.        ... 0.        0.        3.9009068]
 [0.        0.        0.        ... 0.        4.1895022 4.1895022]]
[[ 1.1678035   0.62253     0.10695851 ...  0.41429165  1.780314
   0.5480701 ]
 [ 0.81897014  0.50600946  1.078682   ...  0.9823084   2.736507
   2.1840627 ]
 [-0.3567033  -0.42226133 -0.82449275 ... -1.182975    3.958148
   2.8686    ]
 ...
 [ 1.2213128   0.47656643  1.4375515  ...  0.98188305  6.911032
   3.4153101 ]
 [-0.15143251 -0.850024    0.46155566 ...  1.0102471   1.8648046
   3.4492888 ]
 [-0.4392985  -0.33832696  0.80367655 ...  0.6188518   7.220644
   3.2343602 ]]
(70988, 22050)
(70988, 140)


In [None]:
# Run standardization and PCA
p = 500

train_sdsc = StandardScaler()
train_input_sdsc = train_sdsc.fit_transform(train_input_val)

train_pca = PCA(n_components=p)
X_train = train_pca.fit_transform(train_input_sdsc)

# X_train = pd.DataFrame(train_pca.transform(train_input_sdsc))

In [None]:
# save intermediate results

joblib.dump(train_sdsc, method_dir / f"train_{data_name}_sdsc.m")
joblib.dump(train_pca, method_dir / f"train_{data_name}_pca.m")

np.savetxt(pro_data_dir / f"train_{data_name}_pca.csv", X_train, delimiter=",")
# X_train.to_csv(data_dir / f"train_{data_name}_pca.csv")

In [None]:
y_train = train_target_val
print(X_train.shape)
print(y_train.shape)

(70988, 500)
(70988, 140)


In [None]:
param_grid = [{'alpha': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]}]
model = Lasso()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Lasso(),
             param_grid=[{'alpha': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]}],
             scoring='neg_mean_squared_error')

In [None]:
best_model = grid_search.best_estimator_
joblib.dump(best_model, method_dir / f"{data_name}_{method_name}.m")

['D:\\Data\\open-problems-multimodal\\raw\\cite_lasso.m']

In [None]:
print(best_model)
print(grid_search.cv_results_)

Lasso(alpha=0.01)
{'mean_fit_time': array([19.79302073, 14.89189587, 15.05706944, 14.04256368, 12.72947178,
       11.99321351]), 'std_fit_time': array([1.16901937, 0.21752129, 0.74390683, 0.34442715, 0.22871756,
       0.30382576]), 'mean_score_time': array([0.04976168, 0.02622051, 0.02902412, 0.03018684, 0.02810974,
       0.02840638]), 'std_score_time': array([0.04185741, 0.00235663, 0.00263576, 0.00174985, 0.00150136,
       0.00135673]), 'param_alpha': masked_array(data=[0.01, 0.05, 0.1, 0.2, 0.5, 1.0],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01}, {'alpha': 0.05}, {'alpha': 0.1}, {'alpha': 0.2}, {'alpha': 0.5}, {'alpha': 1.0}], 'split0_test_score': array([-2.32549405, -2.32505178, -2.33663487, -2.37575936, -2.49759436,
       -2.65978646]), 'split1_test_score': array([-2.76539707, -2.79367328, -2.83223963, -2.89754891, -3.04684067,
       -3.22223663]), 'split2_test_score': array([-3.369

In [None]:
# Make prediction

test_input = dt.readH5pyFile(raw_data_dir / f"test_{data_name}_inputs.h5")

In [None]:
test_input_val = test_input["block0_values"]
print(test_input_val.shape)

In [None]:
X_test = train_pca.transform(train_sdsc.transform(test_input_val))
y_pred = best_model.predict(X_test)

In [None]:
print(y_pred.shape)

(48663, 140)


In [47]:
np.savetxt(pro_data_dir / f"test_{data_name}_{method_name}_pred.csv", y_pred, delimiter=",")