In [2]:
import os
base_dir = "/home/skovtun/Python_projects/Kaggle/Single_cell"
data_dir = os.path.join(base_dir, "data")
os.chdir(data_dir)

In [3]:
import tables

with tables.open_file("train_multi_inputs.h5", "r") as f:
    in_axis1 = f.get_node("/train_multi_inputs/axis1")[:]

with tables.open_file("train_multi_targets.h5", "r") as f:
    y_axis1 = f.get_node("/train_multi_targets/axis1")[:]

print(in_axis1.shape, y_axis1.shape)
print("Same row ids and order:", (in_axis1 == y_axis1).all())


(105942,) (105942,)
Same row ids and order: True


In [3]:
import numpy as np
import tables
from scipy.sparse import csr_matrix, vstack, save_npz

batch = 1024  
blocks = []
with tables.open_file("train_multi_inputs.h5", "r") as f:
    peak_names_bytes = f.get_node("/train_multi_inputs/block0_items")[:]  
    cell_ids_bytes   = f.get_node("/train_multi_inputs/axis1")[:]  
    values = f.get_node("/train_multi_inputs/block0_values")
    n_cells, n_peaks = values.shape
    print("ATAC values shape:", (n_cells, n_peaks), "dtype:", values.dtype)
    np.save("train_multi_peak_names.npy", peak_names_bytes)
    np.save("train_multi_cell_ids.npy", cell_ids_bytes)

    for start in range(0, n_cells, batch):
        end = min(n_cells, start + batch)
        Xb = values[start:end, :]      
        Xb_csr = csr_matrix(Xb)        
        blocks.append(Xb_csr)
        if (start // batch) % 50 == 0:
            print(f"rows {start}:{end}  batch nnz={Xb_csr.nnz}")

X_csr = vstack(blocks, format="csr")
print("Final CSR shape:", X_csr.shape, "nnz:", X_csr.nnz, "dtype:", X_csr.dtype)

save_npz("train_multi_cell.npz", X_csr)
print("Saved CSR:", "train_multi_cell.npz")

# -------- quick sanity check: decode a few names (optional) --------
print("Example peak names:", peak_names_bytes[:3].astype("U"))
print("Example cell ids  :", cell_ids_bytes[:3].astype("U"))


ATAC values shape: (np.int64(105942), np.int64(228942)) dtype: float32
rows 0:1024  batch nnz=5147553
rows 51200:52224  batch nnz=7796845
rows 102400:103424  batch nnz=4986962
Final CSR shape: (105942, 228942) nnz: 607301546 dtype: float32
Saved CSR: train_multi_cell_ids.npy
Example peak names: ['GL000194.1:114519-115365' 'GL000194.1:55758-56597'
 'GL000194.1:58217-58957']
Example cell ids  : ['56390cf1b95e' 'fc0c60183c33' '9b4a87e22ad0']


In [6]:
#TruncatedSVD
import time
from sklearn.decomposition import TruncatedSVD
start_time = time.time()
svd_multi = TruncatedSVD(n_components=500, n_iter=7, random_state=42)
X_csr_500 = svd_multi.fit_transform(X_csr)
print(svd_multi.explained_variance_ratio_.sum())
print((start_time - time.time())/60)

0.021796776
-34.68766652743022


In [10]:
#TruncatedSVD
import time
from sklearn.decomposition import TruncatedSVD
start_time = time.time()
svd_multi = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
X_csr_1000 = svd_multi.fit_transform(X_csr)
print(svd_multi.explained_variance_ratio_.sum())
print((time.time() - start_time)/60)

0.035892006
58.249768761793774


In [11]:

ev = svd_multi.explained_variance_ratio_
print(ev[:500].sum(), ev[:800].sum(), ev[:1000].sum())


0.022117399 0.030495284 0.035892006


In [32]:
os.chdir(data_dir)
np.save("X_csr_1000.npy", X_csr_1000)

In [18]:
import numpy as np
import tables
from sklearn.decomposition import IncrementalPCA

batch = 1024

ipca = IncrementalPCA(n_components=300, batch_size=batch)

with tables.open_file("train_multi_targets.h5", "r") as f:
    gene_names_bytes = f.get_node("/train_multi_targets/block0_items")[:]
    cell_ids_bytes   = f.get_node("/train_multi_targets/axis1")[:]
    Y = f.get_node("/train_multi_targets/block0_values")
    n_cells, n_genes = Y.shape
    print("RNA gene expression shape:", (n_cells, n_genes), "dtype:", Y.dtype)

    np.save("train_multi_genes.npy", gene_names_bytes)
    np.save("train_multi_cell_ids.npy", cell_ids_bytes)

    for start in range(0, n_cells, batch):
        end = min(n_cells, start + batch)
        ipca.partial_fit(Y[start:end, :])
    np.save("Y_ipca_components_300.npy", ipca.components_.astype("float32", copy=False))  # (300, n_genes)
    np.save("Y_ipca_mean.npy", ipca.mean_.astype("float32", copy=False))                  # (n_genes,)
    print("Saved PCA params:",
          "Y_ipca_components_300.npy", ipca.components_.shape,
          "Y_ipca_mean.npy", ipca.mean_.shape)

Y_pca = np.memmap(
    "Y_train_pca_300.f32",
    mode="w+",
    dtype="float32",
    shape=(n_cells, 300),
)

with tables.open_file("train_multi_targets.h5", "r") as f:
    Y = f.get_node("/train_multi_targets/block0_values")
    for start in range(0, n_cells, batch):
        end = min(n_cells, start + batch)
        Y_pca[start:end, :] = ipca.transform(Y[start:end, :]).astype("float32", copy=False)

Y_pca.flush()
print("Saved:", "Y_train_pca_300.f32")


RNA gene expression shape: (np.int64(105942), np.int64(23418)) dtype: float32
Saved PCA params: Y_ipca_components_300.npy (300, 23418) Y_ipca_mean.npy (23418,)
Saved: Y_train_pca_300.f32


Preprocessing complete. Compressed features are saved to disk. Modeling continues in 02_Model_Training_and_Validation.ipynb.