In [20]:
import os
print(os.getpid())

9041


In [30]:
%whos

Variable                       Type                 Data/Info
-------------------------------------------------------------
GroupShuffleSplit              ABCMeta              <class 'sklearn.model_sel<...>split.GroupShuffleSplit'>
IncrementalPCA                 ABCMeta              <class 'sklearn.decomposi<...>ntal_pca.IncrementalPCA'>
Ridge                          ABCMeta              <class 'sklearn.linear_model._ridge.Ridge'>
TruncatedSVD                   type                 <class 'sklearn.decomposi<...>ncated_svd.TruncatedSVD'>
X_csr                          csr_matrix           <Compressed Sparse Row sp<...>228922)	2.790402412414551
X_csr_1000                     ndarray              105942x1000: 105942000 elems, type `float32`, 423768000 bytes (404.13665771484375 Mb)
X_dims                         list                 n=3
Xb_csr                         csr_matrix           <Compressed Sparse Row sp<...>228922)	2.790402412414551
Y                              memmap         

In [31]:
del Xb_csr

In [1]:
import os
base_dir = "/home/skovtun/Python_projects/Kaggle/Single cell"
data_dir = os.path.join(base_dir, "Single_cell_data")
os.chdir(data_dir)

In [2]:
import tables

with tables.open_file("train_multi_inputs.h5", "r") as f:
    in_axis1 = f.get_node("/train_multi_inputs/axis1")[:]

with tables.open_file("train_multi_targets.h5", "r") as f:
    y_axis1 = f.get_node("/train_multi_targets/axis1")[:]

print(in_axis1.shape, y_axis1.shape)
print("Same row ids and order:", (in_axis1 == y_axis1).all())


(105942,) (105942,)
Same row ids and order: True


In [3]:
import numpy as np
import tables
from scipy.sparse import csr_matrix, vstack, save_npz

batch = 1024  
blocks = []
with tables.open_file("train_multi_inputs.h5", "r") as f:
    peak_names_bytes = f.get_node("/train_multi_inputs/block0_items")[:]  
    cell_ids_bytes   = f.get_node("/train_multi_inputs/axis1")[:]  
    values = f.get_node("/train_multi_inputs/block0_values")
    n_cells, n_peaks = values.shape
    print("ATAC values shape:", (n_cells, n_peaks), "dtype:", values.dtype)
    np.save("train_multi_peak_names.npy", peak_names_bytes)
    np.save("train_multi_cell_ids.npy", cell_ids_bytes)

    for start in range(0, n_cells, batch):
        end = min(n_cells, start + batch)
        Xb = values[start:end, :]      
        Xb_csr = csr_matrix(Xb)        
        blocks.append(Xb_csr)
        if (start // batch) % 50 == 0:
            print(f"rows {start}:{end}  batch nnz={Xb_csr.nnz}")

X_csr = vstack(blocks, format="csr")
print("Final CSR shape:", X_csr.shape, "nnz:", X_csr.nnz, "dtype:", X_csr.dtype)

save_npz("train_multi_cell_ids.npy", X_csr)
print("Saved CSR:", "train_multi_cell_ids.npy")

# -------- quick sanity check: decode a few names (optional) --------
print("Example peak names:", peak_names_bytes[:3].astype("U"))
print("Example cell ids  :", cell_ids_bytes[:3].astype("U"))


ATAC values shape: (np.int64(105942), np.int64(228942)) dtype: float32
rows 0:1024  batch nnz=5147553
rows 51200:52224  batch nnz=7796845
rows 102400:103424  batch nnz=4986962
Final CSR shape: (105942, 228942) nnz: 607301546 dtype: float32
Saved CSR: train_multi_cell_ids.npy
Example peak names: ['GL000194.1:114519-115365' 'GL000194.1:55758-56597'
 'GL000194.1:58217-58957']
Example cell ids  : ['56390cf1b95e' 'fc0c60183c33' '9b4a87e22ad0']


In [6]:
#TruncatedSVD
# 6000 components explains ~70% of variance
import time
from sklearn.decomposition import TruncatedSVD
start_time = time.time()
svd_multi = TruncatedSVD(n_components=500, n_iter=7, random_state=42)
X_csr_500 = svd_multi.fit_transform(X_csr)
print(svd_multi.explained_variance_ratio_.sum())
print((start_time - time.time())/60)

0.021796776
-34.68766652743022


In [10]:
#TruncatedSVD
# 6000 components explains ~70% of variance
import time
from sklearn.decomposition import TruncatedSVD
start_time = time.time()
svd_multi = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
X_csr_1000 = svd_multi.fit_transform(X_csr)
print(svd_multi.explained_variance_ratio_.sum())
print((time.time() - start_time)/60)

0.035892006
58.249768761793774


In [11]:

ev = svd_multi.explained_variance_ratio_
print(ev[:500].sum(), ev[:800].sum(), ev[:1000].sum())


0.022117399 0.030495284 0.035892006


In [32]:
os.chdir(data_dir)
np.save("X_csr_1000.npy", X_csr_1000)

In [18]:
import numpy as np
import tables
from sklearn.decomposition import IncrementalPCA

batch = 1024

ipca = IncrementalPCA(n_components=300, batch_size=batch)

with tables.open_file("train_multi_targets.h5", "r") as f:
    gene_names_bytes = f.get_node("/train_multi_targets/block0_items")[:]
    cell_ids_bytes   = f.get_node("/train_multi_targets/axis1")[:]
    Y = f.get_node("/train_multi_targets/block0_values")
    n_cells, n_genes = Y.shape
    print("RNA gene expression shape:", (n_cells, n_genes), "dtype:", Y.dtype)

    np.save("train_multi_genes.npy", gene_names_bytes)
    np.save("train_multi_cell_ids.npy", cell_ids_bytes)

    for start in range(0, n_cells, batch):
        end = min(n_cells, start + batch)
        ipca.partial_fit(Y[start:end, :])
    np.save("Y_ipca_components_300.npy", ipca.components_.astype("float32", copy=False))  # (300, n_genes)
    np.save("Y_ipca_mean.npy", ipca.mean_.astype("float32", copy=False))                  # (n_genes,)
    print("Saved PCA params:",
          "Y_ipca_components_300.npy", ipca.components_.shape,
          "Y_ipca_mean.npy", ipca.mean_.shape)

Y_pca = np.memmap(
    "Y_train_pca_300.f32",
    mode="w+",
    dtype="float32",
    shape=(n_cells, 300),
)

with tables.open_file("train_multi_targets.h5", "r") as f:
    Y = f.get_node("/train_multi_targets/block0_values")
    for start in range(0, n_cells, batch):
        end = min(n_cells, start + batch)
        Y_pca[start:end, :] = ipca.transform(Y[start:end, :]).astype("float32", copy=False)

Y_pca.flush()
print("Saved:", "Y_train_pca_300.f32")


RNA gene expression shape: (np.int64(105942), np.int64(23418)) dtype: float32
Saved PCA params: Y_ipca_components_300.npy (300, 23418) Y_ipca_mean.npy (23418,)
Saved: Y_train_pca_300.f32


In [None]:
cell_ids = [x.decode("utf-8") for x in cell_ids_bytes[:20]]
print(cell_ids)

def kaggle_mean_cellwise_pearson(y_true, y_pred):
    """
    Mean per-cell Pearson correlation with the rule that constant prediction 
    should give -1 in correlation
    """
    yt = np.asarray(y_true)
    yp = np.asarray(y_pred)
    # constant prediction per row -> -1
    const_pred = np.all(yp == yp[:, [0]], axis=1)
    y_pred_centered = yp - yp.mean(axis=1, keepdims=True)
    y_true_centered = yt - yt.mean(axis=1, keepdims=True)
    num = np.sum(y_true_centered * y_pred_centered, axis=1)
    den = np.sqrt(np.sum(y_true_centered **2, axis=1) * np.sum(y_pred_centered**2, axis=1))
    corrs = np.empty(y_true.shape[0], dtype=float)
    corrs[const_pred] = -1.0
    # Pearson can be calculated
    valid = (~const_pred) & (den > 0)
    corrs[valid] = num[valid] / den[valid]
    # non-constant with zero denominator
    corrs[~const_pred & (den == 0)] = np.nan
    return np.nanmean(corrs)  
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupShuffleSplit
# --- FIRST CHANGE: load what you need to score in gene space ---
Y_pca_components = np.load("Y_ipca_components_300.npy")   # shape (300, n_genes)
Y_pca_mean       = np.load("Y_ipca_mean.npy")             # shape (n_genes,)

# Y_full must be the true RNA in gene space (n_cells, n_genes)
# If you don't already have it as an array/memmap, load it from the h5 (this will be slow if done repeatedly).
import tables
with tables.open_file("train_multi_targets.h5", "r") as f:
    Y_full = f.get_node("/train_multi_targets/block0_values")[:]   # shape (n_cells, n_genes)

X_dims = [500, 800, 1000]
Y_dims = [100, 200, 300]

groups = (np.array([hash(s.decode()) for s in cell_ids_bytes])
          & 0x7fffffffffffffff) % 50

gss = GroupShuffleSplit(n_splits=3, test_size=0.2, random_state=42)

for xd in X_dims:
    X = X_csr_1000[:, :xd]
    for yd in Y_dims:
        Y = Y_pca[:, :yd]

        scores = []
        for tr, va in gss.split(X, Y, groups):
            model = Ridge(alpha=1.0)
            model.fit(X[tr], Y[tr])
            pred = model.predict(X[va])                  # (n_val, yd)
            W = Y_pca_components[:yd, :]                 # (yd, n_genes)
            Y_pred_full = pred @ W + Y_pca_mean[None, :] # (n_val, n_genes)
            Y_true_full = Y_full[va]

            score = kaggle_mean_cellwise_pearson(Y_true_full, Y_pred_full)

            scores.append(score)

        print(
            f"X={xd:4d} Y={yd:3d}  "
            f"Pearson={np.mean(scores):.5f} Â± {np.std(scores):.5f}"
        )
