In [3]:
print(os.getpid())

121798


In [2]:
import os
import gc
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import tables
import re

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import make_scorer

# Limit threads for numerical libraries to manage CPU usage
os.environ["OPENBLAS_NUM_THREADS"] = "7"
os.environ["OMP_NUM_THREADS"] = "7"

base_dir = "/home/skovtun/Python_projects/Kaggle/Single_cell/"
data_dir = os.path.join(base_dir, "data")
random_state = 77

os.chdir(data_dir)

In [3]:
#Getting external file providing the mapping of human genes to their chromosome coordinates on 
#the GRCh38 genome to use for reducing number of columns for every target.
# RAW LINE:
# 1	havana	gene	11869	14409	.	+	.	gene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene";

gtf_path = "Homo_sapiens.GRCh38.98.gtf"

genes = []

with open(gtf_path) as f:
    for line in f:
        #skipping comments
        if line.startswith("#"):
            continue
        #splitting the line
        fields = line.strip().split("\t")
        if fields[2] != "gene":
            continue
        
        chrom = fields[0]
        start = int(fields[3])
        end = int(fields[4])
        
        attr = fields[8]
        match_id = re.search(r'gene_id "([^"]+)"',attr)
        match_name = re.search(r'gene_name "([^"]+)"',attr)
        if match_id:
            gene_id = match_id.group(1)
        else:
            continue
        if match_name:
            gene_name = match_name.group(1)
        else:
            gene_name = None
        
        genes.append([gene_id, gene_name, chrom, start, end])

gene_df = pd.DataFrame(genes, columns=["gene_id", "gene_name", "chr", "start", "end"])
mapping = {str(i): f"chr{i}" for i in range(1,23)}
mapping['X'] = 'chrX'
mapping['Y'] = 'chrY'
gene_df['chr'] = gene_df['chr'].map(mapping).fillna(gene_df['chr'])
#gene_df['chr'].unique()

In [79]:
gene_df.shape

(60623, 5)

In [4]:
#Getting a list of genes and all coordinates, parsing ATAC Peaks
path = "train_multi_inputs.h5"
cols = pd.read_hdf(path, key="train_multi_inputs", start=0, stop=1).columns
path = "train_multi_targets.h5"
cols_t = pd.read_hdf(path, key = 'train_multi_targets', start = 0, stop=1).columns

#From the column names of the multi creating a dataframe with location name, start and end.
arr = pd.Series(cols.values)
r_p = r'([^:]+):([\d]+)-([\d]+)'
chr_ranges = arr.str.extract(r_p)
chr_ranges.columns = ['chr','start','end']
chr_ranges['start'] = chr_ranges['start'].astype(int)
chr_ranges['end'] = chr_ranges['end'].astype(int)

#Reducing gene_df by choosing onnly chr present in chr_ranges and only gene_id's present as targets.
multi_chr = list(chr_ranges['chr'].unique())
gene_df_multi = gene_df[gene_df['chr'].isin(multi_chr)]
gene_df_multi = gene_df_multi.set_index('gene_id')
missing = cols_t.difference(gene_df_multi.index)
gene_df_multi = gene_df_multi.loc[gene_df_multi.index.intersection(cols_t)]
gene_df_multi.shape

(23404, 4)

In [5]:
#calculating maximum amount of features for every target.
window = 2000

genes = gene_df_multi.loc[gene_df_multi.index.intersection(cols_t)].copy()
genes["left"]  = genes["start"] - window
genes["right"] = genes["end"]   + window
genes["gene_id"] = genes.index  # preserve gene_id as a column

chr_ranges = chr_ranges.rename(columns={
    "start": "start_peak",
    "end":   "end_peak"
})

target_cols = pd.DataFrame()
results = []
gene_id = []
start_peak = []
end_peak = []

for chr_name, chr_peaks in chr_ranges.groupby("chr"):
    sub_genes = genes[genes["chr"] == chr_name]
    if sub_genes.empty or chr_peaks.empty:
        continue

    merged = sub_genes[['gene_id', 'chr', 'left', 'right']].merge(chr_peaks[['chr', 'start_peak', 'end_peak']], on="chr")

    mask = (
        (merged["start_peak"] >= merged["left"]) &
        (merged["end_peak"]   <= merged["right"])
    )
    m = merged[mask]
    results.append(m)
        
target_cols = pd.concat(results)
target_cols['col'] = target_cols['chr'].astype(str)+":"+target_cols['start_peak'].astype(str)+"-"+target_cols['end_peak'].astype(str)

In [6]:
print("Loading pre-computed features from Notebook 1...")

# 1. Load the SVD-compressed Chromatin Features (Input)
# Shape: (105942, 1000)
X_csr_1000 = np.load("X_csr_1000.npy")

# 2. Load the PCA-compressed RNA Targets (Output)
# Shape: (105942, 300)
n_cells = X_csr_1000.shape[0]
Y_pca = np.memmap("Y_train_pca_300.f32", dtype="float32", mode="r", shape=(n_cells, 300))

# 3. Load PCA Components (Decoder)
# Shape: (300, 23418)
Y_components = np.load("Y_ipca_components_300.npy")
Y_mean = np.load("Y_ipca_mean.npy")

print(f"‚úÖ Data Loaded Successfully.")
print(f"X (Features): {X_csr_1000.shape} - 1000 Dimensions")
print(f"Y (Targets):  {Y_pca.shape} - 300 Dimensions")

Loading pre-computed features from Notebook 1...
‚úÖ Data Loaded Successfully.
X (Features): (105942, 1000) - 1000 Dimensions
Y (Targets):  (105942, 300) - 300 Dimensions


In [7]:
gene_importance = np.sum(np.abs(Y_components), axis = 0)
ranked_genes = pd.Series(gene_importance, index = cols_t).sort_values(ascending= False)
genes_1000 = ranked_genes[:1000]
target_cols_1000 = target_cols[target_cols['gene_id'].isin(genes_1000.index)]
peaks_1000 = target_cols_1000['col'].unique()
with tables.open_file("train_multi_inputs.h5", "r") as f:
    peaks = f.get_node("/train_multi_inputs/axis0")[:]
peaks_d = np.char.decode(peaks, encoding = 'utf-8')
peak_to_id = {str(peak): i for i,peak in enumerate(peaks_d)}
peaks_1000_idx = sorted(set([peak_to_id[peak] for peak in peaks_1000]))

In [8]:
import scipy
X_crs = scipy.sparse.load_npz('train_multi_cell.npz')
Xgene_1000 = X_crs[:,peaks_1000_idx]
print(f"Original shape: {X_crs.shape}")
print(f"Sliced shape: {Xgene_1000.shape}")

Original shape: (105942, 228942)
Sliced shape: (105942, 15581)


In [9]:
from scipy.sparse import hstack, csr_matrix
X_pca_sparse = csr_matrix(X_csr_1000)
X = hstack([X_pca_sparse,Xgene_1000])
X.shape

(105942, 16581)

In [10]:
# --- Data Splitting and Feature Separation ---
# Random Train-Test Split (80/20)
with tables.open_file("train_multi_inputs.h5", "r") as f:
    values = f.get_node("/train_multi_inputs/axis1")[:]
cell_names = np.char.decode(values, encoding = 'utf-8')
cell_to_id = {str(name): i for i,name in enumerate(cell_names)}


random_state=77
rng = np.random.default_rng(random_state)
shuffled_names = rng.permutation(cell_names)

split_point = int(len(shuffled_names) * 0.8)
train_names = shuffled_names[:split_point]
test_names = shuffled_names[split_point:]

train_id = sorted([cell_to_id[name] for name in train_names])
test_id = sorted([cell_to_id[name] for name in test_names])


In [100]:
import numpy as np

# 1. Length Check: Do we have the same number of rows?
print(f"H5 Names Count: {len(cell_names)}")
print(f"Meta Rows Count: {len(multi_metadata)}")
assert len(cell_names) == len(multi_metadata), "‚ùå Mismatch in length!"

# 2. Order Check: Is every single name in the exact same spot?
# We compare the list of names from H5 vs the Index of the Dataframe
are_aligned = np.array_equal(multi_metadata.index.values, cell_names)

if are_aligned:
    print("‚úÖ VERIFIED: The metadata is perfectly aligned with the H5 file.")
else:
    print("‚ùå WARNING: The order is different!")
    # Show the first mismatch if it failed
    for i, (name_h5, name_meta) in enumerate(zip(cell_names, multi_metadata.index)):
        if name_h5 != name_meta:
            print(f"First mismatch at row {i}:")
            print(f"  H5 says:   {name_h5}")
            print(f"  Meta says: {name_meta}")
            break

H5 Names Count: 105942
Meta Rows Count: 105942
‚úÖ VERIFIED: The metadata is perfectly aligned with the H5 file.


In [11]:
inputs_train, inputs_test = X[train_id,:], X[test_id,:]
y_subset = Y_pca[:,:100]
y_train, y_test = y_subset[train_id,:], y_subset[test_id,:]
print(inputs_train.shape, inputs_test.shape, y_train.shape, y_test.shape)

(84753, 16581) (21189, 16581) (84753, 100) (21189, 100)


In [14]:
# loading metatdata
metadata_old = pd.read_csv('metadata.csv', index_col = 'cell_id')
fix = pd.read_csv('metadata_cite_day_2_donor_27678.csv', index_col = 'cell_id')
metadata = pd.concat([metadata_old, fix], axis = 0)
del metadata_old, fix
multi_metadata  = metadata.loc[cell_names,['day','donor']]
del metadata


In [16]:
meta_train, meta_test = multi_metadata.iloc[train_id], multi_metadata.iloc[test_id]
day_col_train = meta_train['day'].values.reshape(-1,1)
day_sparse_train = csr_matrix(day_col_train)
x_train = hstack([inputs_train,day_sparse_train])

day_col_test = meta_test['day'].values.reshape(-1,1)
day_sparse_test = csr_matrix(day_col_test)
x_test = hstack([inputs_test, day_sparse_test])


In [17]:
x_train.shape

(84753, 16582)

In [25]:
# 1. Choosing a donor to_hold_out
donors = meta_train['donor'].unique()
rng = np.random.default_rng(seed=77) 
val_donor = rng.choice(donors)

print(f" Randomly Selected Holdout Donor: {val_donor}")

# 2. Create Indices based on that random choice
tr_idx = np.where(meta_train['donor']!= val_donor)[0]
tr_mask = (meta_train['donor'] != val_donor).values

va_idx = np.where(meta_train['donor'] == val_donor)[0]
va_mask = (meta_train['donor'] == val_donor).values

 Randomly Selected Holdout Donor: 32606


In [105]:

tr_mask[:5]

array([False, False, False, False, False])

In [26]:
import xgboost as xgb
import numpy as np
import gc 

# --- 1. Define Candidates B and C ---
candidate_params_remaining = [
    # Candidate B: "The Deep Miner" 
    # (Depth 10, Slow but powerful)
    dict(
        max_depth=10, 
        learning_rate=0.03,       
        min_child_weight=50,      
        colsample_bytree=0.4, 
        subsample=0.7, 
        reg_alpha=0.5, 
        reg_lambda=5.0            
    ),

    # Candidate C: "The Feature Scanner" 
    # (Depth 7, Fast and diverse)
    dict(
        max_depth=7, 
        learning_rate=0.05,
        min_child_weight=30, 
        colsample_bytree=0.15,    
        subsample=0.9, 
        reg_alpha=0.1, 
        reg_lambda=2.0
    ),
]

print(f"üöÄ Tuning Candidates B & C (PC1 Only)...")
print(f"   (Comparing against expected baseline ~23.0)")
# Slice the Sparse Matrix (x_train) and Dense Targets (Y_dev)
# CRITICAL: We only use the Top 5 PCA components for tuning speed
X_tr = x_train[tr_mask]
X_va = x_train[va_mask]

Y_tr = y_train[tr_mask][:, :1]  # Top 1 components only
Y_va = y_train[va_mask][:, :1]

print(f"Train size: {X_tr.shape[0]} | Validation size: {X_va.shape[0]}")
scores = []

# Start enumeration at 2 (Candidate A was 1)
for k, cand in enumerate(candidate_params_remaining, 2):
    print(f"\n--- Testing Candidate {k} ---")
    
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "tree_method": "hist",
        "verbosity": 0,
        "n_jobs": -1,
        **cand
    }

    # NO LOOPS. Just Index 0 (PC1).
    j = 0 
    
    # 1. Create DMatrices (PC1 Only)
    dtrain = xgb.DMatrix(X_tr, label=Y_tr[:, j])
    dvalid = xgb.DMatrix(X_va, label=Y_va[:, j])
    
    # 2. Train
    booster = xgb.train(
        params, 
        dtrain, 
        num_boost_round=1000,
        evals=[(dvalid, "val")],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    # 3. Capture Score
    best_rmse = booster.best_score
    scores.append({"cand": cand, "score": best_rmse})
    
    print(f"   >> PC1 RMSE: {best_rmse:.5f}")

    # 4. Clean Memory immediately
    del dtrain, dvalid, booster
    gc.collect()

# --- Final Decision ---
print("\n--- üèÅ Tuning Complete ---")
# Compare B and C
winner = min(scores, key=lambda x: x["score"])

print(f"Winner of Round 2:")
print(f"   RMSE: {winner['score']:.5f}")
print(f"   Params: {winner['cand']}")

üöÄ Tuning Candidates B & C (PC1 Only)...
   (Comparing against expected baseline ~23.0)
Train size: 57824 | Validation size: 26929

--- Testing Candidate 2 ---
   >> PC1 RMSE: 23.71737

--- Testing Candidate 3 ---
   >> PC1 RMSE: 24.18898

--- üèÅ Tuning Complete ---
Winner of Round 2:
   RMSE: 23.71737
   Params: {'max_depth': 10, 'learning_rate': 0.03, 'min_child_weight': 50, 'colsample_bytree': 0.4, 'subsample': 0.7, 'reg_alpha': 0.5, 'reg_lambda': 5.0}


In [None]:
 Randomly Selected Holdout Donor: 32606
Hyperparameter Tuning on Holdout Donor: 32606
Train size: 57824 | Validation size: 26929

Starting tuning on Top 5 PCA components...
[1/3] Avg RMSE: 11.31940 | Params: {'max_depth': 6, 'learning_rate': 0.05, 'min_child_weight': 20, 'colsample_bytree': 0.3, 'subsample': 0.8, 'reg_alpha': 1.0, 'reg_lambda': 1.0}

Randomly Selected Holdout Donor: 32606
Hyperparameter Tuning on Holdout Donor: 32606
Train size: 57824 | Validation size: 26929

Starting tuning on Top 5 PCA components...
[1/3] Avg RMSE: 23.85963 | Params: {'max_depth': 6, 'learning_rate': 0.05, 'min_child_weight': 20, 'colsample_bytree': 0.3, 'subsample': 0.8, 'reg_alpha': 1.0, 'reg_lambda': 1.0}


In [28]:
del dtrain, dvalid, X_tr, X_va
gc.collect()

4103

In [29]:
import numpy as np
import xgboost as xgb
import gc
import time

# -----------------------------
# Efficient A vs B comparison
# - Reuse QuantileDMatrix/DMatrix (build once)
# - Reuse feature binning (hist)
# - Update labels in-place per PC (no DMatrix rebuild)
# - Evaluate on a small PC panel and aggregate RMSE + nRMSE
# -----------------------------

# ====== Choose the PC panel (edit if you want) ======
PC_PANEL = [1, 4, 9, 29]  # PCs: 1,2,5,10,30,60,90 (0-based)
# If you only have 100 PCs, index 89 is fine. If fewer, filter automatically below.

# ====== Two candidates to compare ======
cand_A = dict(
    max_depth=6,
    learning_rate=0.05,
    min_child_weight=20,
    colsample_bytree=0.3,
    subsample=0.8,
    reg_alpha=1.0,
    reg_lambda=1.0
)

cand_B_prime = dict(
    max_depth=8,        # ‚Üì from 10 (huge win)
    learning_rate=0.04, # ‚Üë slightly
    min_child_weight=50,
    colsample_bytree=0.35,
    subsample=0.7,
    reg_alpha=0.5,
    reg_lambda=5.0
)

CANDS = [("A", cand_A), ("B", cand_B)]

# ====== Base params tuned for laptop efficiency ======
BASE_PARAMS = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    num_boost_round=1200,
    tree_method="hist",
    max_bin=256,      # speed/CPU-friendly
    n_jobs=8,         # avoid -1 on laptop (thermal/OS contention)
    verbosity=0,
    seed=77,
)

# ====== Slice once (avoid repeated CSR fancy indexing cost) ======
X_tr = x_train[tr_mask]
X_va = x_train[va_mask]

Y_tr_full = y_train[tr_mask]
Y_va_full = y_train[va_mask]

n_targets = Y_tr_full.shape[1]
pc_panel = [j for j in PC_PANEL if j < n_targets]
if len(pc_panel) == 0:
    raise ValueError(f"PC_PANEL indices {PC_PANEL} are out of range for n_targets={n_targets}")

# Make sure float32 (faster + smaller)
Y_tr_full = np.asarray(Y_tr_full, dtype=np.float32, order="C")
Y_va_full = np.asarray(Y_va_full, dtype=np.float32, order="C")

print(f"Train: {X_tr.shape} | Val: {X_va.shape} | PCs in panel: {len(pc_panel)} -> {pc_panel}")

# ====== Build matrix objects ONCE ======
# QuantileDMatrix usually faster + more memory efficient for hist.
# If it fails (older xgboost), fall back to DMatrix.
use_qdm = True
try:
    dtrain = xgb.QuantileDMatrix(X_tr, label=Y_tr_full[:, pc_panel[0]], max_bin=BASE_PARAMS["max_bin"])
    dvalid = xgb.QuantileDMatrix(X_va, label=Y_va_full[:, pc_panel[0]], ref=dtrain, max_bin=BASE_PARAMS["max_bin"])
    print("Using QuantileDMatrix")
except Exception:
    use_qdm = False
    dtrain = xgb.DMatrix(X_tr, label=Y_tr_full[:, pc_panel[0]])
    dvalid = xgb.DMatrix(X_va, label=Y_va_full[:, pc_panel[0]])
    print("Using DMatrix")

# ====== Helper: run one candidate across PC panel with in-place label swap ======
def eval_candidate(name, cand_params, num_boost_round=2000, early_stop=50):
    params = {**BASE_PARAMS, **cand_params}

    per_pc = []
    t0 = time.perf_counter()

    for j in pc_panel:
        ytr = Y_tr_full[:, j]
        yva = Y_va_full[:, j]

        # Update labels in-place (fast; avoids rebuilding dtrain/dvalid)
        dtrain.set_float_info("label", ytr)
        dvalid.set_float_info("label", yva)

        booster = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dvalid, "val")],
            early_stopping_rounds=early_stop,
            verbose_eval=False
        )

        rmse = float(booster.best_score)
        best_iter = int(booster.best_iteration)

        # Normalized RMSE (robust if PCs not whitened)
        ystd = float(np.std(yva))
        nrmse = rmse / ystd if ystd > 0 else np.nan

        per_pc.append((j, rmse, nrmse, best_iter))

        del booster
        gc.collect()

    t1 = time.perf_counter()

    rmse_mean = float(np.mean([x[1] for x in per_pc]))
    nrmse_mean = float(np.nanmean([x[2] for x in per_pc]))
    it_mean = float(np.mean([x[3] for x in per_pc]))

    return {
        "name": name,
        "time_s": t1 - t0,
        "rmse_mean": rmse_mean,
        "nrmse_mean": nrmse_mean,
        "best_iter_mean": it_mean,
        "per_pc": per_pc
    }

# ====== Run A vs B ======
results = []
for nm, cand in CANDS:
    print(f"\n=== Evaluating Candidate {nm} on PCs {pc_panel} ===")
    res = eval_candidate(nm, cand)
    results.append(res)
    print(f"Candidate {nm}: rmse_mean={res['rmse_mean']:.5f} | nrmse_mean={res['nrmse_mean']:.5f} "
          f"| best_iter_mean={res['best_iter_mean']:.1f} | time={res['time_s']:.1f}s")

# ====== Decide winner (prefer nRMSE for cross-PC comparability) ======
winner_by_nrmse = min(results, key=lambda r: r["nrmse_mean"])
winner_by_rmse  = min(results, key=lambda r: r["rmse_mean"])

print("\n--- Summary (per-PC) ---")
for r in results:
    print(f"\nCandidate {r['name']} details:")
    for (j, rmse, nrmse, it) in r["per_pc"]:
        print(f"  PC{j+1:>3}: RMSE={rmse:>8.4f} | nRMSE={nrmse:>8.4f} | best_iter={it}")

print("\n--- Winners ---")
print(f"Winner by mean nRMSE (recommended): Candidate {winner_by_nrmse['name']}")
print(f"Winner by mean RMSE:              Candidate {winner_by_rmse['name']}")

# Cleanup big objects
del dtrain, dvalid, X_tr, X_va
gc.collect()


Train: (57824, 16582) | Val: (26929, 16582) | PCs in panel: 4 -> [1, 4, 9, 29]
Using QuantileDMatrix

=== Evaluating Candidate A on PCs [1, 4, 9, 29] ===


KeyboardInterrupt: 

In [None]:
# ============================================================
# Fast A-vs-B tuning for Multiome PCA targets on a laptop
# - Converts CSR to float32 (+ int32 indices)
# - Row-subsamples train/val for tuning
# - Uses DMatrix (reliable label swapping)
# - Replaces "depth=10" with lossguide + max_leaves (much faster)
# - Selects by Pearson over CELLS (per PC), not RMSE
# - Includes a timing "pilot" block to estimate full runtime
# ============================================================

import time
import gc
import numpy as np
import xgboost as xgb
import scipy.sparse as sp

# -----------------------------
# USER SETTINGS
# -----------------------------
PC_PANEL = [1, 4, 9, 29]          # 0-based PCs -> [PC2, PC5, PC10, PC30]
TRAIN_SUB = 20000                 # tuning subsample sizes
VAL_SUB   = 12000

NUM_BOOST_ROUND = 800             # cap work
EARLY_STOP = 30
VERBOSE_EVAL = False              # set to 50 if you want to see progress

N_JOBS = 8
MAX_BIN = 128                     # 128 for tuning speed; use 256 later if needed

SEED = 77

# -----------------------------
# CANDIDATES (A + B_fast)
# -----------------------------
cand_A = dict(
    max_depth=6,
    learning_rate=0.05,
    min_child_weight=20,
    colsample_bytree=0.30,
    subsample=0.80,
    reg_alpha=1.0,
    reg_lambda=1.0,
)

# Replace depth-10 "B" with a compute-controlled variant.
# lossguide grows best-first with a leaf budget; typically much faster.
cand_B_fast = dict(
    grow_policy="lossguide",
    max_leaves=256,          # try 128 if still slow, or 512 if you want more capacity
    max_depth=0,             # ignored for lossguide; keep 0
    learning_rate=0.05,      # slightly higher than 0.03 for fewer rounds
    min_child_weight=50,
    colsample_bytree=0.35,
    subsample=0.70,
    reg_alpha=0.5,
    reg_lambda=5.0,
)

CANDS = [("A", cand_A), ("B_fast", cand_B_fast)]

BASE_PARAMS = dict(
    objective="reg:squarederror",
    eval_metric="rmse",      # used for early stopping only
    tree_method="hist",
    max_bin=MAX_BIN,
    n_jobs=N_JOBS,
    verbosity=0,
    seed=SEED,
)

# -----------------------------
# SAFETY CHECKS / PREP
# -----------------------------
assert sp.isspmatrix_csr(x_train), "Expected x_train to be CSR matrix."
assert y_train.ndim == 2, "Expected y_train shape (n_samples, n_pcs)."

# Filter PC panel to available targets
n_targets = y_train.shape[1]
pc_panel = [j for j in PC_PANEL if 0 <= j < n_targets]
if not pc_panel:
    raise ValueError(f"PC_PANEL {PC_PANEL} out of range. n_targets={n_targets}")

print(f"PC panel (0-based): {pc_panel} (n={len(pc_panel)})")
print(f"Train/Val full shapes: {x_train[tr_mask].shape} / {x_train[va_mask].shape}")

# Convert to float32 CSR + int32 indices (big speed win)
# copy=False keeps memory low if already in desired dtype
x_train32 = x_train.astype(np.float32, copy=False)
x_train32.indices = x_train32.indices.astype(np.int32, copy=False)
x_train32.indptr  = x_train32.indptr.astype(np.int32, copy=False)

# Ensure y is float32 contiguous for fast slicing
y_train32 = np.asarray(y_train, dtype=np.float32, order="C")

# -----------------------------
# Subsample rows for tuning (HUGE speed win)
# -----------------------------
rng = np.random.default_rng(SEED)
tr_idx = np.where(tr_mask)[0]
va_idx = np.where(va_mask)[0]

tr_sub = rng.choice(tr_idx, size=min(TRAIN_SUB, tr_idx.size), replace=False)
va_sub = rng.choice(va_idx, size=min(VAL_SUB,   va_idx.size), replace=False)

X_tr = x_train32[tr_sub]
X_va = x_train32[va_sub]
Y_tr_full = y_train32[tr_sub]
Y_va_full = y_train32[va_sub]

print(f"Subsampled shapes: Train {X_tr.shape} | Val {X_va.shape}")
print(f"X_tr nnz={X_tr.nnz:,} | density={X_tr.nnz/(X_tr.shape[0]*X_tr.shape[1]):.4f}")

# -----------------------------
# Utilities
# -----------------------------
def pearson_corr(a: np.ndarray, b: np.ndarray) -> float:
    """Pearson correlation between two 1D arrays. Returns nan if degenerate."""
    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    a = a - a.mean()
    b = b - b.mean()
    denom = float(np.sqrt((a*a).sum()) * np.sqrt((b*b).sum()))
    if denom == 0.0:
        return np.nan
    return float((a*b).sum() / denom)

def build_dmatrices(X_tr, X_va, ytr0, yva0):
    """Build once; later we'll swap labels via set_float_info."""
    dtrain = xgb.DMatrix(X_tr, label=ytr0)
    dvalid = xgb.DMatrix(X_va, label=yva0)
    return dtrain, dvalid

def train_one_pc(dtrain, dvalid, params, ytr, yva,
                 num_boost_round=NUM_BOOST_ROUND,
                 early_stop=EARLY_STOP,
                 verbose_eval=VERBOSE_EVAL):
    """Train for one PC; returns rmse, pearson, best_iter, elapsed_s."""
    dtrain.set_float_info("label", ytr)
    dvalid.set_float_info("label", yva)

    t0 = time.perf_counter()
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dvalid, "val")],
        early_stopping_rounds=early_stop,
        verbose_eval=verbose_eval,
    )
    elapsed = time.perf_counter() - t0

    # Predict and compute Pearson over CELLS (stable!)
    pred = booster.predict(dvalid, iteration_range=(0, booster.best_iteration + 1))
    rmse = float(booster.best_score)
    pr = pearson_corr(yva, pred)
    best_iter = int(booster.best_iteration)

    del booster, pred
    gc.collect()
    return rmse, pr, best_iter, elapsed

# ============================================================
# PILOT TIMING BLOCK (reliable estimate before full run)
# ============================================================
print("\n=== PILOT TIMING (estimate runtime before full tuning) ===")

# Use a single mid PC (PC10 if available, else first in panel)
pilot_pc = 9 if 9 in pc_panel else pc_panel[0]
ytr0 = Y_tr_full[:, pilot_pc]
yva0 = Y_va_full[:, pilot_pc]

dtrain, dvalid = build_dmatrices(X_tr, X_va, ytr0, yva0)

pilot_times = {}
for name, cand in CANDS:
    params = {**BASE_PARAMS, **cand}
    rmse, pr, best_iter, sec = train_one_pc(dtrain, dvalid, params, ytr0, yva0)
    pilot_times[name] = sec
    print(f"Pilot {name} on PC{pilot_pc+1}: time={sec:.1f}s | best_iter={best_iter} | rmse={rmse:.4f} | pearson={pr:.4f}")

# Extrapolate to full panel (rough but usually within ~20‚Äì30%)
panel_factor = len(pc_panel)
est_total_s = sum(pilot_times.values()) * panel_factor
print(f"\nEstimated total time for A vs B_fast on {len(pc_panel)} PCs ~ {est_total_s/60:.1f} minutes")
print("Rule: if this estimate is too high, reduce TRAIN_SUB/VAL_SUB, max_leaves, or NUM_BOOST_ROUND.\n")

# ============================================================
# FULL RUN (A vs B_fast) ON PC PANEL
# ============================================================
print("=== FULL EVALUATION (A vs B_fast) ===")

# Initialize DMatrices once with any PC labels; we'll swap inside loop
init_pc = pc_panel[0]
dtrain.set_float_info("label", Y_tr_full[:, init_pc])
dvalid.set_float_info("label", Y_va_full[:, init_pc])

results = []
for name, cand in CANDS:
    params = {**BASE_PARAMS, **cand}

    per_pc = []
    t_start = time.perf_counter()
    print(f"\n=== Evaluating Candidate {name} on PCs {[j for j in pc_panel]} ===")

    for j in pc_panel:
        ytr = Y_tr_full[:, j]
        yva = Y_va_full[:, j]
        rmse, pr, best_iter, sec = train_one_pc(dtrain, dvalid, params, ytr, yva)
        per_pc.append((j, rmse, pr, best_iter, sec))
        print(f"  PC{j+1:>3}: time={sec:>6.1f}s | best_iter={best_iter:>4} | rmse={rmse:>7.3f} | pearson={pr:>7.4f}")

    total_s = time.perf_counter() - t_start
    pearson_mean = float(np.nanmean([x[2] for x in per_pc]))
    rmse_mean = float(np.mean([x[1] for x in per_pc]))

    results.append(dict(
        name=name,
        pearson_mean=pearson_mean,
        rmse_mean=rmse_mean,
        total_s=total_s,
        per_pc=per_pc,
        params=cand,
    ))

    print(f"\nCandidate {name} summary: mean_pearson={pearson_mean:.5f} | mean_rmse={rmse_mean:.5f} | total={total_s/60:.1f} min")

# Winner by mean Pearson (recommended)
winner = max(results, key=lambda r: r["pearson_mean"])
print("\n=== WINNER (by mean Pearson over cells, averaged across PCs) ===")
print(f"Winner: {winner['name']}")
print(f"mean_pearson={winner['pearson_mean']:.5f} | mean_rmse={winner['rmse_mean']:.5f} | time={winner['total_s']/60:.1f} min")
print("Params:", winner["params"])

# Cleanup
del dtrain, dvalid
gc.collect()



In [35]:
import numpy as np
import xgboost as xgb
import scipy.sparse as sp

# ---- choose which PC to diagnose (0-based) ----
j = 1   # PC2

# ---- ensure float32 CSR + int32 indices (fast) ----
x_train32 = x_train.astype(np.float32, copy=False)
x_train32.indices = x_train32.indices.astype(np.int32, copy=False)
x_train32.indptr  = x_train32.indptr.astype(np.int32, copy=False)

y_train32 = np.asarray(y_train, dtype=np.float32, order="C")

# ---- slice your existing val split ----
X_tr = x_train32[tr_mask]
X_va = x_train32[va_mask]
y_tr = y_train32[tr_mask, j]
y_va = y_train32[va_mask, j]

# ---- Candidate A params (as before) ----
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    max_bin=128,
    n_jobs=8,
    verbosity=0,
    seed=77,

    max_depth=6,
    learning_rate=0.05,
    min_child_weight=20,
    colsample_bytree=0.30,
    subsample=0.80,
    reg_alpha=1.0,
    reg_lambda=1.0,
)

dtrain = xgb.DMatrix(X_tr, label=y_tr)
dvalid = xgb.DMatrix(X_va, label=y_va)

booster = xgb.train(
    params,
    dtrain,
    num_boost_round=800,
    evals=[(dvalid, "val")],
    early_stopping_rounds=30,
    verbose_eval=False
)

pred = booster.predict(dvalid, iteration_range=(0, booster.best_iteration + 1)).astype(np.float32, copy=False)

# ---- calibration functions (drop-in) ----
def pearson(a, b):
    a = np.asarray(a, dtype=np.float32); b = np.asarray(b, dtype=np.float32)
    a = a - a.mean(); b = b - b.mean()
    denom = np.sqrt((a*a).sum()) * np.sqrt((b*b).sum())
    return np.nan if denom == 0 else float((a*b).sum() / denom)

def rmse(y, p):
    y = np.asarray(y, dtype=np.float32); p = np.asarray(p, dtype=np.float32)
    return float(np.sqrt(np.mean((y - p) ** 2)))

def fit_affine(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=np.float32)
    y_pred = np.asarray(y_pred, dtype=np.float32)
    xm = float(y_pred.mean()); ym = float(y_true.mean())
    xvar = float(((y_pred - xm) ** 2).mean())
    if xvar == 0.0:
        a, b = 0.0, ym
    else:
        cov = float(((y_pred - xm) * (y_true - ym)).mean())
        a = cov / xvar
        b = ym - a * xm
    return a, b, a * y_pred + b

def calibration_report(y_true, y_pred, name="PC"):
    p0 = pearson(y_true, y_pred); r0 = rmse(y_true, y_pred)
    std_ratio = float(np.std(y_pred) / np.std(y_true)) if np.std(y_true) > 0 else np.nan
    bias = float(y_pred.mean() - y_true.mean())

    a, b, y_hat = fit_affine(y_true, y_pred)
    p1 = pearson(y_true, y_hat); r1 = rmse(y_true, y_hat)

    print(f"\n{name} calibration")
    print(f"  Pearson:  {p0:.5f}  ->  {p1:.5f}")
    print(f"  RMSE:     {r0:.5f}  ->  {r1:.5f}")
    print(f"  std_ratio(pred/true): {std_ratio:.3f}")
    print(f"  bias(pred-true mean): {bias:.5f}")
    print(f"  affine: a={a:.5f}, b={b:.5f}")

calibration_report(y_va, pred, name=f"PC{j+1}")



PC2 calibration
  Pearson:  0.88763  ->  0.88763
  RMSE:     13.80343  ->  12.13494
  std_ratio(pred/true): 0.928
  bias(pred-true mean): 6.49391
  affine: a=0.95695, b=-6.32698


In [36]:
# ============================================================
# Feature Ablation: PCA-only vs Peaks-only vs Full (same model)
# Efficient + laptop-friendly:
# - float32 CSR (+ int32 indices)
# - row subsample
# - small PC panel
# - Candidate A fixed
# - evaluates mean Pearson over CELLS per PC
# ============================================================

import time, gc
import numpy as np
import xgboost as xgb
import scipy.sparse as sp

# -----------------------------
# CONFIG (edit if needed)
# -----------------------------
SEED = 77
N_JOBS = 8
MAX_BIN = 128
NUM_BOOST_ROUND = 800
EARLY_STOP = 30

TRAIN_SUB = 20000
VAL_SUB   = 12000

# PCs to evaluate (0-based). Keep small and spread out.
PC_PANEL = [1, 4, 9, 29]   # PCs 2,5,10,30

# ---- Column layout in your x_train ----
# Based on your description: 1000 PCA + 15000 peaks + day (1 col)
N_PCA   = 1000
N_PEAKS = 15000
# If you added more than one "day" feature, set N_META accordingly.
N_META  = None  # auto: total_cols - (N_PCA + N_PEAKS)

# Candidate A (locked)
cand_A = dict(
    max_depth=6,
    learning_rate=0.05,
    min_child_weight=20,
    colsample_bytree=0.30,
    subsample=0.80,
    reg_alpha=1.0,
    reg_lambda=1.0,
)

BASE_PARAMS = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    max_bin=MAX_BIN,
    n_jobs=N_JOBS,
    verbosity=0,
    seed=SEED,
)

# -----------------------------
# Helpers
# -----------------------------
def pearson_corr(a: np.ndarray, b: np.ndarray) -> float:
    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    a = a - a.mean()
    b = b - b.mean()
    denom = float(np.sqrt((a*a).sum()) * np.sqrt((b*b).sum()))
    if denom == 0.0:
        return np.nan
    return float((a*b).sum() / denom)

def eval_one_ablation(X_tr, X_va, Y_tr, Y_va, pc_panel, params,
                      num_boost_round=NUM_BOOST_ROUND, early_stop=EARLY_STOP):
    """
    For fixed X_tr/X_va and multiple PCs:
      - build DMatrix once
      - swap label via set_float_info
      - train each PC
      - return per-PC pearson + mean pearson
    """
    pc0 = pc_panel[0]
    dtrain = xgb.DMatrix(X_tr, label=Y_tr[:, pc0])
    dvalid = xgb.DMatrix(X_va, label=Y_va[:, pc0])

    per_pc = []
    t0 = time.perf_counter()

    for j in pc_panel:
        ytr = Y_tr[:, j]
        yva = Y_va[:, j]
        dtrain.set_float_info("label", ytr)
        dvalid.set_float_info("label", yva)

        booster = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dvalid, "val")],
            early_stopping_rounds=early_stop,
            verbose_eval=False
        )

        pred = booster.predict(dvalid, iteration_range=(0, booster.best_iteration + 1))
        pr = pearson_corr(yva, pred)
        per_pc.append((j, pr, int(booster.best_iteration), float(booster.best_score)))

        del booster, pred
        gc.collect()

    total_s = time.perf_counter() - t0
    mean_pr = float(np.nanmean([x[1] for x in per_pc]))

    del dtrain, dvalid
    gc.collect()

    return mean_pr, per_pc, total_s

# -----------------------------
# PREP DATA (float32 CSR + subsample)
# -----------------------------
assert sp.isspmatrix_csr(x_train), "Expected x_train to be CSR"
x_train32 = x_train.astype(np.float32, copy=False)
x_train32.indices = x_train32.indices.astype(np.int32, copy=False)
x_train32.indptr  = x_train32.indptr.astype(np.int32, copy=False)

y_train32 = np.asarray(y_train, dtype=np.float32, order="C")
n_targets = y_train32.shape[1]

pc_panel = [j for j in PC_PANEL if 0 <= j < n_targets]
if not pc_panel:
    raise ValueError(f"PC_PANEL {PC_PANEL} out of range for n_targets={n_targets}")

n_cols = x_train32.shape[1]
if N_META is None:
    N_META = n_cols - (N_PCA + N_PEAKS)
if N_META < 0:
    raise ValueError(
        f"Column split invalid: total_cols={n_cols}, N_PCA={N_PCA}, N_PEAKS={N_PEAKS} "
        f"implies N_META={N_META} < 0. Fix N_PCA/N_PEAKS."
    )

pca_cols   = slice(0, N_PCA)
peaks_cols = slice(N_PCA, N_PCA + N_PEAKS)
meta_cols  = slice(N_PCA + N_PEAKS, N_PCA + N_PEAKS + N_META)

print(f"Total cols: {n_cols}")
print(f"PCA cols:   [0, {N_PCA})")
print(f"Peaks cols: [{N_PCA}, {N_PCA+N_PEAKS})")
print(f"Meta cols:  [{N_PCA+N_PEAKS}, {N_PCA+N_PEAKS+N_META})  (N_META={N_META})")
print(f"PC panel:   {pc_panel}")

rng = np.random.default_rng(SEED)
tr_idx = np.where(tr_mask)[0]
va_idx = np.where(va_mask)[0]
tr_sub = rng.choice(tr_idx, size=min(TRAIN_SUB, tr_idx.size), replace=False)
va_sub = rng.choice(va_idx, size=min(VAL_SUB,   va_idx.size), replace=False)

Y_tr = y_train32[tr_sub]
Y_va = y_train32[va_sub]

# Pre-slice rows once
X_tr_full = x_train32[tr_sub]
X_va_full = x_train32[va_sub]

# Build ablation matrices (still CSR)
X_tr_pca   = X_tr_full[:, pca_cols]
X_va_pca   = X_va_full[:, pca_cols]

X_tr_peaks = X_tr_full[:, peaks_cols]
X_va_peaks = X_va_full[:, peaks_cols]

# Full = PCA + Peaks (+ meta if present)
# If meta exists, include it consistently across all experiments.
# (Day can matter; ablation is about PCA vs peaks, not about dropping meta.)
if N_META > 0:
    X_tr_meta = X_tr_full[:, meta_cols]
    X_va_meta = X_va_full[:, meta_cols]

    X_tr_pca_m   = sp.hstack([X_tr_pca,   X_tr_meta], format="csr")
    X_va_pca_m   = sp.hstack([X_va_pca,   X_va_meta], format="csr")

    X_tr_peaks_m = sp.hstack([X_tr_peaks, X_tr_meta], format="csr")
    X_va_peaks_m = sp.hstack([X_va_peaks, X_va_meta], format="csr")

    X_tr_full_m  = X_tr_full
    X_va_full_m  = X_va_full
else:
    X_tr_pca_m, X_va_pca_m = X_tr_pca, X_va_pca
    X_tr_peaks_m, X_va_peaks_m = X_tr_peaks, X_va_peaks
    X_tr_full_m, X_va_full_m = X_tr_full, X_va_full

paramsA = {**BASE_PARAMS, **cand_A}

# -----------------------------
# RUN ABLATIONS
# -----------------------------
ablations = [
    ("PCA_only",   X_tr_pca_m,   X_va_pca_m),
    ("Peaks_only", X_tr_peaks_m, X_va_peaks_m),
    ("Full",       X_tr_full_m,  X_va_full_m),
]

results = []
print("\n=== Running feature ablation (Candidate A) ===")
for name, Xtr, Xva in ablations:
    nnz = Xtr.nnz
    dens = nnz / (Xtr.shape[0] * Xtr.shape[1])
    print(f"\n-- {name} -- shape={Xtr.shape}, nnz={nnz:,}, density={dens:.4f}")

    mean_pr, per_pc, total_s = eval_one_ablation(Xtr, Xva, Y_tr, Y_va, pc_panel, paramsA)
    results.append((name, mean_pr, per_pc, total_s))

    print(f"{name}: mean Pearson over PCs = {mean_pr:.5f} | time={total_s/60:.1f} min")
    for (j, pr, best_it, best_rmse) in per_pc:
        print(f"  PC{j+1:>3}: pearson={pr:.5f} | best_iter={best_it:>4} | rmse={best_rmse:.4f}")

# -----------------------------
# SUMMARY
# -----------------------------
print("\n=== Ablation summary (higher mean Pearson is better) ===")
results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
for name, mean_pr, per_pc, total_s in results_sorted:
    print(f"{name:>10}: meanPearson={mean_pr:.5f} | time={total_s/60:.1f} min")

winner = results_sorted[0]
print(f"\nWinner: {winner[0]}")

# Cleanup big objects
del X_tr_full, X_va_full, X_tr_pca, X_va_pca, X_tr_peaks, X_va_peaks
gc.collect()


Total cols: 16582
PCA cols:   [0, 1000)
Peaks cols: [1000, 16000)
Meta cols:  [16000, 16582)  (N_META=582)
PC panel:   [1, 4, 9, 29]

=== Running feature ablation (Candidate A) ===

-- PCA_only -- shape=(20000, 1582), nnz=20,288,764, density=0.6412
PCA_only: mean Pearson over PCs = 0.76387 | time=3.9 min
  PC  2: pearson=0.88290 | best_iter= 175 | rmse=13.9719
  PC  5: pearson=0.91093 | best_iter= 432 | rmse=6.1703
  PC 10: pearson=0.71369 | best_iter= 268 | rmse=9.5020
  PC 30: pearson=0.54797 | best_iter= 273 | rmse=3.4331

-- Peaks_only -- shape=(20000, 15582), nnz=9,078,947, density=0.0291
Peaks_only: mean Pearson over PCs = 0.49625 | time=10.4 min
  PC  2: pearson=0.77826 | best_iter= 795 | rmse=17.5165
  PC  5: pearson=0.69959 | best_iter= 799 | rmse=10.6508
  PC 10: pearson=0.42399 | best_iter= 386 | rmse=11.1888
  PC 30: pearson=0.08314 | best_iter= 124 | rmse=4.0412

-- Full -- shape=(20000, 16582), nnz=29,078,947, density=0.0877
Full: mean Pearson over PCs = 0.76225 | time=12

0

In [None]:
# --- XGBoost Helper Functions ---

def kaggle_mean_cellwise_pearson(y_true, y_pred):
    """
    Calculates the mean Pearson correlation per cell (row-wise).
    Mimics the competition metric. Constant predictions yield -1.
    """
    yt = np.asarray(y_true)
    yp = np.asarray(y_pred)
    
    # Center data (subtract mean)
    y_pred_centered = yp - yp.mean(axis=1, keepdims=True)
    y_true_centered = yt - yt.mean(axis=1, keepdims=True)
    
    # Calculate numerator and denominator
    num = np.sum(y_true_centered * y_pred_centered, axis=1)
    den = np.sqrt(np.sum(y_true_centered ** 2, axis=1) * np.sum(y_pred_centered ** 2, axis=1))
    
    # Handle constant predictions (zero variance)
    const_pred = np.all(yp == yp[:, [0]], axis=1)
    
    corrs = np.empty(y_true.shape[0], dtype=float)
    corrs[const_pred] = -1.0  # Penalize constant predictions
    
    # Calculate valid correlations
    valid = (~const_pred) & (den > 0)
    corrs[valid] = num[valid] / den[valid]
    
    # Handle remaining edge cases
    corrs[~const_pred & (den == 0)] = np.nan
    
    return np.nanmean(corrs)

def split_indices_donor(X_df, donor_col="donor", random_state=77):
    """
    Splits data by holding out ONE donor for validation.
    Crucial for testing generalization to new batches/donors.
    """
    donors = X_df[donor_col].unique()
    rng = np.random.RandomState(random_state)
    val_donor = rng.choice(donors)
    
    train_idx = np.where(X_df[donor_col] != val_donor)[0]
    val_idx   = np.where(X_df[donor_col] == val_donor)[0]
    
    return train_idx, val_idx, val_donor

def to_float32(df):
    """Converts DataFrame to float32 numpy array (XGBoost friendly)."""
    return df.to_numpy().astype(np.float32, copy=False)

In [62]:
print(valid_ids[:20])

def kaggle_mean_cellwise_pearson(y_true, y_pred):
    """
    Mean per-cell Pearson correlation with the rule that constant prediction 
    should give -1 in correlation
    """
    yt = np.asarray(y_true)
    yp = np.asarray(y_pred)
    # constant prediction per row -> -1
    const_pred = np.all(yp == yp[:, [0]], axis=1)
    y_pred_centered = yp - yp.mean(axis=1, keepdims=True)
    y_true_centered = yt - yt.mean(axis=1, keepdims=True)
    num = np.sum(y_true_centered * y_pred_centered, axis=1)
    den = np.sqrt(np.sum(y_true_centered **2, axis=1) * np.sum(y_pred_centered**2, axis=1))
    corrs = np.empty(y_true.shape[0], dtype=float)
    corrs[const_pred] = -1.0
    # Pearson can be calculated
    valid = (~const_pred) & (den > 0)
    corrs[valid] = num[valid] / den[valid]
    # non-constant with zero denominator
    corrs[~const_pred & (den == 0)] = np.nan
    return np.nanmean(corrs)  
    
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupShuffleSplit
# --- FIRST CHANGE: load what you need to score in gene space ---
Y_pca_components = np.load("Y_ipca_components_300.npy")   # shape (300, n_genes)
Y_pca_mean       = np.load("Y_ipca_mean.npy")             # shape (n_genes,)

# Y_full must be the true RNA in gene space (n_cells, n_genes)
# If you don't already have it as an array/memmap, load it from the h5 (this will be slow if done repeatedly).
import tables
with tables.open_file("train_multi_targets.h5", "r") as f:
    Y_full = f.get_node("/train_multi_targets/block0_values")[:]   # shape (n_cells, n_genes)

X_dims = [500, 800, 1000]
Y_dims = [100, 200, 300]

groups = (np.array([hash(s) for s in valid_ids])
          & 0x7fffffffffffffff) % 50

gss = GroupShuffleSplit(n_splits=3, test_size=0.2, random_state=42)

for xd in X_dims:
    X = X_csr_1000[:, :xd]
    for yd in Y_dims:
        Y = Y_pca[:, :yd]

        scores = []
        for tr, va in gss.split(X, Y, groups):
            model = Ridge(alpha=1.0)
            model.fit(X[tr], Y[tr])
            pred = model.predict(X[va])                  # (n_val, yd)
            W = Y_pca_components[:yd, :]                 # (yd, n_genes)
            Y_pred_full = pred @ W + Y_pca_mean[None, :] # (n_val, n_genes)
            Y_true_full = Y_full[va]

            score = kaggle_mean_cellwise_pearson(Y_true_full, Y_pred_full)

            scores.append(score)

        print(
            f"X={xd:4d} Y={yd:3d}  "
            f"Pearson={np.mean(scores):.5f} ¬± {np.std(scores):.5f}"
        )


['56390cf1b95e' 'fc0c60183c33' '9b4a87e22ad0' '81cccad8cd81'
 '15cb3d85c232' 'a7791bcf1152' '072790e768b1' '404459b1005b'
 '627a5071cbd7' '00f283126092' '627703f5faa0' '3894c8880096'
 'e0af51ad3900' 'e31ca103a4ac' '47711761153f' 'c9d7ec67e230'
 '4495e228dcbd' '89c1d660a925' '7d66e9fac697' '73e80a80ac36']
X= 500 Y=100  Pearson=0.66602 ¬± 0.00015
X= 500 Y=200  Pearson=0.66599 ¬± 0.00015
X= 500 Y=300  Pearson=0.66595 ¬± 0.00015
X= 800 Y=100  Pearson=0.66601 ¬± 0.00015
X= 800 Y=200  Pearson=0.66595 ¬± 0.00015
X= 800 Y=300  Pearson=0.66589 ¬± 0.00015
X=1000 Y=100  Pearson=0.66599 ¬± 0.00015
X=1000 Y=200  Pearson=0.66591 ¬± 0.00015
X=1000 Y=300  Pearson=0.66584 ¬± 0.00015


In [11]:
import numpy as np
import pandas as pd

# 1. Load and Standardize Gene Names
all_gene_names = np.load("train_multi_genes.npy", allow_pickle=True)
if all_gene_names.dtype.kind == 'S':
    all_gene_names = np.char.decode(all_gene_names)
all_gene_names = all_gene_names.astype(str)

# 2. Standardize target_cols
if isinstance(target_cols, (pd.DataFrame, pd.Series)):
    target_cols = target_cols.values.flatten()
target_cols = np.asarray(target_cols)

# 3. Robust Logic
is_numeric = np.issubdtype(target_cols.dtype, np.number)

if is_numeric:
    print(f"‚úÖ Input detected as Integers. Checking bounds...")
    # Filter out any indices that are too large
    valid_mask = target_cols < len(all_gene_names)
    target_indices = target_cols[valid_mask].astype(int)
    
    if len(target_indices) < len(target_cols):
        print(f"‚ö†Ô∏è Warning: Dropped {len(target_cols) - len(target_indices)} indices that were out of bounds.")

else:
    print(f"üîç Input detected as Strings. Mapping to indices with validation...")
    target_cols_str = target_cols.astype(str)
    
    # Sort for fast searching
    sorter = np.argsort(all_gene_names)
    
    # Get insertion points
    insertion_idx = np.searchsorted(all_gene_names, target_cols_str, sorter=sorter)
    
    # Identify which ones are actually valid
    # 1. Must be within bounds (index < length)
    # 2. The value at that index must actually MATCH the target name
    is_in_bounds = insertion_idx < len(all_gene_names)
    
    # Check for exact matches
    # We only check the ones that are in bounds to avoid the IndexError you just saw
    valid_mask = is_in_bounds.copy()
    valid_mask[is_in_bounds] &= (all_gene_names[sorter[insertion_idx[is_in_bounds]]] == target_cols_str[is_in_bounds])
    
    # Select the valid indices
    target_indices = sorter[insertion_idx[valid_mask]]
    
    print(f"‚úÖ Found {len(target_indices)} of {len(target_cols)} requested genes.")
    if len(target_indices) < len(target_cols):
        print(f"‚ö†Ô∏è {len(target_cols) - len(target_indices)} genes were not found in the dataset and will be skipped.")

print(f"Evaluation locked to {len(target_indices)} genes.")

üîç Input detected as Strings. Mapping to indices with validation...
‚úÖ Found 1376144 of 9633008 requested genes.
‚ö†Ô∏è 8256864 genes were not found in the dataset and will be skipped.
Evaluation locked to 1376144 genes.


In [13]:
import gc
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupShuffleSplit

# 1. Load the TRUE RNA values into RAM (It's only ~9 GB, you have 62 GB)
# This makes everything much faster than reading from disk
import tables
with tables.open_file("train_multi_targets.h5", "r") as f:
    # Use the target_indices immediately to only load what we need
    # If target_indices is still broken, we default to all genes
    if len(target_indices) > 30000: 
        target_indices = np.arange(23418)
        
    print("Loading Validation Targets into RAM...")
    Y_full = f.get_node("/train_multi_targets/block0_values")[:]
    # Filter Y_full to only the genes we care about right now
    Y_full = Y_full[:, target_indices] 

print(f"‚úÖ Targets Loaded. Shape: {Y_full.shape}")

# 2. Training Loop
X_dims = [1000]
Y_dims = [100, 300] 

gss = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=42)





Loading Validation Targets into RAM...
‚úÖ Targets Loaded. Shape: (105942, 23418)

üöÄ Training: X=1000, Y=100


NameError: name 'groups' is not defined

In [14]:
import numpy as np

# 1. Load the Cell IDs (saved from Notebook 1)
cell_ids_bytes = np.load("train_multi_cell_ids.npy")

# 2. Decode them to strings (required for consistent hashing)
if cell_ids_bytes.dtype.kind == 'S':
    valid_ids = np.char.decode(cell_ids_bytes)
else:
    valid_ids = cell_ids_bytes.astype(str)

# 3. Create the 'groups' variable
# We use hashing to assign arbitrary groups if donor info isn't explicitly loaded
groups = (np.array([hash(s) for s in valid_ids]) & 0x7fffffffffffffff) % 50

print(f"‚úÖ Groups defined. Shape: {groups.shape}")
for xd in X_dims:
    X_feat = X_csr_1000[:, :xd]
    
    for yd in Y_dims:
        print(f"\nüöÄ Training: X={xd}, Y={yd}")
        
        # Slice Decoder Weights for the specific targets we are using
        W_sliced = Y_components[:yd, target_indices]
        Y_mean_sliced = Y_mean[target_indices]
        
        scores = []
        
        for split_i, (tr, va) in enumerate(gss.split(X_csr_1000, Y_pca, groups)):
            # Train
            model = Ridge(alpha=1.0)
            model.fit(X_feat[tr], Y_pca[tr, :yd])
            
            # Predict (Latent)
            pred_latent = model.predict(X_feat[va])
            
            # Decode (Gene Space)
            pred_gene = pred_latent @ W_sliced + Y_mean_sliced
            
            # Score
            # Y_full is already filtered and in memory, so we just slice rows [va]
            true_gene = Y_full[va]
            
            # Pearson Calculation (Vectorized)
            # Centering
            p_centered = pred_gene - pred_gene.mean(axis=1, keepdims=True)
            t_centered = true_gene - true_gene.mean(axis=1, keepdims=True)
            # Correlation
            num = np.sum(p_centered * t_centered, axis=1)
            den = np.sqrt(np.sum(p_centered**2, axis=1) * np.sum(t_centered**2, axis=1))
            corr = num / den
            corr[den == 0] = 0
            
            mean_score = np.nanmean(corr)
            scores.append(mean_score)
            print(f"  Split {split_i+1}: {mean_score:.5f}")
            
            del model, pred_latent, pred_gene, corr
            gc.collect()
            
        print(f"‚úÖ Result: X={xd} Y={yd} | Pearson={np.mean(scores):.5f}")

‚úÖ Groups defined. Shape: (105942,)

üöÄ Training: X=1000, Y=100
  Split 1: 0.66596
  Split 2: 0.66644
‚úÖ Result: X=1000 Y=100 | Pearson=0.66620

üöÄ Training: X=1000, Y=300
  Split 1: 0.66582
  Split 2: 0.66629
‚úÖ Result: X=1000 Y=300 | Pearson=0.66605
