In [3]:
# This cell is placed above the notebook's main import cell, so it imports `os`
# itself; otherwise it raises NameError when the notebook is run top to bottom.
import os

# Show the kernel's process id (handy for watching this process in top/htop).
print(os.getpid())

121798


In [2]:
import os

# Limit threads for numerical libraries to manage CPU usage.
# These variables have to be set BEFORE numpy / xgboost (or anything else linked
# against OpenBLAS / OpenMP) is imported: the native runtimes normally read them
# once, when the shared library is loaded, so setting them after the imports
# leaves the thread count unchanged.
os.environ["OPENBLAS_NUM_THREADS"] = "7"
os.environ["OMP_NUM_THREADS"] = "7"

import gc
import time
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import tables

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import make_scorer

# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
base_dir = "/home/skovtun/Python_projects/Kaggle/Single_cell/"
data_dir = os.path.join(base_dir, "data")
random_state = 77

# All file names used by the following cells are relative to the data directory.
os.chdir(data_dir)

In [3]:
# Build a table of human gene coordinates from an external GTF annotation file
# (GRCh38 genome). It is used further down to reduce, for every target gene,
# the number of input columns to those located near that gene.
#
# Only records whose third field is "gene" are kept. Example of a raw record
# (the 9 fields are TAB-separated in the file, shown here separated by " | "):
# 1 | havana | gene | 11869 | 14409 | . | + | . | gene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene";

gtf_path = "Homo_sapiens.GRCh38.98.gtf"

genes = []

with open(gtf_path) as f:
    for line in f:
        # Header / comment lines.
        if line.startswith("#"):
            continue

        fields = line.strip().split("\t")
        # Skip blank or truncated records instead of failing with IndexError on
        # fields[2] / fields[8]; a complete record has 9 fields.
        if len(fields) < 9 or fields[2] != "gene":
            continue

        chrom = fields[0]
        start = int(fields[3])
        end = int(fields[4])

        # The last field is a list of `key "value";` attributes.
        attr = fields[8]
        match_id = re.search(r'gene_id "([^"]+)"', attr)
        if not match_id:
            # The table is later indexed by gene_id, so a record without one is unusable.
            continue
        match_name = re.search(r'gene_name "([^"]+)"', attr)

        gene_id = match_id.group(1)
        # gene_name is optional; records without it get None.
        gene_name = match_name.group(1) if match_name else None

        genes.append([gene_id, gene_name, chrom, start, end])

gene_df = pd.DataFrame(genes, columns=["gene_id", "gene_name", "chr", "start", "end"])

# Rename chromosomes "1".."22", "X", "Y" to "chr1".."chr22", "chrX", "chrY";
# any other sequence name is not in the mapping and is kept unchanged by fillna.
mapping = {str(i): f"chr{i}" for i in range(1, 23)}
mapping['X'] = 'chrX'
mapping['Y'] = 'chrY'
gene_df['chr'] = gene_df['chr'].map(mapping).fillna(gene_df['chr'])

In [79]:
# Sanity check: (number of rows, number of columns) of the parsed gene table.
gene_df.shape

(60623, 5)

In [4]:
# Read a single row from each HDF5 store: only the column labels are needed here
# (input column names for `cols`, target column names for `cols_t`).
path = "train_multi_inputs.h5"
cols = pd.read_hdf(path, key="train_multi_inputs", start=0, stop=1).columns
path = "train_multi_targets.h5"
cols_t = pd.read_hdf(path, key = 'train_multi_targets', start = 0, stop=1).columns

# Split every input column name of the form "<chr>:<start>-<end>" into three
# columns: the location name plus integer start and end coordinates.
arr = pd.Series(cols.values)
r_p = r'([^:]+):([\d]+)-([\d]+)'
chr_ranges = arr.str.extract(r_p).rename(columns={0: 'chr', 1: 'start', 2: 'end'})
chr_ranges = chr_ranges.astype({'start': int, 'end': int})

# Reduce gene_df to the chromosomes present in chr_ranges, then to the gene ids
# that actually appear as target columns.
multi_chr = chr_ranges['chr'].unique().tolist()
_on_multi_chr = gene_df['chr'].isin(multi_chr)
gene_df_multi = gene_df.loc[_on_multi_chr].set_index('gene_id')

# Target columns that have no row in the filtered gene table.
missing = cols_t.difference(gene_df_multi.index)

_shared_ids = gene_df_multi.index.intersection(cols_t)
gene_df_multi = gene_df_multi.loc[_shared_ids]
gene_df_multi.shape

(23404, 4)

In [5]:
# For every target gene, collect the input columns (peaks) that lie completely
# inside the gene body extended by `window` positions on each side.
window = 2000

# Gene table restricted to the target genes, with the extended interval
# [left, right] and the gene id duplicated from the index into a regular column.
genes = gene_df_multi.loc[gene_df_multi.index.intersection(cols_t)].assign(
    left=lambda g: g["start"] - window,
    right=lambda g: g["end"] + window,
    gene_id=lambda g: g.index,
)

# Give the peak coordinates distinct names so they do not clash after merging.
chr_ranges = chr_ranges.rename(columns={"start": "start_peak", "end": "end_peak"})

# Placeholders kept from the original cell (only `results` is filled below).
target_cols = pd.DataFrame()
results = []
gene_id, start_peak, end_peak = [], [], []

_gene_fields = ['gene_id', 'chr', 'left', 'right']
_peak_fields = ['chr', 'start_peak', 'end_peak']

for contig, contig_peaks in chr_ranges.groupby("chr"):
    contig_genes = genes.loc[genes["chr"] == contig]
    if not (contig_genes.empty or contig_peaks.empty):
        # Every gene on this chromosome paired with every peak on it ...
        pairs = contig_genes[_gene_fields].merge(contig_peaks[_peak_fields], on="chr")
        # ... keeping only pairs where the peak is fully contained in the window.
        inside = pairs["start_peak"].ge(pairs["left"]) & pairs["end_peak"].le(pairs["right"])
        results.append(pairs[inside])

target_cols = pd.concat(results)

# Rebuild the original "<chr>:<start>-<end>" column label for each matched peak.
_peak_start = target_cols['start_peak'].astype(str)
_peak_end = target_cols['end_peak'].astype(str)
target_cols['col'] = target_cols['chr'].astype(str) + ":" + _peak_start + "-" + _peak_end

In [37]:
# Load the pre-computed arrays produced by Notebook 1.

# Input features: opened memory-mapped and read-only; converted to an in-memory
# float32 array only when stored with another dtype.
X_pca = np.load("X_csr_1000.npy", mmap_mode="r")
X_pca = X_pca if X_pca.dtype == np.float32 else np.asarray(X_pca, dtype=np.float32)

n_cells = X_pca.shape[0]

# Targets: raw float32 file viewed read-only as an (n_cells, 300) array, plus the
# components / mean arrays saved alongside it.
Y_pca = np.memmap("Y_train_pca_300.f32", dtype="float32", mode="r", shape=(n_cells, 300))
Y_components = np.load("Y_ipca_components_300.npy")
Y_mean = np.load("Y_ipca_mean.npy")

for _label, _matrix in (("X_pca", X_pca), ("Y_pca", Y_pca)):
    print(f"{_label}:", _matrix.shape, _matrix.dtype, type(_matrix))


X_pca: (105942, 1000) float32 <class 'numpy.memmap'>
Y_pca: (105942, 300) float32 <class 'numpy.memmap'>


In [7]:
# Score every target column by the sum of its absolute values across all rows
# of Y_components and keep the 1000 highest-scoring ones.
gene_importance = np.abs(Y_components).sum(axis=0)
ranked_genes = pd.Series(gene_importance, index=cols_t).sort_values(ascending=False)
genes_1000 = ranked_genes.iloc[:1000]

# Peaks that were matched to any of those genes.
_is_top_gene = target_cols['gene_id'].isin(genes_1000.index)
target_cols_1000 = target_cols[_is_top_gene]
peaks_1000 = target_cols_1000['col'].unique()

# Column labels of the input store, used to translate a peak name into its
# column position.
with tables.open_file("train_multi_inputs.h5", "r") as f:
    peaks = f.get_node("/train_multi_inputs/axis0")[:]
peaks_d = np.char.decode(peaks, encoding = 'utf-8')
peak_to_id = dict(zip(map(str, peaks_d), range(len(peaks_d))))

# Sorted, de-duplicated column positions of the selected peaks.
peaks_1000_idx = sorted({peak_to_id[name] for name in peaks_1000})

In [8]:
# Import the submodule explicitly: a bare `import scipy` exposes `scipy.sparse`
# only if the installed SciPy lazy-loads submodules or another package has
# already imported it.
import scipy.sparse

# Full sparse input matrix, then only the columns selected in peaks_1000_idx.
X_crs = scipy.sparse.load_npz('train_multi_cell.npz')
Xgene_1000 = X_crs[:,peaks_1000_idx]
print(f"Original shape: {X_crs.shape}")
print(f"Sliced shape: {Xgene_1000.shape}")

Original shape: (105942, 228942)
Sliced shape: (105942, 15581)


In [9]:
from scipy.sparse import hstack, csr_matrix

# The array loaded from "X_csr_1000.npy" is bound to `X_pca`; the name
# `X_csr_1000` previously used here is not defined in the visible part of this
# notebook and raises NameError on a fresh kernel.
X_pca_sparse = csr_matrix(X_pca)

# Place the pre-computed features and the selected peak columns side by side.
# format="csr" guarantees a row-indexable result (X is sliced by row lists
# later) whatever the formats of the individual blocks.
X = hstack([X_pca_sparse, Xgene_1000], format="csr")
X.shape

(105942, 16581)

In [10]:
# --- Random 80/20 train-test split over cells ---
# Cell names are read from the input store; their position in that array is the
# row id used for slicing further down.
with tables.open_file("train_multi_inputs.h5", "r") as f:
    values = f.get_node("/train_multi_inputs/axis1")[:]
cell_names = np.char.decode(values, encoding = 'utf-8')
cell_to_id = dict(zip(map(str, cell_names), range(len(cell_names))))

# Seeded shuffle of the names, so the split is reproducible.
random_state=77
rng = np.random.default_rng(random_state)
shuffled_names = rng.permutation(cell_names)

# First 80% of the shuffled names go to train, the remainder to test.
split_point = int(len(shuffled_names) * 0.8)
train_names, test_names = shuffled_names[:split_point], shuffled_names[split_point:]

# Translate names back to row positions, in ascending order.
train_id = sorted(cell_to_id[name] for name in train_names)
test_id = sorted(cell_to_id[name] for name in test_names)


In [100]:
import numpy as np

# NOTE: this check reads `multi_metadata`, which is created by the
# metadata-loading cell located further down in this notebook; that cell has to
# be executed before this one.

# 1. Length check: both sources must describe the same number of rows.
_n_h5, _n_meta = len(cell_names), len(multi_metadata)
print(f"H5 Names Count: {_n_h5}")
print(f"Meta Rows Count: {_n_meta}")
assert _n_h5 == _n_meta, "❌ Mismatch in length!"

# 2. Order check: the DataFrame index must equal the H5 name list position by
# position.
are_aligned = np.array_equal(multi_metadata.index.values, cell_names)

if not are_aligned:
    print("❌ WARNING: The order is different!")
    # Report only the first position where the two name lists disagree.
    _first_diff = next(
        (
            (_pos, _h5_name, _meta_name)
            for _pos, (_h5_name, _meta_name) in enumerate(zip(cell_names, multi_metadata.index))
            if _h5_name != _meta_name
        ),
        None,
    )
    if _first_diff is not None:
        _pos, _h5_name, _meta_name = _first_diff
        print(f"First mismatch at row {_pos}:")
        print(f"  H5 says:   {_h5_name}")
        print(f"  Meta says: {_meta_name}")
else:
    print("✅ VERIFIED: The metadata is perfectly aligned with the H5 file.")

H5 Names Count: 105942
Meta Rows Count: 105942
✅ VERIFIED: The metadata is perfectly aligned with the H5 file.


In [11]:
# Row-slice the stacked feature matrix with the train / test row positions.
inputs_train = X[train_id, :]
inputs_test = X[test_id, :]

# Only the first 100 columns of Y_pca are used as targets.
y_subset = Y_pca[:, :100]
y_train = y_subset[train_id, :]
y_test = y_subset[test_id, :]

print(*(_part.shape for _part in (inputs_train, inputs_test, y_train, y_test)))

(84753, 16581) (21189, 16581) (84753, 100) (21189, 100)


In [14]:
# Loading metadata: the main table plus a second CSV that is appended to it.
metadata_old = pd.read_csv('metadata.csv', index_col = 'cell_id')
fix = pd.read_csv('metadata_cite_day_2_donor_27678.csv', index_col = 'cell_id')
metadata = pd.concat([metadata_old, fix], axis = 0)
del metadata_old, fix

# Keep only the cells of the input matrix, in the same order as cell_names.
multi_metadata  = metadata.loc[cell_names,['day','donor']]
del metadata

# multi_metadata is sliced positionally (.iloc) with the same row positions as
# the feature matrix. A cell_id that occurs more than once in the concatenated
# metadata would make .loc return extra rows and silently shift those positions,
# so fail loudly here instead.
assert len(multi_metadata) == len(cell_names), (
    f"metadata rows ({len(multi_metadata)}) != cells ({len(cell_names)}): "
    "duplicate cell_id entries in the metadata files?"
)


In [16]:
# Metadata rows of the train / test cells (same positional ids as the features).
meta_train, meta_test = multi_metadata.iloc[train_id], multi_metadata.iloc[test_id]

# Append `day` as one extra feature column.
# The column is cast to the dtype of the feature matrix first: hstack upcasts all
# blocks to a common dtype, so an integer day column next to float32 features
# would otherwise promote the whole stacked matrix to a wider dtype.
# format="csr" keeps the result row-indexable.
day_col_train = meta_train['day'].values.reshape(-1,1).astype(inputs_train.dtype)
day_sparse_train = csr_matrix(day_col_train)
x_train = hstack([inputs_train, day_sparse_train], format="csr")

day_col_test = meta_test['day'].values.reshape(-1,1).astype(inputs_test.dtype)
day_sparse_test = csr_matrix(day_col_test)
x_test = hstack([inputs_test, day_sparse_test], format="csr")


In [17]:
# Sanity check: the stacked training matrix has one more column than inputs_train
# (the appended `day` column).
x_train.shape

(84753, 16582)

In [25]:
# 1. Pick one donor at random (fixed seed) to hold out for validation.
donors = meta_train['donor'].unique()
rng = np.random.default_rng(seed=77)
val_donor = rng.choice(donors)

print(f" Randomly Selected Holdout Donor: {val_donor}")

# 2. Boolean masks over the training rows, and the matching integer positions:
#    tr_* -> rows of every other donor, va_* -> rows of the held-out donor.
tr_mask = meta_train['donor'].ne(val_donor).values
va_mask = meta_train['donor'].eq(val_donor).values

tr_idx = np.flatnonzero(tr_mask)
va_idx = np.flatnonzero(va_mask)

 Randomly Selected Holdout Donor: 32606
