In [16]:
# (scratch) runtime / memory check — safe to delete before publishing
# This cell sits above the imports cell, so `os` is imported locally; without
# it a fresh "Restart Kernel -> Run All" stops here with a NameError.
import os

print(os.getpid())

89018


In [11]:
import os

# Limit threads for numerical libraries to manage CPU usage.
# These variables are read when the native BLAS/OpenMP runtimes are loaded,
# so they have to be set BEFORE numpy (and everything built on it) is imported;
# setting them afterwards leaves the thread pools at their defaults.
os.environ["OPENBLAS_NUM_THREADS"] = "7"
os.environ["OMP_NUM_THREADS"] = "7"

import gc
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import make_scorer

# Project locations and the seed shared by the stochastic steps below.
# NOTE(review): base_dir is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
base_dir = "/home/skovtun/Python_projects/Kaggle/Single_cell/"
data_dir = os.path.join(base_dir, "data")
random_state = 77

# Later cells that use bare file names resolve them against data_dir.
os.chdir(data_dir)

In [2]:
# Fixing Day 2 issues in the metadata file: the rows from the
# day-2 / donor-27678 fix file are appended below the original metadata.
# Paths are built from data_dir (as the other loading cells do) so this cell
# does not depend on the kernel's current working directory.
metadata_old = pd.read_csv(os.path.join(data_dir, 'metadata.csv'), index_col='cell_id')
fix = pd.read_csv(os.path.join(data_dir, 'metadata_cite_day_2_donor_27678.csv'), index_col='cell_id')
metadata = pd.concat([metadata_old, fix], axis=0)

# The two source frames are no longer needed once combined.
del fix, metadata_old

## Appendix: archived cell-type–specific experiments (not used in final pipeline)

# Splitting the CITE test set into 7 groups based on cell type.
# The fix file has more columns than the initial test file, so its columns are trimmed
# so that the datasets are aligned.
# import time
# evaluation_old = pd.read_hdf('Single_cell_data/test_cite_inputs.h5')
# fix = pd.read_hdf('Single_cell_data/test_cite_inputs_day_2_donor_27678.h5')
# common_cols = evaluation_old.columns
# fix_aligned = fix[common_cols]
# cite_evaluation = pd.concat([evaluation_old,fix_aligned], axis = 0)
# del evaluation_old, fix
# cite_metadata = metadata[metadata['technology'] == 'citeseq']
# cell_types = cite_metadata['cell_type'].unique()
# base_dir = "/home/skovtun/Python_projects/Kaggle/Single cell"
# data_dir = os.path.join(base_dir, "Single_cell_data")

# os.chdir(data_dir)
# for cell_type in cell_types:
#     print(f"Processing {cell_type}...")
#     start_time = time.time()
#     idx = cite_metadata[cite_metadata['cell_type'] == cell_type].index
#     valid_ids = idx.intersection(cite_evaluation.index)
#     subset = cite_evaluation.loc[valid_ids]

#     # write directly to file under its own key
#     subset.to_parquet(
#     f"X_eval{cell_type}.parquet",
#     engine="pyarrow",
#     index=True,
#     coerce_timestamps="ms",
#     allow_truncated_timestamps=True,
#     version="2.6")
#     print((time.time() - start_time)/60)
#     # free memory immediately
#     del subset

# os.chdir(base_dir)
# import gc; gc.collect()

# os.chdir(data_dir)
# targets = pd.read_hdf('train_cite_targets.h5') #all type of cells
# inputs1 = pd.read_parquet('X_HSC.parquet')
# inputs2 = pd.read_parquet('X_MasP.parquet')
# inputs3 = pd.read_parquet('X_MkP.parquet')
# inputs4 = pd.read_parquet('X_MoP.parquet')
# inputs5 = pd.read_parquet('X_NeuP.parquet')
# inputs6 = pd.read_parquet('X_BP.parquet')
# inputs7 = pd.read_parquet('X_EryP.parquet')

# inputs = pd.concat([inputs1,inputs2,inputs3,inputs4,inputs5,inputs6, inputs7], axis = 0)
# del inputs1,inputs2,inputs3,inputs4,inputs5,inputs6,inputs7

# evaluation1 = pd.read_parquet('X_evalHSC.parquet')
# evaluation2 = pd.read_parquet('X_evalMasP.parquet')
# evaluation3 = pd.read_parquet('X_evalMkP.parquet')
# evaluation4 = pd.read_parquet('X_evalMoP.parquet')
# evaluation5 = pd.read_parquet('X_evalNeuP.parquet')
# evaluation6 = pd.read_parquet('X_evalBP.parquet')
# evaluation7 = pd.read_parquet('X_evalEryP.parquet')
# evaluation = pd.concat([evaluation1,evaluation2,evaluation3,evaluation4,evaluation5,evaluation6,evaluation7], axis=0)

# del evaluation1,evaluation2,evaluation3,evaluation4,evaluation5,evaluation6,evaluation7

In [3]:
# --- Load and Process Training Data ---

print("Loading CITEseq training data...")

# Metadata rows whose technology is CITE-seq; used to filter both train and evaluation cells.
cite_metadata = metadata.loc[metadata['technology'] == 'citeseq']

# Main training inputs, restricted to cell ids present in the CITE-seq metadata.
cite_train = pd.read_hdf(os.path.join(data_dir, 'train_cite_inputs.h5'))
valid_ids = cite_metadata.index.intersection(cite_train.index)
cite_train = cite_train.loc[valid_ids]

# --- Load and Process Evaluation Data ---
# The evaluation (test) inputs come in two files: the base file and the Day 2 donor fix.

print("Loading CITEseq evaluation data and applying fix...")
eval_parts = [
    pd.read_hdf(os.path.join(data_dir, fname))
    for fname in ('test_cite_inputs.h5', 'test_cite_inputs_day_2_donor_27678.h5')
]

# The fix is reduced to the base file's columns before the two parts are stacked.
common_cols = eval_parts[0].columns
eval_parts[1] = eval_parts[1][common_cols]
cite_evaluation = pd.concat(eval_parts, axis=0)

# Same metadata-based filter as for the training inputs.
valid_eval_ids = cite_metadata.index.intersection(cite_evaluation.index)
cite_evaluation = cite_evaluation.loc[valid_eval_ids]

# Release the per-file frames.
del eval_parts
gc.collect()

print(f"Data loaded. Train shape: {cite_train.shape}, Evaluation shape: {cite_evaluation.shape}")

Loading CITEseq training data...
Loading CITEseq evaluation data and applying fix...
Data loaded. Train shape: (70988, 22050), Evaluation shape: (55679, 22050)


In [4]:
# --- Feature Selection Strategy ---
# Goal: Retain raw features for genes that directly code for the target proteins (Surface Markers).
# All other genes will be compressed using PCA to reduce dimensionality while preserving variance.

# 1. Load Targets
targets = pd.read_hdf(os.path.join(data_dir, 'train_cite_targets.h5'))

# 2. Identify "Important" Genes using Barcode Lookup
# We use an external file (TotalSeq antibody reference)
# from https://www.biolegend.com/en-us/totalseq/barcode-lookup
# to map protein targets to their corresponding gene IDs.
barcode_path = os.path.join(data_dir, 'Totalseq_a.csv')
barcode = pd.read_csv(barcode_path)

proteins = targets.columns
all_protein_barcodes = barcode[barcode['Description'].isin(proteins)].sort_values('Description')

# Filter for Human reactivity to exclude mouse controls.
# na=False: rows with a missing 'Reactivity' count as non-human instead of
# putting NaN into the boolean mask (which boolean indexing rejects).
is_human = all_protein_barcodes['Reactivity'].str.contains('Human', na=False)
h_protein_barcodes = all_protein_barcodes[is_human]
ens_gene_ids = list(h_protein_barcodes['Ensembl Gene Id'].unique())

del barcode, all_protein_barcodes, h_protein_barcodes
gc.collect()

# 3. Clean Column Names (keep only the part before the first '_' to match gene IDs)
clean_input_cols = [c.split('_')[0] for c in cite_train.columns]
eval_input_cols = [c.split('_')[0] for c in cite_evaluation.columns]

# Verify consistency between train and test sets
assert clean_input_cols == eval_input_cols

cite_train.columns = clean_input_cols
cite_evaluation.columns = eval_input_cols

# 4. Split Features into "Important" (keep raw) vs "Background" (PCA)
# Both lists follow the original column order of cite_train. Building them via
# list(set(...)) would give an order that changes between kernel restarts
# (string hashing is randomised per process), which makes the feature order fed
# to the downstream models irreproducible.
important_genes = set(ens_gene_ids)
ordered_cols = list(dict.fromkeys(cite_train.columns))  # unique names, first-seen order
common_cols = [c for c in ordered_cols if c in important_genes]
pca_cols = [c for c in ordered_cols if c not in important_genes]

print(f"Directly relevant features (preserved): {len(common_cols)}")
print(f"Features to compress via PCA: {len(pca_cols)}")

Directly relevant features (preserved): 103
Features to compress via PCA: 21947


In [5]:
# --- Data Splitting and Feature Separation ---

# 1. Keep only the cells that have both an input row and a target row.
valid_ids = cite_train.index.intersection(targets.index)
aligned_inputs = cite_train.loc[valid_ids]
aligned_targets = targets.loc[valid_ids]

# 2. Seeded random 80/20 split over the cell ids.
rng = np.random.default_rng(random_state)
shuffled_idx = rng.permutation(valid_ids)
split_point = int(len(shuffled_idx) * 0.8)
train_idx, test_idx = shuffled_idx[:split_point], shuffled_idx[split_point:]

# 3. Row subsets, each divided into two column groups:
#   *_orig        -> columns in common_cols, kept as-is
#   *_to_compress -> columns in pca_cols, reduced by PCA in a later cell
print("Splitting data into Train/Test and separating features...")

train_rows = aligned_inputs.loc[train_idx]
test_rows = aligned_inputs.loc[test_idx]

inputs_train_orig, inputs_train_to_compress = train_rows[common_cols], train_rows[pca_cols]
inputs_test_orig, inputs_test_to_compress = test_rows[common_cols], test_rows[pca_cols]

# Targets follow the same row split.
y_train = aligned_targets.loc[train_idx]
y_test = aligned_targets.loc[test_idx]

# The evaluation set (final submission data) gets the same column separation.
eval_orig, eval_to_compress = cite_evaluation[common_cols], cite_evaluation[pca_cols]

# Drop the large frames that are not referenced again.
del cite_train, cite_evaluation, aligned_inputs, aligned_targets, train_rows, test_rows, targets
gc.collect()

print(f"Train samples: {len(train_idx)}, Test samples: {len(test_idx)}")

Splitting data into Train/Test and separating features...
Train samples: 56790, Test samples: 14198


In [6]:
# --- Scaling and Dimensionality Reduction (PCA) ---

# 1. Standard Scaling (on background genes only)
# Fit ONLY on training data to prevent data leakage
scaler = StandardScaler()

inputs_train_scaled = scaler.fit_transform(inputs_train_to_compress)
inputs_test_scaled = scaler.transform(inputs_test_to_compress)
evaluation_scaled = scaler.transform(eval_to_compress)

# Free memory
del inputs_train_to_compress, inputs_test_to_compress, eval_to_compress
gc.collect()

# 2. PCA Compression
# Reducing background transcriptome to 200 latent features.
# random_state is passed explicitly: the seeded constructor call had been
# swallowed into the comment above, leaving the live call unseeded. With an
# input this large scikit-learn may pick its randomized SVD solver, in which
# case the components would differ from run to run.
pca = PCA(n_components=200, random_state=random_state)
inputs_train_pca = pca.fit_transform(inputs_train_scaled)
inputs_test_pca = pca.transform(inputs_test_scaled)
evaluation_pca = pca.transform(evaluation_scaled)

# Free memory
del inputs_train_scaled, inputs_test_scaled, evaluation_scaled
gc.collect()

0

In [8]:
# --- Feature Combination ---
# The raw "important" gene columns are placed side by side with the PCA components.
# Note: Dealing with this mixture makes it difficult to simply regress out the donor effect
# at this stage because the signal is very strong and complex (linear/non-linear).

# Column labels: raw gene columns first, then PC1..PCk.
pca_col_names = [f'PC{n}' for n in range(1, inputs_train_pca.shape[1] + 1)]
all_cols = [*inputs_train_orig.columns, *pca_col_names]

print(f"Total feature count: {len(all_cols)}")

# Each frame is the horizontal stack [raw genes | PCA components], indexed by its cell ids.
X_train_all = pd.DataFrame(
    np.hstack([inputs_train_orig.values, inputs_train_pca]),
    index=train_idx,
    columns=all_cols,
)
X_test_all = pd.DataFrame(
    np.hstack([inputs_test_orig.values, inputs_test_pca]),
    index=test_idx,
    columns=all_cols,
)
X_eval_all = pd.DataFrame(
    np.hstack([eval_orig.values, evaluation_pca]),
    index=valid_eval_ids,
    columns=all_cols,
)

# The separate pieces are no longer needed once combined.
del inputs_train_orig, inputs_train_pca
del inputs_test_orig, inputs_test_pca, eval_orig, evaluation_pca
gc.collect()

Total feature count: 303


1042

In [9]:
# --- Metadata Enrichment ---
# Attach day, donor and cell type to every feature frame.
# The join is a left merge on the index (cell_id), so each feature row is kept.

meta_cols = ['day', 'donor', 'cell_type']
meta_subset = metadata[meta_cols]

# Same merge for the train, test and evaluation frames built in the previous step.
X_train, X_test, X_eval = (
    pd.merge(frame, meta_subset, how='left', left_index=True, right_index=True)
    for frame in (X_train_all, X_test_all, X_eval_all)
)

# Free intermediate memory
del X_train_all, X_test_all, X_eval_all, metadata, meta_subset
gc.collect()

print("Metadata merged.")
print(f"X_train shape: {X_train.shape}")

Metadata merged.
X_train shape: (56790, 306)


In [10]:
# --- Categorical Encoding ---
# Converting 'cell_type' into One-Hot Encoded features.
# This allows the model to explicitly use cell identity (e.g., HSC vs EryP) as a predictor.

# 1. One shared categorical dtype, with categories taken from the TRAINING set.
# Encoding each frame independently would give a different column set (or
# order) whenever a cell type is missing from one split. Later cells convert
# the frames to bare NumPy arrays, so such a mismatch would silently misalign features.
train_cell_types = sorted(X_train['cell_type'].dropna().unique())
cell_type_dtype = pd.CategoricalDtype(categories=train_cell_types)

X_train['cell_type'] = X_train['cell_type'].astype(cell_type_dtype)
X_test['cell_type']  = X_test['cell_type'].astype(cell_type_dtype)
X_eval['cell_type']  = X_eval['cell_type'].astype(cell_type_dtype)

# 2. Apply One-Hot Encoding
# We use the 'ct' prefix (e.g., 'ct_HSC') to make these features easy to track.
# With a categorical dtype, get_dummies emits one column per declared category,
# including categories that do not occur in a given frame. A cell type unseen in
# training becomes missing under the shared dtype and so gets all-zero indicators.
X_train = pd.get_dummies(X_train, columns=['cell_type'], prefix='ct', dummy_na=False)
X_test  = pd.get_dummies(X_test,  columns=['cell_type'], prefix='ct', dummy_na=False)
X_eval  = pd.get_dummies(X_eval,  columns=['cell_type'], prefix='ct', dummy_na=False)

# Guard: all three frames must expose identical columns in identical order.
assert list(X_train.columns) == list(X_test.columns) == list(X_eval.columns), \
    "Train/Test/Eval feature columns are not aligned after one-hot encoding"

print("One-Hot Encoding complete.")
print(f"Current feature shape: {X_train.shape}")

One-Hot Encoding complete.
Current feature shape: (56790, 312)


In [12]:
# --- Representative Protein Selection (Optimal Grid Strategy) ---
# Strategy: Select ~12 proteins that span the full "Signal vs. Difficulty" space.
# Axis 1: Modeling Difficulty (Ridge Baseline Pearson)
# Axis 2: Signal Strength (Variance)
print("Assessing protein modeling difficulty (Pearson)...")

# 1. Define Custom Scorer (Pearson Correlation)
def pearson_score(y_true, y_pred):
    """Pearson correlation between two 1-D arrays, for use as a scorer.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values of equal length.

    Returns
    -------
    float
        The correlation coefficient, or 0.0 when either input is (numerically)
        constant. In that case the correlation is undefined and
        ``np.corrcoef`` would return NaN, which would turn an averaged
        cross-validation score into NaN as well.
    """
    # Zero variance on either side makes the denominator zero.
    if np.std(y_pred) < 1e-9 or np.std(y_true) < 1e-9:
        return 0.0
    return np.corrcoef(y_true, y_pred)[0, 1]

pearson_scorer = make_scorer(pearson_score)

# 2. Per-protein statistics
#    "score": mean 3-fold cross-validated Pearson of a Ridge baseline (difficulty axis)
#    "var":   variance of the training target (signal-strength axis)
protein_names = y_train.columns
ridge = Ridge(alpha=1.0)

stats = [
    {
        "protein": protein,
        "var": np.var(y_train[protein].values),
        "score": cross_val_score(
            ridge, X_train, y_train[protein],
            cv=3,
            scoring=pearson_scorer,
            n_jobs=-1,
        ).mean(),
    }
    for protein in protein_names
]
stats_df = pd.DataFrame(stats)

# 3. 2x2 grid: each axis is cut at its median into two bins.
stats_df["var_bin"] = pd.qcut(stats_df["var"], 2, labels=["Low Signal", "High Signal"])
stats_df["score_bin"] = pd.qcut(stats_df["score"], 2, labels=["Hard", "Easy"])

# 4. From every non-empty quadrant take the lowest-scoring protein, the one
#    closest to the quadrant's median score, and the highest-scoring one.
rep_proteins = []
quadrants = [
    (signal, difficulty)
    for signal in ["Low Signal", "High Signal"]
    for difficulty in ["Hard", "Easy"]
]

for signal, difficulty in quadrants:
    in_quadrant = (stats_df["var_bin"] == signal) & (stats_df["score_bin"] == difficulty)
    block = stats_df[in_quadrant]
    if block.empty:
        continue

    quadrant_scores = block["score"]
    picks = (
        quadrant_scores.argmin(),                                       # worst in class
        (quadrant_scores - quadrant_scores.median()).abs().argmin(),    # typical case
        quadrant_scores.argmax(),                                       # best in class
    )
    rep_proteins.extend(block.iloc[pos]["protein"] for pos in picks)

# Remove repeats while keeping first-seen order.
rep_proteins = list(dict.fromkeys(rep_proteins))
print(f"Selected {len(rep_proteins)} proteins spanning biological & technical diversity.")
print(rep_proteins)

Assessing protein modeling difficulty (Pearson)...
Selected 12 proteins spanning biological & technical diversity.
['CD194', 'CD196', 'CD105', 'CD278', 'CD335', 'CD141', 'IgD', 'CD1c', 'CD19', 'CD86', 'CD155', 'CD41']


In [13]:
# 2. Manual Override: targets that are always added to the tuning list:
#    - 'Mouse-IgG2a': Isotype control (sanity check for overfitting).
#    - 'CD22': B-cell marker to ensure coverage of the rare 'BP' cell type.
manual_adds = ['Mouse-IgG2a', 'CD22']

# Append the overrides, drop repeats (first occurrence wins), and keep only
# names that are actual target columns.
rep_proteins = [
    name
    for name in dict.fromkeys([*rep_proteins, *manual_adds])
    if name in y_train.columns
]

print(f"Selected {len(rep_proteins)} proteins for tuning.")
print(f"Includes manual overrides: {manual_adds}")
print(rep_proteins)

Selected 14 proteins for tuning.
Includes manual overrides: ['Mouse-IgG2a', 'CD22']
['CD194', 'CD196', 'CD105', 'CD278', 'CD335', 'CD141', 'IgD', 'CD1c', 'CD19', 'CD86', 'CD155', 'CD41', 'Mouse-IgG2a', 'CD22']


In [14]:
# --- XGBoost Helper Functions ---

def kaggle_mean_cellwise_pearson(y_true, y_pred):
    """
    Calculates the mean Pearson correlation per cell (row-wise).
    Mimics the competition metric. Constant predictions yield -1.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like (DataFrame, ndarray or nested list)
        Same shape; rows are compared position by position.

    Returns
    -------
    float
        Mean of the per-row correlations. Rows whose correlation is undefined
        (constant truth with non-constant prediction, or NaN in the inputs)
        are excluded from the mean.
    """
    yt = np.asarray(y_true)
    yp = np.asarray(y_pred)

    # Center data (subtract each row's mean)
    y_pred_centered = yp - yp.mean(axis=1, keepdims=True)
    y_true_centered = yt - yt.mean(axis=1, keepdims=True)

    # Row-wise covariance term and product of norms
    num = np.sum(y_true_centered * y_pred_centered, axis=1)
    den = np.sqrt(np.sum(y_true_centered ** 2, axis=1) * np.sum(y_pred_centered ** 2, axis=1))

    # Rows where every prediction equals the first one (zero variance)
    const_pred = np.all(yp == yp[:, [0]], axis=1)

    # Start from NaN rather than uninitialised memory: a row whose denominator
    # is NaN matches neither `den > 0` nor `den == 0` and would otherwise keep
    # whatever bytes np.empty handed back. The size comes from the converted
    # array so plain lists work too.
    corrs = np.full(yt.shape[0], np.nan, dtype=float)
    corrs[const_pred] = -1.0  # Penalize constant predictions

    # Rows with a well-defined correlation
    valid = (~const_pred) & (den > 0)
    corrs[valid] = num[valid] / den[valid]

    # Everything else stays NaN and is skipped by nanmean.
    return np.nanmean(corrs)

def split_indices_donor(X_df, donor_col="donor", random_state=77):
    """
    Hold out ONE randomly chosen donor as the validation set.

    Returns a tuple ``(train_idx, val_idx, val_donor)`` where the two index
    arrays are positional row numbers into ``X_df`` and ``val_donor`` is the
    value of ``donor_col`` that was held out. The choice is seeded by
    ``random_state``.
    """
    donor_values = X_df[donor_col]
    held_out = np.random.RandomState(random_state).choice(donor_values.unique())

    # Boolean mask of the held-out donor's rows; everything else is training.
    is_val = (donor_values == held_out).to_numpy()
    return np.flatnonzero(~is_val), np.flatnonzero(is_val), held_out

def to_float32(df):
    """Return the DataFrame's values as a float32 NumPy array (XGBoost friendly)."""
    values = df.to_numpy()
    # copy=False: reuse the array when it is already float32.
    return values.astype(np.float32, copy=False)

In [15]:
# --- XGBoost Hyperparameter Tuning (Final Champion vs. Challengers) ---
# The previous best configuration is compared against three alternatives on the
# representative protein list, using one held-out donor as validation data.

import xgboost as xgb

# 1. Candidate configurations
candidate_params = [
    # Candidate A: "The Champion" (previous winner), with L1 regularization (alpha=0.3).
    {"max_depth": 5, "min_child_weight": 40, "subsample": 0.88, "colsample_bytree": 0.4,
     "learning_rate": 0.05, "reg_alpha": 0.3, "reg_lambda": 4.0, "gamma": 0.0},

    # Candidate B: "The Deep & Rigid" — deeper trees (6) with a large L2 penalty (lambda=15).
    {"max_depth": 6, "min_child_weight": 30, "subsample": 0.85, "colsample_bytree": 0.4,
     "learning_rate": 0.05, "reg_alpha": 0.1, "reg_lambda": 15.0, "gamma": 0.0},

    # Candidate C: "The Conservative" — shallower trees (4), larger min_child_weight (60).
    {"max_depth": 4, "min_child_weight": 60, "subsample": 0.80, "colsample_bytree": 0.45,
     "learning_rate": 0.05, "reg_alpha": 0.0, "reg_lambda": 5.0, "gamma": 0.0},

    # Candidate D: "The Feature Bagger" — low column sampling (0.3), high row sampling (0.9).
    {"max_depth": 5, "min_child_weight": 40, "subsample": 0.9, "colsample_bytree": 0.3,
     "learning_rate": 0.05, "reg_alpha": 0.5, "reg_lambda": 5.0, "gamma": 0.1},
]

# 2. Donor-based split of the training frame (positional indices)
tr_idx, va_idx, val_donor = split_indices_donor(X_train, donor_col="donor", random_state=77)

print(f"Validation Donor: {val_donor}")
print(f"Train size: {len(tr_idx)} | Validation size: {len(va_idx)}")

# Features: everything except the 'day' and 'donor' metadata columns.
X_feat = X_train.drop(columns=['day', 'donor'])
X_tr = to_float32(X_feat.iloc[tr_idx])
X_va = to_float32(X_feat.iloc[va_idx])

# Targets: only the representative proteins.
Y_tr = y_train.iloc[tr_idx][rep_proteins].to_numpy(dtype=np.float32)
Y_va = y_train.iloc[va_idx][rep_proteins].to_numpy(dtype=np.float32)

# The DMatrices are built once; only their labels change per protein.
dtrain = xgb.DMatrix(X_tr)
dvalid = xgb.DMatrix(X_va)

# 3. Tuning loop
fixed_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "seed": 77,
    "verbosity": 0,
}
scores = []

print(f"\nStarting tuning on {len(rep_proteins)} representative proteins...")

for k, cand in enumerate(candidate_params, 1):
    # Candidate values are layered on top of the fixed settings.
    params = {**fixed_params, **cand}

    preds_accum = np.zeros_like(Y_va, dtype=np.float32)

    # One booster per representative protein (column j of the target matrices).
    for j in range(len(rep_proteins)):
        dtrain.set_label(Y_tr[:, j])
        dvalid.set_label(Y_va[:, j])

        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=5000,
            evals=[(dvalid, "valid")],
            early_stopping_rounds=100,
            verbose_eval=False
        )

        # Predict with the trees up to and including the best iteration.
        best_round = booster.best_iteration + 1
        preds_accum[:, j] = booster.predict(dvalid, iteration_range=(0, best_round))

    # Row-wise Pearson across the representative proteins.
    score = kaggle_mean_cellwise_pearson(Y_va, preds_accum)
    scores.append({"cand": cand, "score": score})

    print(f"[{k}/{len(candidate_params)}] Score: {score:.5f} | Params: {cand}")

# 4. Highest-scoring candidate
best_result = max(scores, key=lambda entry: entry["score"])
best_params = best_result["cand"]

print("\n--- Best Configuration ---")
print(f"Score: {best_result['score']:.5f}")
print(best_params)

Validation Donor: 31800
Train size: 36965 | Validation size: 19825

Starting tuning on 14 representative proteins...
[1/4] Score: 0.81211 | Params: {'max_depth': 5, 'min_child_weight': 40, 'subsample': 0.88, 'colsample_bytree': 0.4, 'learning_rate': 0.05, 'reg_alpha': 0.3, 'reg_lambda': 4.0, 'gamma': 0.0}
[2/4] Score: 0.81235 | Params: {'max_depth': 6, 'min_child_weight': 30, 'subsample': 0.85, 'colsample_bytree': 0.4, 'learning_rate': 0.05, 'reg_alpha': 0.1, 'reg_lambda': 15.0, 'gamma': 0.0}
[3/4] Score: 0.81055 | Params: {'max_depth': 4, 'min_child_weight': 60, 'subsample': 0.8, 'colsample_bytree': 0.45, 'learning_rate': 0.05, 'reg_alpha': 0.0, 'reg_lambda': 5.0, 'gamma': 0.0}
[4/4] Score: 0.81170 | Params: {'max_depth': 5, 'min_child_weight': 40, 'subsample': 0.9, 'colsample_bytree': 0.3, 'learning_rate': 0.05, 'reg_alpha': 0.5, 'reg_lambda': 5.0, 'gamma': 0.1}

--- Best Configuration ---
Score: 0.81235
{'max_depth': 6, 'min_child_weight': 30, 'subsample': 0.85, 'colsample_bytree': 

In [16]:
# --- Production Training & Diagnostics ---
# One model per protein is trained on the full training split with the "Deep & Rigid"
# settings; predictions on BOTH Train and Test give the generalization gap.

print("Starting production training...")

# 1. Feature matrices: 'donor' is dropped, 'day' stays in.
X_tr_full = to_float32(X_train.drop(columns=['donor']))
X_te_full = to_float32(X_test.drop(columns=['donor']))

dtrain_full = xgb.DMatrix(X_tr_full)
dtest_full  = xgb.DMatrix(X_te_full)

# 2. Parameters (the "Deep & Rigid" configuration)
final_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "seed": 77,
    "verbosity": 0,
    "learning_rate": 0.05,
    "max_depth": 6,
    "min_child_weight": 30,
    "subsample": 0.85,
    "colsample_bytree": 0.4,
    "reg_alpha": 0.1,
    "reg_lambda": 15.0,
    "gamma": 0.0,
}

# 3. Training loop over all proteins
NUM_ROUNDS = 700
protein_cols = y_train.columns
n_proteins = len(protein_cols)

print(f"Training on {len(y_train.columns)} proteins...")

# Prediction buffers, one column per protein (filled inside the loop).
train_preds = np.empty((len(X_train), n_proteins), dtype=np.float32)
test_preds = np.empty((len(X_test), n_proteins), dtype=np.float32)

for i, protein in enumerate(protein_cols, 1):
    # Swap in this protein's target as the label of the shared DMatrix.
    dtrain_full.set_label(y_train[protein].to_numpy(dtype=np.float32))

    booster = xgb.train(
        params=final_params,
        dtrain=dtrain_full,
        num_boost_round=NUM_ROUNDS,
        verbose_eval=False
    )

    # Train and Test predictions are both kept to check overfitting.
    train_preds[:, i - 1] = booster.predict(dtrain_full)
    test_preds[:, i - 1] = booster.predict(dtest_full)

    if i % 10 == 0:
        print(f"Processed {i}/{len(y_train.columns)} proteins")

y_pred_train = pd.DataFrame(train_preds, index=X_train.index, columns=protein_cols)
y_pred_test  = pd.DataFrame(test_preds,  index=X_test.index,  columns=protein_cols)

# 4. Diagnostic evaluation
print("\n--- Model Diagnostics ---")

train_score = kaggle_mean_cellwise_pearson(y_train, y_pred_train)
test_score  = kaggle_mean_cellwise_pearson(y_test,  y_pred_test)

print(f"Train Score (Pearson): {train_score:.5f}")
print(f"Test Score  (Pearson): {test_score:.5f}")

gap = train_score - test_score
print(f"Generalization Gap:    {gap:.5f}")

# A gap above 0.05 is reported as overfitting.
verdict = (
    "WARNING: Significant overfitting detected (>0.05 gap). Consider increasing reg_lambda."
    if gap > 0.05
    else "SUCCESS: Model is generalizing well."
)
print(verdict)

Starting production training...
Training on 140 proteins...
Feature set includes 'day'.
Processed 10/140 proteins
Processed 20/140 proteins
Processed 30/140 proteins
Processed 40/140 proteins
Processed 50/140 proteins
Processed 60/140 proteins
Processed 70/140 proteins
Processed 80/140 proteins
Processed 90/140 proteins
Processed 100/140 proteins
Processed 110/140 proteins
Processed 120/140 proteins
Processed 130/140 proteins
Processed 140/140 proteins

--- Model Diagnostics ---
Train Score (Pearson): 0.94234
Test Score  (Pearson): 0.90156
Generalization Gap:    0.04077
SUCCESS: Model is generalizing well.


In [17]:
# --- Grand Finale: Full Retrain & Submission ---
# 1. Train + Test rows are stacked into a single training matrix.
# 2. The "Deep & Rigid" model is retrained on that combined data, one booster per protein.
# 3. Predictions for the Evaluation set are written to a parquet file.
print("Initiating Grand Finale: Retraining on ALL available data (Train + Test)...")

# 1. Combined training data
X_total = pd.concat([X_train, X_test])
y_total = pd.concat([y_train, y_test])

print(f"Combined Training Shape: {X_total.shape}")

# 2. Feature matrices: 'donor' is dropped, 'day' stays in.
X_total_feat = to_float32(X_total.drop(columns=['donor']))
X_eval_feat  = to_float32(X_eval.drop(columns=['donor']))

dtrain_final = xgb.DMatrix(X_total_feat)
deval_final  = xgb.DMatrix(X_eval_feat)

# 3. Parameters ("Deep & Rigid" configuration)
final_params = {
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "seed": 77,
    "verbosity": 0,
    "learning_rate": 0.05,
    "max_depth": 6,
    "min_child_weight": 30,
    "subsample": 0.85,
    "colsample_bytree": 0.4,
    "reg_alpha": 0.1,
    "reg_lambda": 15.0,
    "gamma": 0.0,
}

# 4. Training loop: a fixed 700 boosting rounds per protein (no early stopping here).
NUM_ROUNDS = 700
target_cols = y_train.columns

print(f"Training on {len(y_train.columns)} proteins...")

# Evaluation predictions, one column per protein (filled inside the loop).
eval_preds = np.empty((len(X_eval), len(target_cols)), dtype=np.float32)

for i, protein in enumerate(target_cols, 1):
    # Label of the shared DMatrix = this protein's target over Train + Test.
    dtrain_final.set_label(y_total[protein].to_numpy(dtype=np.float32))

    booster = xgb.train(
        params=final_params,
        dtrain=dtrain_final,
        num_boost_round=NUM_ROUNDS,
        verbose_eval=False
    )

    eval_preds[:, i - 1] = booster.predict(deval_final)

    if i % 20 == 0:
        print(f"Processed {i}/{len(y_train.columns)} proteins")

y_pred_eval = pd.DataFrame(eval_preds, index=X_eval.index, columns=target_cols)

# 5. Save the predictions (the target folder is created if missing)
submission_path = os.path.join(data_dir, "Eval_cite", "X_eval_results.parquet")
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

parquet_options = dict(
    engine="pyarrow",
    index=True,
    coerce_timestamps="ms",
    allow_truncated_timestamps=True,
    version="2.6",
)
y_pred_eval.to_parquet(submission_path, **parquet_options)

print(f"\nSUCCESS: Grand Finale complete.")
print(f"Submission saved to: {submission_path}")

Initiating Grand Finale: Retraining on ALL available data (Train + Test)...
Combined Training Shape: (70988, 312)
Training on 140 proteins...
Processed 20/140 proteins
Processed 40/140 proteins
Processed 60/140 proteins
Processed 80/140 proteins
Processed 100/140 proteins
Processed 120/140 proteins
Processed 140/140 proteins

SUCCESS: Grand Finale complete.
Submission saved to: /home/skovtun/Python_projects/Kaggle/Single_cell/data/Eval_cite/X_eval_results.parquet


Kaggle Pearson correlation train (old best version): 0.9253355768254198
Kaggle Pearson correlation train (old best version): 0.9014217099264067
Train Score (Pearson): 0.94234
Test Score  (Pearson): 0.90156