### Credit goes to this author and notebook

https://www.kaggle.com/code/analyticaobscura/cafa-6-decoding-protein-mysteries 

- Here we try weighted-average ensembling of the models' predictions.
- A meta-ensemble model is upcoming.

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
from typing import Callable, Optional, Dict, Any
from multiprocessing import Pool, cpu_count
import gc

def process_single_chunk(args):
    """Compute ensemble scores for one slice of the prediction keys.

    Takes a single tuple so it can be mapped over a ``multiprocessing.Pool``.

    Args:
        args: Tuple ``(start, end, all_keys, file_paths, weights, method,
            num_models)`` where

            - ``all_keys[start:end]`` are the ``"<protein>_<go_term>"`` keys
              to score in this call,
            - ``file_paths`` are headerless, tab-separated files with the
              columns protein, GO term, score (one file per model),
            - ``weights`` holds one weight per model and is used as given
              (no normalisation happens here),
            - ``method`` is ``'median'``, ``'weighted_average'``,
              ``'rank_average'`` or ``'all'`` (a fixed 0.25 / 0.40 / 0.35
              blend of the three),
            - ``num_models`` is the number of models to combine.

    Returns:
        DataFrame with the columns ``protein``, ``go_term`` and
        ``final_score``; one row per key of the slice, in slice order.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    start, end, all_keys, file_paths, weights, method, num_models = args

    # Fail early with a clear message instead of a KeyError on 'final_score'
    # after every input file has already been read.
    supported = ('median', 'weighted_average', 'rank_average', 'all')
    if method not in supported:
        raise ValueError(
            f"Unknown ensemble method {method!r}; expected one of {supported}"
        )

    output_cols = ['protein', 'go_term', 'final_score']
    key_chunk = all_keys[start:end]
    result = pd.DataFrame({'key': key_chunk})
    if result.empty:
        # Nothing to score; skip reading the input files altogether.
        return pd.DataFrame(columns=output_cols)

    # Load scores from all models, keeping only rows whose key is in this slice.
    for idx, path in enumerate(file_paths):
        score_col = f'score_{idx}'
        pieces = []
        with pd.read_csv(path, sep='\t', header=None,
                         names=['protein', 'go_term', 'score'],
                         dtype={'protein': str, 'go_term': str, 'score': float},
                         chunksize=1_000_000) as reader:
            for chunk in reader:
                chunk['key'] = chunk['protein'] + '_' + chunk['go_term']
                matched = chunk.loc[chunk['key'].isin(key_chunk), ['key', 'score']]
                pieces.append(matched.rename(columns={'score': score_col}))
        if pieces:
            model_df = pd.concat(pieces, ignore_index=True)
            # A file may list the same (protein, GO term) pair more than once;
            # collapse duplicates to their mean so the merge stays one-to-one.
            model_df = model_df.groupby('key', as_index=False).mean()
            result = result.merge(model_df, on='key', how='left')
        else:
            # The model contributed no rows at all: treat it like any other
            # missing prediction so the weighted paths still find its column.
            result[score_col] = 0.0

    # A key absent from a model's file counts as a score of 0 for that model.
    score_cols = [col for col in result.columns if col.startswith('score_')]
    result[score_cols] = result[score_cols].fillna(0)

    def weighted_sum(prefix):
        """Return the weighted sum of the per-model ``<prefix>_<i>`` columns."""
        return sum(result[f'{prefix}_{i}'] * weights[i] for i in range(num_models))

    if method in ('rank_average', 'all'):
        # NOTE: percentile ranks are computed within this slice only, so rank
        # based scores depend on how the keys were partitioned into chunks.
        for i in range(num_models):
            result[f'rank_{i}'] = result[f'score_{i}'].rank(pct=True)

    # Calculate ensemble
    if method == 'median':
        result['final_score'] = result[score_cols].median(axis=1)
    elif method == 'weighted_average':
        result['final_score'] = weighted_sum('score')
    elif method == 'rank_average':
        result['final_score'] = weighted_sum('rank')
    else:  # method == 'all'
        result['median_score'] = result[score_cols].median(axis=1)
        result['weighted_avg'] = weighted_sum('score')
        result['rank_avg'] = weighted_sum('rank')
        result['final_score'] = (result['median_score'] * 0.25 +
                                 result['weighted_avg'] * 0.40 +
                                 result['rank_avg'] * 0.35)

    # Split on the LAST underscore so protein IDs containing '_' stay intact.
    parts = result['key'].str.rsplit('_', n=1, expand=True)
    result['protein'] = parts[0]
    result['go_term'] = parts[1]
    return result[output_cols]


def stacking_ensemble_fast(
    file_paths, 
    weights=None, 
    method='all', 
    output_path='submission.tsv', 
    chunksize=5_000_000,
    n_jobs=-1,
    oof_callback: Optional[Callable] = None,
    oof_data: Optional[Dict[str, Any]] = None
):
    """Blend several prediction files into one submission, chunk by chunk.

    The union of all ``(protein, GO term)`` pairs found in the input files is
    sorted, cut into slices of ``chunksize`` keys, and each slice is scored by
    ``process_single_chunk`` (optionally in a process pool). The combined
    result is written to ``output_path`` as a headerless TSV.

    Args:
        file_paths: Headerless, tab-separated prediction files with the
            columns protein, GO term, score.
        weights: One weight per file, in the same order. They are divided by
            their sum; ``None`` means equal weights.
        method: Ensemble method forwarded to ``process_single_chunk``.
        output_path: Where the blended predictions are written.
        chunksize: Number of keys per work unit. Every work unit re-reads all
            input files, so larger values mean fewer passes over the data.
        n_jobs: Worker processes; ``-1`` means ``cpu_count() - 1`` (at least
            1). Values of 1 or less run serially in this process.
        oof_callback: Optional ``callback(oof_predictions, oof_data)``
            returning a metrics dict; only called when ``oof_data`` is
            non-empty and contains a ``'proteins'`` entry.
        oof_data: Data handed to ``oof_callback``; ``oof_data['proteins']``
            selects which proteins' predictions are validated.

    Returns:
        Tuple ``(final_df, oof_metrics)``: the blended predictions (columns
        ``protein``, ``go_term``, ``final_score``) and the callback's result,
        or ``None`` when no validation ran.

    Raises:
        ValueError: If ``file_paths`` is empty, ``weights`` does not have one
            entry per file or sums to zero, or ``chunksize`` is not positive.
    """
    if len(file_paths) == 0:
        raise ValueError("file_paths must contain at least one prediction file")
    if chunksize < 1:
        raise ValueError(f"chunksize must be a positive integer, got {chunksize}")

    if weights is None:
        weights = [1.0 / len(file_paths)] * len(file_paths)
    else:
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (len(file_paths),):
            raise ValueError(
                f"Expected {len(file_paths)} weights (one per file), "
                f"got shape {weights.shape}"
            )
        weight_total = weights.sum()
        if not np.isfinite(weight_total) or weight_total == 0:
            raise ValueError("weights must have a finite, non-zero sum")
        weights = weights / weight_total
    
    if n_jobs == -1:
        n_jobs = max(1, cpu_count() - 1)
    
    print(f"Models: {len(file_paths)} | Weights: {weights} | Method: {method}")
    print(f"Parallel jobs: {n_jobs} | Chunk size: {chunksize:,}")

    # Step 1: Collect the union of keys; only the two ID columns are parsed.
    print("\nScanning files...")
    all_keys = set()
    for path in tqdm(file_paths, desc="Files"):
        with pd.read_csv(path, sep='\t', header=None,
                         names=['protein', 'go_term', 'score'],
                         dtype={'protein': str, 'go_term': str},
                         usecols=[0, 1],  # Only read first 2 columns
                         chunksize=5_000_000) as reader:
            for chunk in reader:
                chunk = chunk.dropna()
                keys = chunk['protein'] + '_' + chunk['go_term']
                all_keys.update(keys.values)
                del chunk, keys
                gc.collect()

    all_keys = sorted(all_keys)
    total_keys = len(all_keys)
    print(f"Total predictions: {total_keys:,}")

    # Step 2: Process chunks in parallel
    print("\nProcessing chunks in parallel...")

    chunk_starts = range(0, total_keys, chunksize)
    n_chunks = len(chunk_starts)
    num_models = len(file_paths)
    # Hand every task only its own slice of keys (re-based to 0..len). Passing
    # the full key list would pickle the whole key universe once per chunk.
    chunk_args = (
        (0, min(chunksize, total_keys - start), all_keys[start:start + chunksize],
         file_paths, weights, method, num_models)
        for start in chunk_starts
    )

    if n_jobs > 1 and n_chunks > 1:
        # Never start more workers than there are chunks to process.
        with Pool(min(n_jobs, n_chunks)) as pool:
            results = list(tqdm(
                pool.imap(process_single_chunk, chunk_args),
                total=n_chunks,
                desc="Chunks"
            ))
    else:
        results = [
            process_single_chunk(task)
            for task in tqdm(chunk_args, total=n_chunks, desc="Chunks")
        ]
    
    # Step 3: Combine results
    print("\nCombining results...")
    if results:
        final_df = pd.concat(results, ignore_index=True)
    else:
        # No keys were found in any file; pd.concat([]) would raise.
        final_df = pd.DataFrame(columns=['protein', 'go_term', 'final_score'])
    del results
    gc.collect()
    
    # Step 4: OOF validation
    oof_metrics = None
    if oof_callback and oof_data:
        print("\nOOF Validation...")
        if 'proteins' in oof_data:
            oof_predictions = final_df[final_df['protein'].isin(oof_data['proteins'])].copy()
            print(f"OOF predictions: {len(oof_predictions):,}")
            oof_metrics = oof_callback(oof_predictions, oof_data)
            print(f"F-max: {oof_metrics.get('f_max', 0):.4f} | Threshold: {oof_metrics.get('best_threshold', 0):.3f}")
    
    # Step 5: Save
    print(f"\nSaving to {output_path}...")
    final_df.to_csv(output_path, sep='\t', index=False, header=False)
    print(f"✓ Done! {len(final_df):,} predictions saved\n")
    
    return final_df, oof_metrics


def cafa_oof_callback(oof_predictions: pd.DataFrame, oof_data: Dict[str, Any]) -> Dict[str, float]:
    """Sweep score thresholds and report precision/recall based metrics.

    Predictions and labels are outer-joined on ``(protein, go_term)``; pairs
    missing on either side count as score 0 / label 0. True positives, false
    positives and false negatives are counted over all pairs at once (i.e.
    micro-averaged over pairs, not averaged per protein).

    Args:
        oof_predictions: Frame with the columns ``protein``, ``go_term`` and
            ``final_score``.
        oof_data: Dict with ``'labels'`` (frame with ``protein``, ``go_term``
            and a 0/1 ``label`` column) and optionally ``'thresholds'``
            (iterable of cut-offs; defaults to 0.01, 0.02, ..., 0.99).

    Returns:
        Dict with ``f_max`` (best F1 over the thresholds), ``best_threshold``
        (first threshold reaching it), ``mean_precision`` and ``mean_recall``
        (averaged over the thresholds) and ``auc_pr`` (trapezoidal area under
        the precision-recall points, always non-negative).
    """
    labels = oof_data['labels']
    merged = oof_predictions.merge(labels, on=['protein', 'go_term'], how='outer')

    # These do not depend on the threshold, so compute them once.
    scores = merged['final_score'].fillna(0).to_numpy(dtype=float)
    truth = merged['label'].fillna(0).astype(int).to_numpy()
    is_positive = truth == 1
    is_negative = truth == 0

    thresholds = oof_data.get('thresholds', np.arange(0.01, 1.0, 0.01))
    best_f1, best_threshold = 0, 0
    precision_list, recall_list = [], []

    for threshold in thresholds:
        predicted = scores >= threshold

        tp = int(np.count_nonzero(predicted & is_positive))
        fp = int(np.count_nonzero(predicted & is_negative))
        fn = int(np.count_nonzero(~predicted & is_positive))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)

        # Strict '>' keeps the first (lowest-index) threshold on ties.
        if f1 > best_f1:
            best_f1, best_threshold = f1, threshold

    if precision_list:
        precisions = np.asarray(precision_list, dtype=float)
        recalls = np.asarray(recall_list, dtype=float)
        # Recall falls as the threshold rises, so integrating in threshold
        # order runs right-to-left and yields a negative area. Order the
        # points by increasing recall (highest precision first within equal
        # recall) before integrating.
        order = np.lexsort((-precisions, recalls))
        # np.trapz was renamed to np.trapezoid in NumPy 2.0.
        integrate = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
        auc_pr = float(integrate(precisions[order], recalls[order]))
        mean_precision = float(precisions.mean())
        mean_recall = float(recalls.mean())
    else:
        # No thresholds supplied: nothing to average or integrate.
        auc_pr = mean_precision = mean_recall = 0.0

    return {
        'f_max': best_f1,
        'best_threshold': best_threshold,
        'mean_precision': mean_precision,
        'mean_recall': mean_recall,
        'auc_pr': auc_pr
    }



In [None]:

# =============================================================================
# USAGE - FAST VERSION
# =============================================================================

# Prediction files to blend: headerless, tab-separated (protein, GO term, score).
file_paths = [
    '/kaggle/input/cafa-6-t5-embeddings-with-ensemble/submission.tsv',
    '/kaggle/input/cafa-6-predictions/submission.tsv'
]

# One weight per file, in the same order as file_paths. They do not need to
# sum to 1: stacking_ensemble_fast divides them by their sum
# (0.35 / 0.65 ~= 0.538 and 0.30 / 0.65 ~= 0.462 here).
weights = [0.35, 0.30]

# Fast ensemble with parallel processing
result, _ = stacking_ensemble_fast(
    file_paths=file_paths,
    weights=weights,
    method='all',
    output_path='submission.tsv',
    chunksize=15_000_000,  # Keys per work unit; each unit re-reads every input file, so larger = fewer passes
    n_jobs=-1  # -1 -> at most cpu_count() - 1 worker processes (one core is left free)
)
