### Credit goes to this author and notebook

https://www.kaggle.com/code/analyticaobscura/cafa-6-decoding-protein-mysteries 

- This notebook tries weighted-average ensembling of the models' predictions.
- A meta-ensemble model is planned as a follow-up.

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
from typing import Callable, Optional, Dict, Any

def _iter_keyed_chunks(path, chunksize):
    """Yield chunks of one prediction file, each with a ``key`` column.

    The file is read as header-less TSV with columns protein, go_term, score.
    Rows missing a protein or GO term are dropped, and ``key`` is built as
    ``protein + '_' + go_term``.
    """
    reader = pd.read_csv(
        path, sep='\t', header=None,
        names=['protein', 'go_term', 'score'],
        dtype={'protein': str, 'go_term': str, 'score': float},
        chunksize=chunksize,
    )
    for chunk in reader:
        chunk = chunk.dropna(subset=['protein', 'go_term'])
        yield chunk.assign(key=chunk['protein'] + '_' + chunk['go_term'])


def _blend_scores(scores, weights, method):
    """Return one ensemble score per row of ``scores`` (one column per model).

    ``weights`` must be ordered like the columns of ``scores``; ``method`` is
    one of 'median', 'weighted_average', 'rank_average' or 'all'.
    """
    def weighted_sum(frame):
        return sum(frame[col] * w for col, w in zip(frame.columns, weights))

    if method == 'median':
        return scores.median(axis=1)
    if method == 'weighted_average':
        return weighted_sum(scores)

    # Percentile rank of every score within its own model column.
    ranks = scores.rank(pct=True)
    if method == 'rank_average':
        return weighted_sum(ranks)

    # method == 'all': fixed blend of the three strategies.
    # Meta-ensemble weights (tune these!)
    return (
        scores.median(axis=1) * 0.25 +
        weighted_sum(scores) * 0.40 +
        weighted_sum(ranks) * 0.35
    )


def stacking_ensemble(
    file_paths, 
    weights=None, 
    method='all', 
    output_path='submission.tsv', 
    chunksize=100_000,
    oof_callback: Optional[Callable] = None,
    oof_data: Optional[Dict[str, Any]] = None
):
    """
    Ensemble several prediction files by median, weighted average, rank
    average, or a fixed blend of all three.

    Every input file is a header-less TSV with columns protein, go_term,
    score. A (protein, go_term) pair missing from a model counts as score 0
    for that model. If a pair appears more than once in a single file, the
    maximum score is used so the output keeps one row per pair.

    NOTE: percentile ranks are computed independently inside each block of
    ``chunksize`` keys, so 'rank_average' / 'all' scores are only comparable
    across the whole output when all keys fit into one chunk.

    NOTE: every input file is re-read once per key chunk, and intermediate
    results are written to ``temp_chunk_<start>.csv`` files in the current
    working directory (removed again before returning, also on failure).

    Parameters:
    -----------
    file_paths : list - Paths to prediction files (at least one)
    weights : list - One weight per file (normalized automatically);
        equal weights when None
    method : str - 'median', 'weighted_average', 'rank_average', or 'all' (stacking)
    output_path : str - Output file path (header-less TSV)
    chunksize : int - Rows per chunk (used for reading files and for key blocks)
    oof_callback : callable - Validation callback, called as
        ``oof_callback(oof_predictions, oof_data)``; expected to return a dict
    oof_data : dict - OOF validation data {'labels': df, 'proteins': list};
        validation only runs when the 'proteins' key is present

    Returns: (predictions_df, oof_metrics)
        predictions_df has columns protein, go_term, final_score;
        oof_metrics is None when no validation was run.

    Raises:
    -------
    ValueError - unknown ``method``, empty ``file_paths``, or ``weights``
        whose length does not match ``file_paths`` or whose sum is zero /
        not finite. Raised before any file is read.
    """
    # Fail fast: these checks are free, whereas the scan below may take hours.
    valid_methods = ('median', 'weighted_average', 'rank_average', 'all')
    if method not in valid_methods:
        raise ValueError(f"Unknown method {method!r}; expected one of {valid_methods}")

    file_paths = list(file_paths)
    n_models = len(file_paths)
    if n_models == 0:
        raise ValueError("file_paths must contain at least one prediction file")

    if weights is None:
        weights = [1.0 / n_models] * n_models
    else:
        weights = np.asarray(weights, dtype=float)
        if weights.ndim != 1 or len(weights) != n_models:
            raise ValueError(
                f"Expected {n_models} weights (one per file), got shape {weights.shape}"
            )
        total = weights.sum()
        if not np.isfinite(total) or total == 0:
            raise ValueError("weights must have a finite, non-zero sum")
        weights = weights / total
    
    print(f"Models: {len(file_paths)} | Weights: {weights} | Method: {method}")

    # Step 1: Collect unique keys
    print("\nScanning files...")
    all_keys = set()
    for path in tqdm(file_paths, desc="Files"):
        for chunk in _iter_keyed_chunks(path, chunksize):
            all_keys.update(chunk['key'].values)

    all_keys = sorted(all_keys)
    print(f"Total predictions: {len(all_keys):,}")

    score_cols = [f'score_{idx}' for idx in range(n_models)]
    temp_files = []
    try:
        # Step 2: Process chunks
        print("\nProcessing chunks...")
        for start in tqdm(range(0, len(all_keys), chunksize), desc="Chunks"):
            key_chunk = all_keys[start:start + chunksize]
            result = pd.DataFrame({'key': key_chunk})

            # Load scores from all models
            for col, path in zip(score_cols, file_paths):
                parts = []
                for chunk in _iter_keyed_chunks(path, chunksize):
                    hits = chunk.loc[chunk['key'].isin(key_chunk), ['key', 'score']]
                    parts.append(hits.rename(columns={'score': col}))
                if not parts:
                    # Keep the column present so the blending step can rely on it.
                    result[col] = np.nan
                    continue
                model_df = pd.concat(parts, ignore_index=True)
                if model_df['key'].duplicated().any():
                    # Duplicate pairs would multiply rows in the merge below.
                    model_df = model_df.groupby('key', as_index=False)[col].max()
                result = result.merge(model_df, on='key', how='left')

            # A pair a model did not predict counts as score 0 for that model.
            result[score_cols] = result[score_cols].fillna(0)

            # Calculate ensemble
            result['final_score'] = _blend_scores(result[score_cols], weights, method)

            # Split from the right: only the last '_' separates the GO term.
            key_parts = result['key'].str.rsplit('_', n=1, expand=True)
            result['protein'] = key_parts[0]
            result['go_term'] = key_parts[1]

            temp_file = f'temp_chunk_{start}.csv'
            # Register before writing so a partially written file is cleaned up too.
            temp_files.append(temp_file)
            result[['protein', 'go_term', 'final_score']].to_csv(
                temp_file, index=False, sep='\t', header=False
            )

        # Step 3: Combine chunks
        print("\nCombining chunks...")
        # Read IDs back as strings so numeric-looking IDs are not re-typed.
        df_list = [pd.read_csv(f, sep='\t', header=None,
                               names=['protein', 'go_term', 'final_score'],
                               dtype={'protein': str, 'go_term': str,
                                      'final_score': float})
                   for f in tqdm(temp_files, desc="Combining")]
        if df_list:
            final_df = pd.concat(df_list, ignore_index=True)
        else:
            # No keys at all: return an empty, correctly shaped frame.
            final_df = pd.DataFrame(columns=['protein', 'go_term', 'final_score'])

        # Step 4: OOF validation
        oof_metrics = None
        if oof_callback and oof_data:
            print("\nOOF Validation...")
            if 'proteins' in oof_data:
                oof_predictions = final_df[final_df['protein'].isin(oof_data['proteins'])].copy()
                print(f"OOF predictions: {len(oof_predictions):,}")
                oof_metrics = oof_callback(oof_predictions, oof_data)
                print(f"F-max: {oof_metrics.get('f_max', 0):.4f} | Threshold: {oof_metrics.get('best_threshold', 0):.3f}")

        # Step 5: Save
        print(f"\nSaving to {output_path}...")
        final_df.to_csv(output_path, sep='\t', index=False, header=False)
    finally:
        # Cleanup is best-effort, but must also run when a step above fails.
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except OSError:
                pass
    
    print(f"âœ“ Done! {len(final_df):,} predictions saved\n")
    return final_df, oof_metrics


def cafa_oof_callback(oof_predictions: pd.DataFrame, oof_data: Dict[str, Any]) -> Dict[str, float]:
    """Score ensemble predictions against OOF labels over a threshold sweep.

    Predictions and labels are outer-joined on (protein, go_term); a pair
    missing from the predictions gets score 0 and a pair missing from the
    labels gets label 0. For every threshold, precision / recall / F1 are
    computed over all pairs pooled together (micro-averaged).

    Parameters:
    -----------
    oof_predictions : DataFrame with columns protein, go_term, final_score
    oof_data : dict with
        'labels' - DataFrame with columns protein, go_term, label (1 = positive)
        'thresholds' - optional iterable of score cut-offs
            (default: 0.01 .. 0.99 in steps of 0.01)

    Returns: dict with
        'f_max' - best F1 over all thresholds
        'best_threshold' - first threshold (in the given order) reaching f_max
        'mean_precision', 'mean_recall' - means over all thresholds
        'auc_pr' - trapezoidal area under the precision-recall points
    """
    labels = oof_data['labels']
    merged = oof_predictions.merge(labels, on=['protein', 'go_term'], how='outer')

    # Loop-invariant arrays, computed once instead of per threshold.
    scores = merged['final_score'].fillna(0).to_numpy()
    truth = merged['label'].fillna(0).astype(int).to_numpy()
    is_positive = truth == 1
    is_negative = truth == 0

    thresholds = np.asarray(
        oof_data.get('thresholds', np.arange(0.01, 1.0, 0.01)), dtype=float
    )
    best_f1, best_threshold = 0, 0
    precision_list, recall_list = [], []

    for threshold in thresholds:
        predicted = scores >= threshold

        tp = np.count_nonzero(predicted & is_positive)
        fp = np.count_nonzero(predicted & is_negative)
        fn = np.count_nonzero(~predicted & is_positive)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)

        if f1 > best_f1:
            best_f1, best_threshold = f1, threshold

    # Recall falls as the threshold rises, so integrate from the highest
    # threshold to the lowest: recall is then non-decreasing and the area
    # is positive.
    order = np.argsort(-thresholds, kind='stable')
    precision_curve = np.asarray(precision_list, dtype=float)[order]
    recall_curve = np.asarray(recall_list, dtype=float)[order]
    # np.trapz was renamed to np.trapezoid in NumPy 2.0.
    integrate = getattr(np, 'trapezoid', None) or np.trapz

    return {
        'f_max': float(best_f1),
        'best_threshold': float(best_threshold),
        'mean_precision': float(np.mean(precision_list)) if precision_list else 0.0,
        'mean_recall': float(np.mean(recall_list)) if recall_list else 0.0,
        'auc_pr': float(integrate(precision_curve, recall_curve))
    }



In [None]:

# =============================================================================
# USAGE
# =============================================================================

if __name__ == "__main__":

    # Prediction files to combine (header-less TSV: protein, go_term, score).
    file_paths = [
        '/kaggle/input/cafa-6-t5-embeddings-with-ensemble/submission.tsv',
        '/kaggle/input/cafa-6-predictions/submission.tsv'
    ]

    # Based on validation F-max; stacking_ensemble normalizes them to sum to 1.
    weights = [0.35, 0.30]

    # Basic stacking ensemble without OOF validation
    # (alternative to the call below; uncomment to use):
    # result, _ = stacking_ensemble(
    #     file_paths=file_paths,
    #     weights=weights,
    #     method='all',
    #     output_path='submission.tsv',
    #     chunksize=10_000_000
    # )

    # With OOF validation.
    # NOTE: 'oof_labels.tsv' and the protein IDs below look like placeholders;
    # replace them with the real hold-out labels and proteins before running.
    oof_data = {
        'labels': pd.read_csv('oof_labels.tsv', sep='\t',
                              names=['protein', 'go_term', 'label']),
        'proteins': ['prot1', 'prot2'],
        'thresholds': np.arange(0.01, 1.0, 0.01)
    }

    result, metrics = stacking_ensemble(
        file_paths=file_paths,
        weights=weights,
        method='all',
        output_path='submission.tsv',
        chunksize=10_000_000,
        oof_callback=cafa_oof_callback,
        oof_data=oof_data
    )
