# In [None]:
import pandas as pd
import numpy as np
import os

def _read_model_scores(path, id_cols, chunksize):
    """Stream one prediction file and return its scores keyed by (protein, go_term).

    Rows with a missing protein or GO term are discarded. If a pair occurs
    more than once in the file, the highest score is kept so every pair is
    unique in the result.

    Returns:
        A float Series indexed by a (protein, go_term) MultiIndex, or ``None``
        when the file contains no usable rows.
    """
    partials = []
    with pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=id_cols + ['score'],
        dtype={'protein': str, 'go_term': str, 'score': float},
        chunksize=chunksize,
    ) as reader:
        for chunk in reader:
            chunk = chunk.dropna(subset=id_cols)
            if not chunk.empty:
                # Collapse duplicates inside the chunk right away to keep the
                # per-file partial results as small as possible.
                partials.append(chunk.groupby(id_cols, sort=False)['score'].max())

    if not partials:
        return None
    if len(partials) == 1:
        return partials[0]
    # The same pair may show up in several chunks, so deduplicate once more.
    return pd.concat(partials).groupby(level=id_cols, sort=False).max()


def weighted_average_ensemble(file_paths, weights, output_path='submission.tsv', chunksize=100000):
    """Blend several prediction files into one weighted-sum submission.

    Every input is a header-less TSV with the columns protein, GO term and
    score. For each (protein, go_term) pair found in any file the final score
    is ``sum(weights[i] * score_i)``, where a pair missing from file ``i`` (or
    carrying an empty score there) contributes ``0``. Each file is streamed
    exactly once; the unique scores of every file are held in memory while
    they are combined.

    Args:
        file_paths: Paths of the prediction files to combine.
        weights: One multiplier per entry of ``file_paths``, in the same order.
        output_path: Destination of the combined header-less TSV.
        chunksize: Number of rows read from an input file at a time.

    Returns:
        DataFrame with the columns ``protein``, ``go_term`` and
        ``final_score``, ordered by the string ``protein + '_' + go_term``.
        The same rows are written to ``output_path``.

    Raises:
        ValueError: If the number of weights differs from the number of files.
    """
    file_paths = list(file_paths)
    weights = list(weights)
    if len(weights) != len(file_paths):
        raise ValueError(
            f"Expected one weight per file, got {len(weights)} weights "
            f"for {len(file_paths)} files"
        )

    print(f"Processing in chunks of {chunksize}")

    id_cols = ['protein', 'go_term']
    out_cols = id_cols + ['final_score']

    model_scores = []
    model_weights = []
    for path, weight in zip(file_paths, weights):
        scores = _read_model_scores(path, id_cols, chunksize)
        # A file without usable rows would only ever contribute zeros.
        if scores is not None:
            model_scores.append(scores)
            model_weights.append(weight)

    if model_scores:
        # Outer-align all models on the (protein, go_term) index; pairs that a
        # model did not predict become NaN and are scored as 0.
        merged = pd.concat(model_scores, axis=1, keys=list(range(len(model_scores))))
        merged = merged.fillna(0)
        merged.index = merged.index.set_names(id_cols)

        final_score = sum(w * merged[pos] for pos, w in enumerate(model_weights))
        final_result = final_score.rename('final_score').reset_index()

        # The key string is used for ordering only; the identifiers themselves
        # are never split back out of it, so underscores in them are harmless.
        sort_key = final_result['protein'] + '_' + final_result['go_term']
        order = sort_key.sort_values(kind='stable').index
        final_result = final_result.loc[order, out_cols].reset_index(drop=True)
    else:
        final_result = pd.DataFrame({
            'protein': pd.Series(dtype=str),
            'go_term': pd.Series(dtype=str),
            'final_score': pd.Series(dtype=float),
        })

    print(f"Total unique predictions: {len(final_result)}")

    final_result.to_csv(output_path, sep='\t', index=False, header=False)
    print(f"Saved to {output_path}")
    return final_result


if __name__ == "__main__":
    # Each submission file is paired with its blending weight; the second
    # model gets the larger share (the original author notes it is the
    # better model).
    model_weights = {
        '/kaggle/input/cafa-6-t5-embeddings-with-ensemble/submission.tsv': 0.4,
        '/kaggle/input/cafa-6-predictions/submission.tsv': 0.6,
    }
    file_paths = list(model_weights)
    weights = list(model_weights.values())

    # A very large chunk size effectively reads each file in a single pass.
    result = weighted_average_ensemble(file_paths, weights, chunksize=10_000_000)
