In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle
# input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def ensemble_submissions(file_paths, weights, output_path='submission.tsv'):
    """Blend several prediction files into one weighted-score submission.

    Every input file is read as a headerless, tab-separated table whose three
    columns are taken to be protein, GO term and score. The files are
    outer-joined on the (protein, GO term) pair; a pair missing from a file
    contributes a score of 0 for that file. The blended score is the weighted
    sum of the per-file scores.

    Args:
        file_paths: Paths of the prediction files to blend (at least one).
        weights: One multiplier per file, in the same order as ``file_paths``.
            Extra trailing weights are ignored.
        output_path: Destination of the blended, headerless TSV
            (protein, go_term, score), sorted by descending score.

    Returns:
        A DataFrame sorted by descending ``score`` with the columns
        ``protein``, ``go_term``, ``key``, ``score_0`` ... ``score_{n-1}``
        and ``score``.

    Raises:
        ValueError: If ``file_paths`` is empty or fewer weights than files
            are supplied.
    """
    if not file_paths:
        raise ValueError("file_paths must contain at least one prediction file")
    if len(weights) < len(file_paths):
        raise ValueError(
            f"Expected at least {len(file_paths)} weights, got {len(weights)}"
        )

    id_cols = ['protein', 'go_term']
    score_cols = []
    result = None
    for i, path in enumerate(file_paths):
        score_col = f'score_{i}'
        df = pd.read_csv(
            path,
            sep='\t',
            header=None,
            names=id_cols + [score_col],
            # Force identifiers to strings so join keys have the same dtype
            # in every file.
            dtype={'protein': str, 'go_term': str},
        )
        print(f"Loaded {len(df)} predictions from file {i+1}")
        score_cols.append(score_col)
        # Join on the real identifier columns. Joining on a concatenated
        # "protein_goterm" string and splitting it back apart corrupts any
        # protein ID that itself contains an underscore.
        result = df if result is None else result.merge(df, on=id_cols, how='outer')

    # A pair absent from a file counts as a zero score from that file.
    result[score_cols] = result[score_cols].fillna(0)
    result['score'] = sum(w * result[col] for w, col in zip(weights, score_cols))

    # Kept for backward compatibility with callers that read the 'key' column.
    result['key'] = result['protein'] + '_' + result['go_term']
    result = result[id_cols + ['key'] + score_cols + ['score']]

    result = result.sort_values('score', ascending=False)
    result[['protein', 'go_term', 'score']].to_csv(
        output_path,
        sep='\t',
        index=False,
        header=False
    )

    print(f"\nSaved {len(result)} predictions to {output_path}")

    return result

def stacking_model(file_paths, weights, output_path='submission.tsv'):
    """Fit a stacked regressor on the per-file scores and save its predictions.

    The files are first blended with ``ensemble_submissions`` (which writes
    the blended scores to ``output_path``). A stacking ensemble is then
    trained to predict the blended ``score`` from the individual
    ``score_i`` columns, evaluated on a 20% hold-out split, applied to every
    row, and its predictions are written to the same ``output_path``,
    replacing the blended file.

    The target is a continuous score, so regressors are used; classifiers
    reject continuous targets.

    NOTE(review): the target is the weighted sum of the very columns used as
    features, so the model learns to reproduce that blend. Confirm this is
    the intended training signal.

    Args:
        file_paths: Paths of the prediction files to blend.
        weights: One blending weight per file.
        output_path: Destination TSV (protein, go_term, stacking_score).

    Returns:
        The blended DataFrame with an added ``stacking_score`` column.
    """
    # Regression counterparts of the classifiers imported at the top of the
    # file; imported locally so this function is self-contained.
    from sklearn.ensemble import RandomForestRegressor, StackingRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    from sklearn.svm import SVR

    # Blend the submissions; this yields one score_i column per input file.
    result = ensemble_submissions(file_paths, weights, output_path)

    # Features are the per-file scores (one column for each input file).
    feature_cols = [f'score_{i}' for i in range(len(file_paths))]
    X = result[feature_cols]
    y = result['score']  # Target is the final combined score

    # Hold out 20% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Base models for stacking.
    # NOTE(review): kernel SVR scales poorly with the number of rows;
    # consider dropping it or subsampling for large submissions.
    base_learners = [
        ('svm', SVR(kernel='linear')),
        ('linreg', LinearRegression()),
        ('rf', RandomForestRegressor(n_estimators=100))
    ]

    # Meta-model that combines the base learners' predictions.
    meta_model = LinearRegression()

    # Named 'stacker' so it does not shadow this function's own name.
    stacker = StackingRegressor(estimators=base_learners, final_estimator=meta_model)
    stacker.fit(X_train, y_train)

    # Evaluate on the hold-out split; R^2 replaces accuracy, which is not
    # defined for a continuous target.
    y_pred = stacker.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f"Stacking Model R^2: {r2:.4f}")

    # Save the final predictions for every row.
    result['stacking_score'] = stacker.predict(X)
    result[['protein', 'go_term', 'stacking_score']].to_csv(
        output_path,
        sep='\t',
        index=False,
        header=False
    )

    print(f"\nSaved stacked predictions to {output_path}")

    return result

if __name__ == "__main__":
    # Prediction files to combine, one per upstream submission.
    file_paths = [
        '/kaggle/input/gaf-submission/submission.tsv',
        '/kaggle/input/cafa-6-predictions/submission.tsv',
    ]
    # Both submissions contribute equally to the blended score.
    weights = [0.5] * 2

    # Blend the files, then fit and apply the stacking model.
    stacking_model(file_paths=file_paths, weights=weights)