# **Ensemble Submissions: Code Analysis and Explanation**

## **1. Import Libraries**

```python
import pandas as pd
```


In [None]:
import pandas as pd


## 2. Function Definition

* `ensemble_submissions`: The core function takes the following arguments:
    * `file_paths`: A list of paths to submission files.
    * `weights`: A list of weights used to combine the scores from the different files.
    * `output_path`: The path where the final combined results will be saved.


## 3. Read and Process Files


- **File Reading:** Each file is tab-separated with no header row and is read using `pandas.read_csv()`.
- **Key Creation:** A new `key` column is created by combining the `protein` and `go_term`.
- **Renaming Score Column:** The `score` column is renamed to include the file index.

## 4. Merging DataFrames

- **Merge:** The first file is initialized as the base result. Subsequent files are merged using the `key` column, ensuring all protein-GO term pairs are included.

## 5. Handling Missing Data

* **Missing Scores:** Any missing score (`NaN`) is replaced with `0` to ensure there are no missing values in the score columns.
  

## 6. Weighted Averaging of Scores

* **Weighted Score Calculation:** The final score is calculated by summing the weighted scores from each file.
  

## 7. Handling Missing Protein/GO Term Data

* **Filling Missing Protein/GO Term:** If any protein or GO term is missing, it is inferred from the `key` column by splitting the key string.


## 8. Submission

* **Submit:** The final predictions are written to the output file.

## 9. Example Usage

* **Example:** This code demonstrates how to call the function with a list of file paths and weights to generate the ensemble prediction.
  

In [None]:
# 2. Function definition
def load_submission(path):
    """Load a headerless, tab-separated submission file.

    The file is parsed with ``pandas.read_csv`` without a header row, so
    the columns carry integer labels (0, 1, 2, ...).  A ``pred_key``
    column is then appended, built by joining the string forms of
    column 0 and column 1 with an underscore.

    Args:
        path: Location of the TSV file to read.

    Returns:
        The parsed DataFrame with the additional ``pred_key`` column.
    """
    frame = pd.read_csv(path, header=None, sep='\t')
    left_part = frame[0].astype(str)
    right_part = frame[1].astype(str)
    frame['pred_key'] = left_part + '_' + right_part
    return frame

In [None]:
# 3. Load both submissions and average the values they share (takes a while)
A = load_submission('/kaggle/input/gaf-submission/submission.tsv')
B = load_submission('/kaggle/input/merge-of-2submission-lb-0-25/submission.tsv')

# Index each frame by its prediction key so rows can be looked up by key.
A_idx, B_idx = (frame.set_index('pred_key') for frame in (A, B))

# Keys present in both files: start from A's rows and overwrite column 2
# with the unweighted mean of the two files' column-2 values.
common_keys = A_idx.index.intersection(B_idx.index)
common_ensemble = A_idx.loc[common_keys].copy()
common_ensemble[2] = A_idx.loc[common_keys, 2].add(B_idx.loc[common_keys, 2]).div(2)

# Keys found in only one of the files keep that file's row unchanged.
not_common_in_A = A_idx.loc[A_idx.index.difference(common_keys)]
not_common_in_B = B_idx.loc[B_idx.index.difference(common_keys)]

# Stack the three pieces, discard the key index, and keep columns 0, 1, 2.
submission = pd.concat(
    [common_ensemble, not_common_in_A, not_common_in_B]
).reset_index(drop=True)[[0, 1, 2]]
submission.shape

In [None]:
# 4. Name the columns, drop low scores, cap scores at 1.0
# Rows scoring below 0.05 are removed; scores above 1.0 are clipped to 1.0.
submission.columns = ['ProteinID', 'GO_Term', 'Score']
# .copy() detaches the filtered rows from the unfiltered frame, so the column
# assignment on the next line is not a chained assignment and does not raise
# pandas' SettingWithCopyWarning.
submission = submission[submission['Score'] >= 0.05].copy()
submission['Score'] = submission['Score'].clip(upper=1.0)
submission.shape

In [None]:
# 5. Keep at most 1500 GO terms per protein
# Order rows by protein, highest score first within each protein, then take
# the leading 1500 rows of every protein group.
per_protein_ranked = submission.sort_values(
    by=['ProteinID', 'Score'], ascending=[True, False]
)
submission = per_protein_ranked.groupby('ProteinID', group_keys=False).head(1500)
del per_protein_ranked  # the sorted intermediate is no longer needed
submission.shape

In [None]:
# 6. Load the ground-truth annotations
# GT data collected using QuickGO API
protein_go_annotations = pd.read_csv('/kaggle/input/protein-go-annotations-taxonomy/protein_go_annotations.csv')
# Keep only the two identifying columns and give every ground-truth pair a
# score of 1.0.  assign() builds a new frame instead of writing a column into
# a column-subset selection, which avoids pandas' SettingWithCopyWarning.
protein_go_annotations = protein_go_annotations[['ProteinID', 'GO_Term']].assign(Score=1.0)

In [None]:
# 7. Merge predictions with the ground-truth annotations
combined = pd.concat([submission, protein_go_annotations], ignore_index=True)

# Sort by score, highest first, so that when a (ProteinID, GO_Term) pair
# appears more than once the first occurrence is the highest-scoring one;
# a ground-truth row (score 1.0) therefore wins over a lower prediction.
combined = combined.sort_values(by='Score', ascending=False)
final_submission = (
    combined
    .drop_duplicates(subset=['ProteinID', 'GO_Term'], keep='first')
    .reset_index(drop=True)
)
final_submission.shape

In [None]:
# 8. Submission
# Write the final predictions as a tab-separated file with no index column
# and no header row.  header=False is the documented way to suppress the
# header; header=None only works because it happens to be falsy.
final_submission.to_csv('submission.tsv', sep='\t', index=False, header=False)

## Conclusion

This script provides a method for combining multiple protein-GO term prediction files into a single, weighted ensemble. It handles missing data gracefully, merges files efficiently, and outputs the final sorted results for downstream analysis.

In [None]:
# # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
# # 2. Function Definition
# # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
# def ensemble_submissions(file_paths, weights, output_path='submission.tsv'):
#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     # 3. Read and Process Files
#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     dfs = []
#     for i, path in enumerate(file_paths):
#         df = pd.read_csv(path, sep='\t', header=None, names=['protein', 'go_term', 'score'])
#         df['key'] = df['protein'] + '_' + df['go_term']
#         df = df.rename(columns={'score': f'score_{i}'})
#         dfs.append(df)
#         print(f"Loaded {len(df)} predictions from file {i+1}")

#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     ## 4. Merging DataFrames
#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     result = dfs[0][['protein', 'go_term', 'key', 'score_0']].copy()
#     for i in range(1, len(dfs)):
#         result = result.merge(dfs[i][['key', f'score_{i}']], on='key', how='outer')

#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     #  5. Handling Missing Data
#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     for i in range(len(dfs)):
#         result[f'score_{i}'] = result[f'score_{i}'].fillna(0)

#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     # 6. Weighted Averaging of Scores
#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     result['score'] = sum(weights[i] * result[f'score_{i}'] for i in range(len(dfs)))
#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     # 7. Handling Missing Protein/GO Term Data
#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     result['protein'] = result['protein'].fillna(result['key'].str.split('_').str[0])
#     result['go_term'] = result['go_term'].fillna(result['key'].str.split('_').str[-1])

#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     # 8. Sorting and Saving Final Predictions
#     # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
#     result = result.sort_values('score', ascending=False)
#     result[['protein', 'go_term', 'score']].to_csv(
#         output_path,
#         sep='\t',
#         index=False,
#         header=False
#     )
    
#     print(f"\nSaved {len(result)} predictions to {output_path}")
#     return result
# # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
# # 9. Example Usage
# # -=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-=_=-
# if __name__ == "__main__":
#     file_paths = [
#         '/kaggle/input/cafa-6-protein-function-prediction'
#         # '/kaggle/input/gaf-submission/submission.tsv',
#         # '/kaggle/input/cafa-6-predictions/submission.tsv'
#     ]
#     weights = [0.5, 0.5]
    
#     ensemble_submissions(file_paths, weights)