In [None]:
!pip install biopython

In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO
import matplotlib.pyplot as plt

## Training Set

The first step is to open the training set and get familiar with it. And the organisers weren't exaggerating when they said the data is messy... <br>

In [None]:
# Column names are passed explicitly via `names`, so every row of the file
# (including the first one) is read as data and the frame gets readable headers.
train_taxonomy = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
    sep="\t",
    names=["Train_EntryID", "OX"],
)
train_taxonomy.head()

In [None]:
# Print the first raw record of the training FASTA: the IDs, names and
# descriptions are not separated in this file.
seqs = SeqIO.parse(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta",
    "fasta",
)
first_train_record = next(seqs)
print(first_train_record)

In [None]:
# Separate the fields manually. Only the entry ID and the sequence are kept;
# names and descriptions are dropped, and the organism is already available in
# train_taxonomy.
# The entry ID is the second pipe-separated field of each record's id.
entries = [
    {
        "Train_EntryID": record.id.split('|')[1],
        "sequence": str(record.seq),
    }
    for record in SeqIO.parse(
        '/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta',
        "fasta",
    )
]

fasta_df = pd.DataFrame(entries)
display(fasta_df.head())

In [None]:
# Attach each training sequence to its taxonomy row; the join key is the
# shared "Train_EntryID" column (inner join, the pandas default).
train_seq = train_taxonomy.merge(fasta_df, on="Train_EntryID")
train_seq.head()

In [None]:
# Load the term annotations of the training proteins (tab-separated file whose
# first row provides the column names).
train_GO = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
    sep="\t",
)
train_GO.head()

In [None]:
# Split the annotations by the value of the `aspect` column and collect the
# distinct terms of each subset.
# NOTE(review): "C" / "F" / "P" presumably stand for cellular component,
# molecular function and biological process (matching the CC / MF / BP names).
CC = train_GO[train_GO.aspect == "C"]
CC_GO = CC.term.unique()
MF = train_GO[train_GO.aspect == "F"]
MF_GO = MF.term.unique()
BP = train_GO[train_GO.aspect == "P"]
# Was `nunique()`, which returns a count; use `unique()` so BP_GO is an array
# of terms like CC_GO and MF_GO.
BP_GO = BP.term.unique()

## Test Set

In [None]:
# Print the first raw record of the test FASTA; its header is much neater
# than the training one.
seqs = SeqIO.parse(
    "/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta",
    "fasta",
)
first_test_record = next(seqs)
print(first_test_record)

In [None]:
# Build one row per test record: the record id is the entry ID, and the second
# space-separated token of the description is stored as "OX".
entries = [
    {
        "EntryID": record.id,
        "OX": record.description.split(' ')[1],
        "sequence": str(record.seq),
    }
    for record in SeqIO.parse(
        '/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta',
        "fasta",
    )
]

test_seq = pd.DataFrame(entries)
display(test_seq.head())

## Train vs Test Set

In [None]:
# Compute the summary counts first...
Unique_Proteins = train_GO.EntryID.nunique()
Unique_GOs = train_GO.term.nunique()
Train_Species = train_taxonomy.OX.nunique()
Unique_Proteins_Test = test_seq.EntryID.nunique()
Test_Species = test_seq.OX.nunique()

# ...then report them, train set first, test set second.
print("Train Set:")
print("Unique proteins in train set:", Unique_Proteins)
print("Unique GOs in train set:", Unique_GOs)
print("Species in train set:", Train_Species)
print()
print("Test Set:")
print("Unique proteins in test set:", Unique_Proteins_Test)
print("Species in test set:", Test_Species)

In [None]:
# Bar chart comparing the number of unique proteins and species in the train
# and test sets. Tick labels are wrapped consistently, and the figure carries a
# title and a y-axis label so it can be read on its own.
bar_labels = [
    "Unique proteins\nin train set",
    "Unique proteins\nin test set",
    "Species\nin train set",
    "Species\nin test set",
]
bar_heights = [
    Unique_Proteins,
    Unique_Proteins_Test,
    Train_Species,
    Test_Species,
]

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(bar_labels, bar_heights)
ax.set_ylabel("Count")
ax.set_title("Train vs test set: unique proteins and species")
# Render the figure explicitly so the cell shows no BarContainer repr.
plt.show()

Personally, I found it strange that the test set was much larger than the training set. Furthermore, there are organisms that are only present in the test set. <br>
Last but not least, the train set is contained in the test set 🤔

In [None]:
# Check that every training entry ID also occurs among the test entry IDs.
train_ids = set(train_seq["Train_EntryID"])
test_ids = set(test_seq["EntryID"])

test_ids.issuperset(train_ids)

## Submission

Okay, so it is cheating 😂 But, since we know the GOs of the train dataset, I can simply use them. Logically, the probability of these GOs will be 1. 😊

In [None]:
# Read the first three columns of the sample submission and give them names;
# because `names` is passed, every row of the file is read as data.
sample_sub = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/sample_submission.tsv",
    sep="\t",
    usecols=[0, 1, 2],
    names=["EntryID", "DO", "Probablity"],
)
sample_sub.head()

In [None]:
# Build the submission: every protein of the sample submission that also
# appears in train_GO gets each of its known terms with probability 1; every
# other protein gets a single placeholder row (term 0, probability 0).
# NOTE(review): the placeholder rows are kept to preserve the existing output;
# confirm that the competition's submission format accepts a 0 term / 0 score.

# Group the known terms per protein once, instead of scanning all of train_GO
# again for every protein in the sample submission. Within each group the terms
# keep their original row order.
terms_by_entry = train_GO.groupby("EntryID", sort=False)["term"].agg(list).to_dict()

rows = []
for e in sample_sub.EntryID.unique():
    known_terms = terms_by_entry.get(e)
    if known_terms:
        rows.extend(
            {"EntryID": e, "term": term, "probability": 1}
            for term in known_terms
        )
    else:
        rows.append({"EntryID": e, "term": 0, "probability": 0})
submission = pd.DataFrame(rows)
submission.head()

In [None]:
# Write the submission as a tab-separated file without header row or index column.
submission.to_csv(
    'submission.tsv',
    sep='\t',
    header=None,
    index=False,
)
print("File created")

In [None]:
!head 'submission.tsv'

-----------------------------

I am curious how much this will score. 😂 Anyway, there are still proteins with unknown GOs, so <br> 
to be continued... (I need to learn obonet)