# Introduction

You are given a list of proteins (with identifiers), their species (taxon ID), and for training proteins, their known GO term annotations. The Gene Ontology (go-basic.obo) provides the vocabulary of possible functions.

Output:
For each target (unannotated) protein we need to predict one or more GO term IDs that describe its biological function.

In [None]:
!pip install Bio
from Bio import SeqIO
import pandas as pd
import plotly.express as px
import plotly.io as pio

!pip install pronto
from pronto import Ontology
!pip install goatools
import matplotlib.pyplot as plt
import squarify  
from goatools.obo_parser import GODag
import seaborn as sns

In [None]:
# Map each FASTA record's id to its sequence as a plain string.
train_sequences = {
    entry.id: str(entry.seq)
    for entry in SeqIO.parse(
        "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta",
        "fasta",
    )
}


In [None]:
import pandas as pd

# Per-protein taxonomy table.
# NOTE(review): the second column is labelled "go_id" here, yet a later cell
# loads this same file with the label "taxon_id". The label is left as-is
# because the value-counts cell below indexes the frame by "go_id".
train_taxanomy = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
    sep="\t",
    names=["protein_id", "go_id"],
)


In [None]:
# Preview the first rows with the notebook's rich table display rather than
# print(), which falls back to the plain-text repr of the whole frame.
train_taxanomy.head()

In [None]:

# Long-format annotation table: one row per (protein, GO term) pair.
# The same file is loaded further down with three columns
# (protein_id, go_id, aspect). Supplying only two names for a three-field file
# makes pandas use the first field as the index and shifts the remaining
# labels, so "protein_id" would actually hold GO ids. Name all three columns.
# NOTE(review): names= assumes the TSV has no header row - confirm.
train_terms = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
    sep="\t",
    names=["protein_id", "go_id", "aspect"],
)

In [None]:
# Preview the first rows with the notebook's rich table display rather than
# print(), which falls back to the plain-text repr of the whole frame.
train_terms.head()

In [None]:
# Number of proteins per value of the taxonomy table's second column.
# That column is labelled "go_id" by the loader cell, but the same file is
# later read with the label "taxon_id", so these are per-taxon counts.
tax_counts = train_taxanomy.loc[:, "go_id"].value_counts()

# value_counts() is already sorted descending, so this shows the ten most frequent.
tax_counts.iloc[:10]


| Taxonomy ID | Scientific name                   | Common name               | Kingdom  |
| ----------- | --------------------------------- | ------------------------- | -------- |
| **9606**    | *Homo sapiens*                    | Human                     | Animalia |
| **10090**   | *Mus musculus*                    | Mouse                     | Animalia |
| **3702**    | *Arabidopsis thaliana*            | Thale cress (plant model) | Plantae  |
| **559292**  | *Saccharomyces cerevisiae* S288C  | Baker’s yeast             | Fungi    |
| **10116**   | *Rattus norvegicus*               | Rat                       | Animalia |
| **284812**  | *Schizosaccharomyces pombe* 972h− | Fission yeast             | Fungi    |
| **83333**   | *Escherichia coli* K-12           | E. coli (bacterium)       | Bacteria |
| **7227**    | *Drosophila melanogaster*         | Fruit fly                 | Animalia |
| **6239**    | *Caenorhabditis elegans*          | Nematode (worm)           | Animalia |
| **83332**   | *Mycobacterium tuberculosis* H37Rv | M. tuberculosis (bacterium) | Bacteria |


In [None]:


fasta_file = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta"

# Materialise every record so the list can be traversed more than once.
seq_records = list(SeqIO.parse(fasta_file, "fasta"))

# One row per FASTA record: the raw record id and the sequence as a plain string.
sequences_df = pd.DataFrame(
    [(rec.id, str(rec.seq)) for rec in seq_records],
    columns=["protein_id", "sequence"],
)


In [None]:
# Protein -> taxon id lookup.
# NOTE(review): names= assumes the TSV files have no header row - confirm.
tax_df = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
    sep="\t",
    names=["protein_id", "taxon_id"],
)

# Long-format annotations: one (protein, GO term, aspect) triple per row.
terms_df = pd.read_csv(
    "/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
    sep="\t",
    names=["protein_id", "go_id", "aspect"],
)


In [None]:
# Collapse the long annotation table to one row per protein, with that
# protein's GO ids gathered into a Python list.
go_agg = (
    terms_df
    .groupby("protein_id")["go_id"]
    .agg(list)
    .reset_index()
)


In [None]:
# Flatten each protein's list of GO ids into one comma-separated string.
# The isinstance guard keeps the cell idempotent: on a re-run the values are
# already strings, and ",".join on a string would insert a comma between
# every character ("G,O,:,0,...") and silently corrupt the column.
go_agg["go_id"] = go_agg["go_id"].apply(
    lambda terms: ",".join(terms) if isinstance(terms, list) else terms
)


In [None]:
# Keep only the second "|"-delimited field of each FASTA record id so it can be
# joined against the protein_id columns of the taxonomy/terms tables
# (presumably ids look like "db|ACCESSION|NAME" - confirm against the FASTA).
# Ids that contain no "|" are left unchanged: this makes the cell safe to
# re-run (a second pass used to raise IndexError on the already-trimmed ids).
sequences_df["protein_id"] = sequences_df["protein_id"].apply(
    lambda raw_id: raw_id.split("|")[1] if "|" in raw_id else raw_id
)


In [None]:
# Attach taxon id and the comma-joined GO ids to every sequence row.
# Left joins keep every sequence, even when a lookup table has no match.
merged_df = (
    sequences_df
    .merge(tax_df, on="protein_id", how="left")
    .merge(go_agg, on="protein_id", how="left")
)


In [None]:
# Inspect the merged table: sequence, taxon_id and comma-joined go_id per protein_id.
merged_df

In [None]:
# Long-format view of merged_df: drop the sequence column, split the
# comma-joined GO ids back into one row per (protein, GO id), then look up the
# aspect of each pair in terms_df.
merged_expl = (
    merged_df
    .drop(columns=["sequence"])
    .assign(go_id=lambda frame: frame["go_id"].str.split(","))
    .explode("go_id")
    .merge(
        terms_df[["protein_id", "go_id", "aspect"]],
        on=["protein_id", "go_id"],
        how="left",
    )
)


In [None]:


# Taxon id -> species name for the five organisms kept in the plots below.
taxon_map = {
    9606: "Homo sapiens",
    10090: "Mus musculus",
    3702: "Arabidopsis thaliana",
    559292: "Saccharomyces cerevisiae",
    284812: "Schizosaccharomyces pombe",
}

# Label each protein with its species; taxa missing from taxon_map become NaN.
merged_df["species"] = merged_df["taxon_id"].map(taxon_map)

# Keep only proteins from the mapped species.
merged_small = merged_df.loc[merged_df["species"].notna()]

# One row per (protein, GO id): split the comma-joined string, then explode.
merged_expl = (
    merged_small
    .assign(go_id=lambda frame: frame["go_id"].str.split(","))
    .explode("go_id")
)

# Parse the ontology with goatools; go_name() below reads this global.
go = GODag("/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo")


def go_name(go_id):
    """Return the name of ``go_id`` in ``go``; ids not found in ``go`` are returned unchanged."""
    if go_id not in go:
        return go_id
    return go[go_id].name


# Add a readable name next to every GO id.
merged_expl["go_name"] = merged_expl["go_id"].apply(go_name)

# Aggregate counts: one row per (species, GO name) pair.
agg_df = merged_expl.groupby(["species", "go_name"]).size().reset_index(name="count")

# Keep only the 20 GO terms with the largest total count across all species.
top_go = agg_df.groupby("go_name")["count"].sum().nlargest(20).index

agg_df = (
    agg_df[agg_df["go_name"].isin(top_go)]
    # squarify expects its sizes in descending order; the groupby result is
    # ordered by (species, go_name), so sort by count before plotting.
    .sort_values("count", ascending=False)
    # Build the treemap label with assign(), which returns a new frame, rather
    # than writing a column into a boolean-filtered slice
    # (that triggered SettingWithCopyWarning).
    .assign(label=lambda frame: frame["species"] + "\n" + frame["go_name"])
)

# Plot treemap
plt.figure(figsize=(16, 10))
squarify.plot(
    sizes=agg_df["count"].tolist(),
    label=agg_df["label"].tolist(),
    alpha=0.8,
)
plt.axis('off')
plt.title("Top 20 GO Terms by Species")
plt.show()


In [None]:

# Re-load the ontology with pronto. This rebinds `go`, which until now
# referred to the goatools GODag.
go = Ontology("/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo")

# Spot-check a single term: id, name and namespace on one line.
term = go['GO:0000785']
print(f"{term.id} {term.name} {term.namespace}")


In [None]:



# Rebuild the per-(protein, GO id) table for the mapped species.
merged_df["species"] = merged_df["taxon_id"].map(taxon_map)
merged_small = merged_df.loc[merged_df["species"].notna()]
merged_expl = (
    merged_small
    .assign(go_id=lambda frame: frame["go_id"].str.split(","))
    .explode("go_id")
)

# Count annotations per (species, GO id).
agg_df = (
    merged_expl
    .groupby(["species", "go_id"])
    .size()
    .reset_index(name="count")
)

# The 20 GO ids with the largest total count across species.
top_go = agg_df.groupby("go_id")["count"].sum().nlargest(20).index

# Keep those ids only, largest counts first so the bars come out ordered.
agg_df = (
    agg_df
    .loc[agg_df["go_id"].isin(top_go)]
    .sort_values("count", ascending=False)
)

# Horizontal grouped bars: one group per GO id, one bar per species.
plt.figure(figsize=(14, 8))
sns.barplot(x="count", y="go_id", hue="species", data=agg_df)

# Title and axis labels are set on the current axes in a single call.
plt.gca().set(title="Top 20 GO IDs by Species", xlabel="Count", ylabel="GO ID")

plt.tight_layout()
plt.show()


In [None]:
# Send plotly figures through the "notebook" renderer.
pio.renderers.default = 'notebook'

# Parse the ontology with pronto; the go_name() defined below reads this global.
go = Ontology("/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo")

# Rebuild the per-(protein, GO id) table for the mapped species.
merged_df["species"] = merged_df["taxon_id"].map(taxon_map)
merged_small = merged_df.loc[merged_df["species"].notna()]
merged_expl = (
    merged_small
    .assign(go_id=lambda frame: frame["go_id"].str.split(","))
    .explode("go_id")
)

def go_name(go_id):
    """Return the name of ``go_id`` looked up in ``go``; fall back to ``go_id`` on KeyError.

    This redefines the earlier ``go_name`` from the treemap cell; here ``go``
    is the pronto Ontology loaded in this cell.
    """
    try:
        name = go[go_id].name
    except KeyError:
        name = go_id
    return name


# Add a readable name next to every GO id.
merged_expl["go_name"] = merged_expl["go_id"].apply(go_name)

# Count annotations per (species, GO name).
agg_df = (
    merged_expl
    .groupby(["species", "go_name"])
    .size()
    .reset_index(name="count")
)

# Restrict to the 20 GO names with the largest total count across species.
top_go = agg_df.groupby("go_name")["count"].sum().nlargest(20).index
agg_df = agg_df.loc[agg_df["go_name"].isin(top_go)]

# Two-level sunburst: species on the inner ring, GO names on the outer ring,
# wedge size given by the annotation count.
fig = px.sunburst(
    agg_df,
    path=["species", "go_name"],
    values="count",
    color="species",
)
fig.show()