# **CAFA6 Data Structures**

In [None]:
!pip install pandas biopython obonet

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

## **train: go-basic**

In [None]:
import obonet

# Parse the Gene Ontology OBO file into a graph keyed by GO term ID.
go_graph = obonet.read_obo('/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo')

print(f"Total number of GO terms (nodes): **{len(go_graph.nodes)}**")
print("---")

# Example: Root GO term (biological_process)
example_go_id = 'GO:0008150'

if example_go_id in go_graph:
    # Node attributes are exposed as a dictionary.
    term_data = go_graph.nodes[example_go_id]

    print(f"🔬 Details for GO Term ID: {example_go_id}")
    print(f"  - Name: {term_data.get('name', 'N/A')}")

    # Show a short preview of the definition; only add an ellipsis when the
    # text was actually truncated (e.g. not for the 'N/A' fallback).
    definition = term_data.get('def', 'N/A')
    preview_len = 60
    ellipsis = '...' if len(definition) > preview_len else ''
    print(f"  - Definition: {definition[:preview_len]}{ellipsis}")

    # Dump the entire attribute dictionary to show every available field.
    print("\n--- All Node Attributes ---")
    print(term_data)

else:
    print(f"Term ID **{example_go_id}** not found in the graph.")

## **train: EntryID, sequences**

In [None]:
from Bio import SeqIO

# Read every record of the training FASTA file into memory.
sequences = list(SeqIO.parse('/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta', 'fasta'))
print('n_sequences:', len(sequences))

if sequences:
    first_record = sequences[0]
    # Number of leading residues to display; the full sequence can be long.
    preview_len = 60
    print(f"EntryID: {first_record.id}")
    print(f"Length: {len(first_record.seq)}")
    # Slice so the output matches the "First characters" label.
    print(f"First characters of the sequence: {first_record.seq[:preview_len]}")
else:
    print("\nNo sequence records found in 'train_sequences.fasta'.")

## **test: EntryID, sequences**

In [None]:
# Read every record of the test-superset FASTA file into memory.
tsequences = list(SeqIO.parse('/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta', 'fasta'))
# Report the size of the test set (not the training list `sequences`).
print('n_sequences:', len(tsequences))

if tsequences:
    first_record = tsequences[0]
    # Number of leading residues to display; the full sequence can be long.
    preview_len = 60
    print(f"EntryID: {first_record.id}")
    print(f"Length: {len(first_record.seq)}")
    # Slice so the output matches the "First characters" label.
    print(f"First characters of the sequence: {first_record.seq[:preview_len]}")
else:
    # Name the file that was actually parsed in this cell.
    print("\nNo sequence records found in 'testsuperset.fasta'.")

## **train: EntryID,GoID,aspect**

In [None]:
# Load the tab-separated training terms table; the first row supplies the
# column names (pandas default header handling).
df_terms = pd.read_csv(
    '/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv',
    sep='\t',
)
# Preview the first ten rows.
print(df_terms.head(10))

## **train: EntryID, SpeciesID**

In [None]:
# Load the tab-separated training taxonomy table. header=None treats the
# first row as data, so columns receive integer labels.
df_taxonomy = pd.read_csv(
    '/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv',
    sep='\t',
    header=None,
)
# Preview the first ten rows.
print(df_taxonomy.head(10))


## **test: SpeciesID,Species**

In [None]:
# Load the tab-separated taxon list of the test superset. header=None treats
# the first row as data, so columns receive integer labels.
df_ttaxonomy = pd.read_csv(
    '/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset-taxon-list.tsv',
    sep='\t',
    header=None,
)
# Preview the first ten rows.
print(df_ttaxonomy.head(10))

## **IA: GoID,weights**

In [None]:
# Load the tab-separated IA table. header=None treats the first row as data,
# so columns receive integer labels.
df_IA = pd.read_csv(
    '/kaggle/input/cafa-6-protein-function-prediction/IA.tsv',
    sep='\t',
    header=None,
)
# Preview the first ten rows.
print(df_IA.head(10))


## **submission: EntryID,GoID,score**

In [None]:
# Load the tab-separated sample submission. header=None treats the first row
# as data; on_bad_lines='skip' drops rows with too many fields instead of
# raising a parse error.
df_submit = pd.read_csv(
    '/kaggle/input/cafa-6-protein-function-prediction/sample_submission.tsv',
    sep='\t',
    header=None,
    on_bad_lines='skip',
)
# Preview the first ten rows.
print(df_submit.head(10))