# CAFA 6 Protein Function Prediction - EDA

In [1]:
import pandas as pd
import numpy as np
from collections import Counter

# Read the training annotations from the tab-separated terms file.
train_terms_path = '/home/data/Train/train_terms.tsv'
train_terms = pd.read_csv(train_terms_path, sep='\t')

# Report the table's dimensions and column names before previewing rows.
train_terms_shape = train_terms.shape
train_terms_columns = list(train_terms.columns)
print('Train terms shape:', train_terms_shape)
print('Columns:', train_terms_columns)
print('\nFirst rows:')

# Bare trailing expression so the notebook renders the preview as a table.
train_terms.head()

Train terms shape: (537027, 3)
Columns: ['EntryID', 'term', 'aspect']

First rows:


Unnamed: 0,EntryID,term,aspect
0,Q5W0B1,GO:0000785,C
1,Q5W0B1,GO:0004842,F
2,Q5W0B1,GO:0051865,P
3,Q5W0B1,GO:0006275,P
4,Q5W0B1,GO:0006513,P


In [2]:
# Analyze the data structure
# Headline cardinalities: distinct proteins and distinct GO terms among the annotations.
print('Number of unique proteins:', train_terms['EntryID'].nunique())
print('Number of unique GO terms:', train_terms['term'].nunique())
# Section headers use '\n' (a real newline); '\\n' would print a literal backslash-n.
print('\nAspect distribution (C=Cellular Component, F=Molecular Function, P=Biological Process):')
print(train_terms['aspect'].value_counts())

# Terms per protein: number of annotation rows for each EntryID.
terms_per_protein = train_terms.groupby('EntryID').size()
print('\nTerms per protein stats:')
print(terms_per_protein.describe())

# Most common GO terms, by number of annotation rows.
print('\nTop 20 most common GO terms:')
print(train_terms['term'].value_counts().head(20))

Number of unique proteins: 82404
Number of unique GO terms: 26125
\nAspect distribution (C=Cellular Component, F=Molecular Function, P=Biological Process):
aspect
P    250805
C    157770
F    128452
Name: count, dtype: int64
\nTerms per protein stats:
count    82404.000000
mean         6.517002
std          7.965655
min          1.000000
25%          2.000000
50%          4.000000
75%          8.000000
max        233.000000
dtype: float64
\nTop 20 most common GO terms:
term
GO:0005515    33713
GO:0005634    13283
GO:0005829    13040
GO:0005886    10150
GO:0005737     9442
GO:0005739     5807
GO:0005654     5065
GO:0016020     3563
GO:0042802     3547
GO:0005576     3241
GO:0005783     2837
GO:0005615     2391
GO:0045944     2319
GO:0070062     2130
GO:0005794     2045
GO:0005730     1789
GO:0042803     1627
GO:0003723     1613
GO:0000122     1551
GO:0009507     1512
Name: count, dtype: int64


In [3]:
# Check IA weights distribution
# NOTE(review): header=None assumes IA.tsv has no header row and exactly two
# tab-separated columns (term, weight) — confirm against the file.
ia_df = pd.read_csv('/home/data/IA.tsv', sep='\t', header=None, names=['term', 'ia'])
print('IA weights shape:', ia_df.shape)
print('\nIA weight statistics:')
print(ia_df['ia'].describe())

# Check sample submission format
# NOTE(review): header=None likewise assumes the sample submission has no header
# row and three tab-separated columns — confirm against the file.
sample_sub = pd.read_csv(
    '/home/data/sample_submission.tsv',
    sep='\t',
    header=None,
    names=['protein_id', 'go_term', 'confidence'],
)
print('\nSample submission shape:', sample_sub.shape)
print('\nSample submission head:')
print(sample_sub.head())
print('\nUnique proteins in sample submission:', sample_sub['protein_id'].nunique())

In [4]:
# Check test set size
from Bio import SeqIO

# Only the first SAMPLE_SIZE records are read, so the count reported below is
# the size of the sample, not the size of the whole test set.
SAMPLE_SIZE = 1000

# Count test sequences (up to SAMPLE_SIZE) and record each sequence length.
test_count = 0
test_lengths = []
for record in SeqIO.parse('/home/data/Test/testsuperset.fasta', 'fasta'):
    test_count += 1
    test_lengths.append(len(record.seq))
    if test_count >= SAMPLE_SIZE:  # Stop once the sample is full
        break

print(f'Test sequences (sampled): {test_count}')
print(f'Sequence length stats (first {SAMPLE_SIZE}):')
print(f'  Mean: {np.mean(test_lengths):.1f}')
print(f'  Min: {np.min(test_lengths)}')
print(f'  Max: {np.max(test_lengths)}')