In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from collections import Counter

# Read the protein -> GO-term annotation table and print a quick overview:
# its shape, the first rows, the column names, how many distinct proteins
# and terms it holds, and the row count per aspect.
print("Loading training data...")
train_terms = pd.read_csv(
    '/home/data/Train/train_terms.tsv',
    sep='\t',
)
print("Train terms shape: {}".format(train_terms.shape))
print(train_terms.head())
print("\nColumns: {}".format(list(train_terms.columns)))

# Cardinality of the two key columns.
print("\nUnique proteins: {}".format(train_terms.EntryID.nunique()))
print("Unique GO terms: {}".format(train_terms.term.nunique()))

# Number of annotation rows for each value of the 'aspect' column.
print("\nAspect distribution:")
print(train_terms.aspect.value_counts())

Loading training data...
Train terms shape: (537027, 3)
  EntryID        term aspect
0  Q5W0B1  GO:0000785      C
1  Q5W0B1  GO:0004842      F
2  Q5W0B1  GO:0051865      P
3  Q5W0B1  GO:0006275      P
4  Q5W0B1  GO:0006513      P

Columns: ['EntryID', 'term', 'aspect']

Unique proteins: 82404
Unique GO terms: 26125

Aspect distribution:


aspect
P    250805
C    157770
F    128452
Name: count, dtype: int64


In [2]:
# How often does each GO term occur in the annotation table?
# `term_counts` is indexed by term and sorted from most to least frequent.
term_counts = train_terms.term.value_counts()

print("GO term frequency statistics:", "Most common terms:", sep="\n")
print(term_counts.iloc[:20])

print("\nTerm frequency distribution:")
print(term_counts.describe())

# Count the terms whose row count exceeds each cut-off; emitted as three
# consecutive lines preceded by one blank line.
print(
    "\n"
    + "\n".join(
        f"Terms appearing in >{cutoff} proteins: {(term_counts > cutoff).sum()}"
        for cutoff in (1000, 500, 100)
    )
)

GO term frequency statistics:
Most common terms:
term
GO:0005515    33713
GO:0005634    13283
GO:0005829    13040
GO:0005886    10150
GO:0005737     9442
GO:0005739     5807
GO:0005654     5065
GO:0016020     3563
GO:0042802     3547
GO:0005576     3241
GO:0005783     2837
GO:0005615     2391
GO:0045944     2319
GO:0070062     2130
GO:0005794     2045
GO:0005730     1789
GO:0042803     1627
GO:0003723     1613
GO:0000122     1551
GO:0009507     1512
Name: count, dtype: int64

Term frequency distribution:
count    26125.000000
mean        20.556057
std        268.143836
min          1.000000
25%          2.000000
50%          4.000000
75%         12.000000
max      33713.000000
Name: count, dtype: float64

Terms appearing in >1000 proteins: 33
Terms appearing in >500 proteins: 82
Terms appearing in >100 proteins: 663


In [4]:
# Survey sequence lengths in the training and test FASTA files.
# Each file is read into a dict mapping record id -> sequence string.
print("Loading protein sequences...")
train_seqs = dict(
    (record.id, str(record.seq))
    for record in SeqIO.parse('/home/data/Train/train_sequences.fasta', 'fasta')
)
print("Number of training sequences: {}".format(len(train_seqs)))

seq_lengths = list(map(len, train_seqs.values()))
print("\nSequence length statistics:")
print(
    f"Min: {min(seq_lengths)}, Max: {max(seq_lengths)}, "
    f"Mean: {np.mean(seq_lengths):.1f}, Median: {np.median(seq_lengths):.1f}"
)

# Same survey for the test superset (no median is reported here).
test_seqs = dict(
    (record.id, str(record.seq))
    for record in SeqIO.parse('/home/data/Test/testsuperset.fasta', 'fasta')
)
print("\nNumber of test sequences: {}".format(len(test_seqs)))
test_lengths = list(map(len, test_seqs.values()))
print(
    f"Test sequence length - Min: {min(test_lengths)}, "
    f"Max: {max(test_lengths)}, Mean: {np.mean(test_lengths):.1f}"
)

Loading protein sequences...


Number of training sequences: 82404

Sequence length statistics:
Min: 3, Max: 35213, Mean: 525.8, Median: 409.0



Number of test sequences: 224309
Test sequence length - Min: 2, Max: 35213, Mean: 429.2


In [5]:
# Inspect the IA (Information Accretion) weights file.
# The file is read without a header row; the two columns are named here.
ia_df = pd.read_csv(
    '/home/data/IA.tsv',
    sep='\t',
    header=None,
    names=['term', 'ia'],
)
print("IA weights shape: {}".format(ia_df.shape))

print("\nIA statistics:")
print(ia_df.ia.describe())

print("\nSample IA values:")
print(ia_df.iloc[:10])

IA weights shape: (40122, 2)

IA statistics:
count    40122.000000
mean         2.647186
std          3.191901
min          0.000000
25%          0.000000
50%          1.201634
75%          4.584963
max         15.879703
Name: ia, dtype: float64

Sample IA values:
         term        ia
0  GO:0000001  0.000000
1  GO:0000002  2.849666
2  GO:0000011  0.137504
3  GO:0000012  6.038630
4  GO:0000017  0.514573
5  GO:0000018  1.879637
6  GO:0000019  3.584963
7  GO:0000022  0.000000
8  GO:0000023  3.311586
9  GO:0000024  0.000000


In [7]:
# Check sample submission format.
# The file is read without a header row, so columns are labelled 0, 1, 2.
#
# Malformed rows are still dropped, but `on_bad_lines='warn'` reports each
# one instead of discarding it silently (as 'skip' did); otherwise the shape
# and unique-protein count printed below could be wrong with no indication.
# NOTE(review): it is not visible from this notebook why the file contains
# rows pandas considers malformed -- confirm against the raw file.
sample_sub = pd.read_csv(
    '/home/data/sample_submission.tsv',
    sep='\t',
    header=None,
    on_bad_lines='warn',
)
print(f"Sample submission shape: {sample_sub.shape}")
print(sample_sub.head(10))
# Column 0 holds the protein identifier in the rows shown by head().
print(f"\nUnique proteins in sample: {sample_sub[0].nunique()}")

Sample submission shape: (20000, 3)
            0           1      2
0  A0A0C5B5G6  GO:0000001  0.123
1  A0A0C5B5G6  GO:0000002  0.456
2  A0A1B0GTW7  GO:0000001  0.123
3  A0A1B0GTW7  GO:0000002  0.456
4      A0JNW5  GO:0000001  0.123
5      A0JNW5  GO:0000002  0.456
6      A0JP26  GO:0000001  0.123
7      A0JP26  GO:0000002  0.456
8      A0PK11  GO:0000001  0.123
9      A0PK11  GO:0000002  0.456

Unique proteins in sample: 10000
