In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from collections import Counter

# Read the tab-separated training annotation table into a DataFrame.
train_terms = pd.read_csv('/home/data/Train/train_terms.tsv', sep='\t')

# Overall size, a preview of the first rows, and the column names.
print("Train terms shape:", train_terms.shape)
print(train_terms.head(5))
print("\nColumns:", list(train_terms.columns))

# Number of distinct values in the 'EntryID' and 'term' columns.
for _label, _column in (("\nUnique proteins:", 'EntryID'),
                        ("Unique GO terms:", 'term')):
    print(_label, train_terms[_column].nunique())

# Row count for each distinct value of the 'aspect' column.
print("\nAspect distribution:")
aspect_counts = train_terms['aspect'].value_counts()
print(aspect_counts)

Train terms shape: (537027, 3)
  EntryID        term aspect
0  Q5W0B1  GO:0000785      C
1  Q5W0B1  GO:0004842      F
2  Q5W0B1  GO:0051865      P
3  Q5W0B1  GO:0006275      P
4  Q5W0B1  GO:0006513      P

Columns: ['EntryID', 'term', 'aspect']

Unique proteins: 82404
Unique GO terms: 26125

Aspect distribution:
aspect
P    250805
C    157770
F    128452
Name: count, dtype: int64


In [3]:
# Check IA weights (Information Accretion).
# The file is read without a header row; the two columns are named here.
ia_df = pd.read_csv('/home/data/IA.tsv', sep='\t', header=None, names=['term', 'ia'])
print("IA weights shape:", ia_df.shape)
print(ia_df.head())
print("\nIA statistics:")
print(ia_df['ia'].describe())

# Check sample submission format.
# NOTE: `csv` is not referenced in this cell; the import is kept in place in
# case later cells rely on it.
import csv
# Malformed rows are still skipped (best-effort load), but 'warn' reports each
# one instead of dropping it silently, so the printed shape cannot quietly
# under-count the rows in the file.
sample_sub = pd.read_csv('/home/data/sample_submission.tsv', sep='\t', header=None,
                          names=['protein_id', 'go_term', 'confidence'],
                          on_bad_lines='warn')
print("\nSample submission shape:", sample_sub.shape)
print(sample_sub.head())

IA weights shape: (40122, 2)
         term        ia
0  GO:0000001  0.000000
1  GO:0000002  2.849666
2  GO:0000011  0.137504
3  GO:0000012  6.038630
4  GO:0000017  0.514573

IA statistics:
count    40122.000000
mean         2.647186
std          3.191901
min          0.000000
25%          0.000000
50%          1.201634
75%          4.584963
max         15.879703
Name: ia, dtype: float64

Sample submission shape: (20000, 3)
   protein_id     go_term  confidence
0  A0A0C5B5G6  GO:0000001       0.123
1  A0A0C5B5G6  GO:0000002       0.456
2  A0A1B0GTW7  GO:0000001       0.123
3  A0A1B0GTW7  GO:0000002       0.456
4      A0JNW5  GO:0000001       0.123


In [None]:
# Parse the test superset FASTA into a list of records and report its size.
test_seqs = [record for record in SeqIO.parse('/home/data/Test/testsuperset.fasta', 'fasta')]
print(f"Test superset proteins: {len(test_seqs)}")

# Same for the training FASTA.
train_seqs = [record for record in SeqIO.parse('/home/data/Train/train_sequences.fasta', 'fasta')]
print(f"Train proteins: {len(train_seqs)}")

# Length of every training sequence, followed by summary statistics.
train_lens = [len(record.seq) for record in train_seqs]
shortest, longest = min(train_lens), max(train_lens)
print("\nTrain sequence lengths:")
print(f"  Min: {shortest}")
print(f"  Max: {longest}")
print(f"  Mean: {np.mean(train_lens):.1f}")
print(f"  Median: {np.median(train_lens):.1f}")