# EDA 01 – Data Overview

Goals:
- Sanity-check files exist and can be loaded
- Understand the ontology, sequences, labels, and IA (Information Accretion) weights
- Produce quick plots and cached tables for faster iteration

In [None]:
# Imports & Paths
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
import networkx as nx
import obonet

# Global seaborn appearance applied to every figure drawn in this notebook.
sns.set_context("talk")
sns.set_style("whitegrid")

# Fix NumPy's global RNG seed so any random sampling is repeatable.
np.random.seed(42)

# Input files. All locations are relative to the current working directory.
TRAIN_SEQ = Path('Train/train_sequences.fasta')
TRAIN_TERMS = Path('Train/train_terms.tsv')
GO_OBO = Path('Train/go-basic.obo')
IA_TSV = Path('IA.tsv')
TEST_FASTA = Path('Test/testsuperset.fasta')

# Output folders. Each is created immediately (including missing parents);
# an already existing folder is not an error.
FIG_DIR = Path('notebooks/figures')
FIG_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR = Path('data/processed')
PROC_DIR.mkdir(parents=True, exist_ok=True)


def safe_exists(p: Path) -> bool:
    """Report whether *p* exists, treating any failure of the check as absence.

    Args:
        p: Path to probe.

    Returns:
        The result of ``p.exists()``, or ``False`` if that call raises.
    """
    try:
        found = p.exists()
    except Exception:
        # Best-effort probe: a path that cannot be checked counts as missing.
        found = False
    return found


def head_print(df: pd.DataFrame, n: int = 5):
    """Print the first *n* rows of *df* without hiding any columns.

    Args:
        df: Frame to preview.
        n: Number of leading rows to print (default 5).

    Returns:
        None. The preview is written to standard output.
    """
    # Display options applied only while printing: no column limit and a
    # 160-character line width.
    display_opts = ('display.max_columns', None, 'display.width', 160)
    preview = df.head(n)
    with pd.option_context(*display_opts):
        print(preview)

: 

In [None]:
# Versions and file existence checks
# Print the interpreter and library versions, then report which of the
# expected input files are present.
import matplotlib
import Bio

print('Python:', sys.version)

# (label, module) pairs for the libraries that expose __version__ directly.
for version_label, module in (
    ('pandas:', pd),
    ('numpy:', np),
    ('matplotlib:', matplotlib),
    ('biopython:', Bio),
    ('networkx:', nx),
):
    print(version_label, module.__version__)

# obonet is queried defensively: fall back to 'n/a' when it has no __version__.
print('obonet:', getattr(obonet, '__version__', 'n/a'))

for data_path in (TRAIN_SEQ, TRAIN_TERMS, GO_OBO, IA_TSV, TEST_FASTA):
    print(f"Exists {data_path}: ", safe_exists(data_path))

In [None]:
# Training Sequences (FASTA)
# Stream the training FASTA once: record the length of each of the first
# MAX_TRAIN_RECORDS records, keep a short preview table of the first
# PREVIEW_ROWS records, then print summary statistics and plot a histogram.
from itertools import islice
from statistics import mean

MAX_TRAIN_RECORDS = 10000  # cap for speed
PREVIEW_ROWS = 200         # records kept in the preview table
PREVIEW_RESIDUES = 50      # leading residues of each sequence shown in the preview

# Reset every output up front so that re-running this cell with a missing or
# unreadable file cannot leave stale values behind; the train-vs-test cell
# reads `lengths` from the global namespace.
train_seq_df = None
lengths = []
rows = []
try:
    if TRAIN_SEQ.exists():
        # islice stops after exactly MAX_TRAIN_RECORDS records (the previous
        # `if i >= 10000: break` placed after the append read one extra).
        capped_records = islice(SeqIO.parse(str(TRAIN_SEQ), 'fasta'), MAX_TRAIN_RECORDS)
        for i, rec in enumerate(capped_records):
            seq_len = len(rec.seq)
            lengths.append(seq_len)
            if i < PREVIEW_ROWS:
                rows.append({
                    'EntryID': rec.id,
                    'length': seq_len,
                    'sequence': str(rec.seq)[:PREVIEW_RESIDUES],
                })
        train_seq_df = pd.DataFrame(rows)
        print(f"Sampled proteins (preview rows): {len(train_seq_df)}")
        head_print(train_seq_df)
        if lengths:
            print(f"Length stats (first {len(lengths)} seqs): min={min(lengths)}, mean={mean(lengths):.1f}, max={max(lengths)}")
            plt.figure(figsize=(8,4))
            sns.histplot(lengths, bins=50, kde=True)
            plt.title('Train sequence length distribution (sample)')
            plt.xlabel('length'); plt.ylabel('count'); plt.tight_layout(); plt.show()
    else:
        print('Train FASTA not found:', TRAIN_SEQ)
except Exception as e:
    # Best-effort notebook cell: report the problem and let later cells run.
    print('Error reading training FASTA:', e)

In [None]:
# Training Labels (train_terms.tsv)
# Load the annotation table and summarise it: overall shape, number of
# distinct proteins and GO terms, mean annotations per protein, and the 20
# most frequent terms (shown as a table and a bar chart).
labels_df = None
try:
    if TRAIN_TERMS.exists():
        labels_df = pd.read_csv(TRAIN_TERMS, sep='\t', header=0)
        expected_cols = {'EntryID','term'}
        missing = expected_cols - set(labels_df.columns)
        if missing:
            print('Warning: Missing expected columns:', missing)
        print('Labels shape:', labels_df.shape)
        if missing:
            # Every statistic below indexes these columns. Skip them rather
            # than fall into a KeyError that would be reported as a read error.
            print('Skipping label statistics: required columns are absent')
        else:
            print('Unique proteins:', labels_df['EntryID'].nunique())
            print('Unique GO terms:', labels_df['term'].nunique())
            print('Total annotations:', len(labels_df))
            # terms per protein
            terms_per_protein = labels_df.groupby('EntryID').size()
            print('Avg terms per protein:', terms_per_protein.mean())
            # top terms
            top_terms = labels_df['term'].value_counts().head(20)
            display(top_terms)
            plt.figure(figsize=(10,4))
            sns.barplot(x=top_terms.index, y=top_terms.values)
            plt.xticks(rotation=90)
            plt.title('Top 20 GO terms by count')
            plt.tight_layout(); plt.show()
    else:
        print('Labels TSV not found:', TRAIN_TERMS)
except Exception as e:
    # Best-effort notebook cell: report the problem and let later cells run.
    print('Error reading labels TSV:', e)

In [None]:
# GO Ontology (go-basic.obo)
# Parse the ontology into a graph, report its size, tabulate the terms,
# check that the three expected root terms are present and print the mean
# node degree.
go_graph = None
try:
    if not GO_OBO.exists():
        print('GO OBO not found:', GO_OBO)
    else:
        go_graph = obonet.read_obo(str(GO_OBO))
        print('GO nodes:', len(go_graph))
        print('GO edges:', go_graph.number_of_edges())

        # One row per graph node with its name, namespace and obsolete flag.
        # NOTE(review): obonet.read_obo appears to drop obsolete terms by
        # default (ignore_obsolete=True), in which case 'is_obsolete' would
        # always be False here - confirm against the obonet documentation.
        terms = [
            {
                'term': term_id,
                'name': attrs.get('name'),
                'namespace': attrs.get('namespace'),
                'is_obsolete': attrs.get('is_obsolete', 'false') == 'true',
            }
            for term_id, attrs in go_graph.nodes(data=True)
        ]
        terms_df = pd.DataFrame(terms)
        head_print(terms_df)

        # Root term expected for each of the three sub-ontologies.
        roots = {'BPO': 'GO:0008150', 'CCO': 'GO:0005575', 'MFO': 'GO:0003674'}
        print('Root presence:', {aspect: root_id in go_graph for aspect, root_id in roots.items()})

        # Mean of the per-node degree values reported by the graph.
        degs = [node_degree for _, node_degree in go_graph.degree()]
        print('Average degree:', float(np.mean(degs)))
except Exception as e:
    print('Error reading GO OBO:', e)

In [None]:
# IA (Information Accretion)
# Load the per-term IA table and plot its distribution. When the training
# labels are available, also relate IA to how often each term is annotated
# (scatter plot plus Pearson and Spearman correlations).
ia_df = None
try:
    if IA_TSV.exists():
        # The file is read as a headerless two-column TSV: term, IA value.
        ia_df = pd.read_csv(IA_TSV, sep='\t', header=None, names=['term','ia'])
        print(ia_df.describe())
        plt.figure(figsize=(6,4))
        sns.histplot(ia_df['ia'], bins=50, kde=True)
        plt.title('IA distribution'); plt.tight_layout(); plt.show()
        if 'labels_df' in globals() and labels_df is not None:
            term_freq = labels_df['term'].value_counts().rename_axis('term').reset_index(name='freq')
            # Left join keeps every training term; terms absent from IA.tsv get NaN.
            merged = term_freq.merge(ia_df, on='term', how='left')
            plt.figure(figsize=(6,4))
            sns.scatterplot(x=np.log10(merged['freq']+1), y=merged['ia'])
            plt.xlabel('log10(term frequency + 1)'); plt.ylabel('IA')
            plt.title('IA vs term frequency (train)'); plt.tight_layout(); plt.show()
            # correlations
            from scipy.stats import pearsonr, spearmanr
            valid = merged.dropna(subset=['ia'])
            # pearsonr needs at least two points. A single-row frame passed
            # the old `not valid.empty` check and raised a ValueError that was
            # then reported as an IA.tsv read error.
            if len(valid) >= 2:
                log_freq = np.log10(valid['freq']+1)
                pr = pearsonr(log_freq, valid['ia'])
                sr = spearmanr(log_freq, valid['ia'])
                print('Pearson r, p:', pr)
                print('Spearman r, p:', sr)
            else:
                print('Too few terms with IA values to compute correlations:', len(valid))
    else:
        print('IA.tsv not found:', IA_TSV)
except Exception as e:
    # Best-effort notebook cell: report the problem and let later cells run.
    print('Error reading IA.tsv:', e)

In [None]:
# Test Superset (FASTA) vs Train lengths
# Record the sequence lengths of the first MAX_TEST_RECORDS test records and,
# when a train length sample is available, overlay both distributions.
from itertools import islice

MAX_TEST_RECORDS = 10000  # cap for speed

try:
    test_lengths = []
    if TEST_FASTA.exists():
        # islice stops after exactly MAX_TEST_RECORDS records (the previous
        # `if i >= 10000: break` placed after the append read one extra).
        for rec in islice(SeqIO.parse(str(TEST_FASTA), 'fasta'), MAX_TEST_RECORDS):
            test_lengths.append(len(rec.seq))
        print('Test sample count:', len(test_lengths))
    else:
        print('Test FASTA not found:', TEST_FASTA)

    # Overlay vs train sample if available. `lengths` is looked up in the
    # global namespace because it only exists once the training cell has run.
    if 'lengths' in globals() and lengths and test_lengths:
        plt.figure(figsize=(8,4))
        # stat='density' normalises each histogram so samples of different size are comparable.
        sns.histplot(lengths, color='C0', label='train (sample)', bins=50, kde=True, stat='density', alpha=0.5)
        sns.histplot(test_lengths, color='C1', label='test (sample)', bins=50, kde=True, stat='density', alpha=0.5)
        plt.legend(); plt.title('Length distributions: train vs test (samples)')
        plt.xlabel('length'); plt.tight_layout(); plt.show()
except Exception as e:
    # Best-effort notebook cell: report the problem and let later cells run.
    print('Error reading test FASTA:', e)

In [None]:
# Takeaways & Next Steps
# Print the follow-up checklist, one item per line.
next_steps = (
    '- Inspect outliers in sequence length and composition',
    '- Build label propagation utility (child -> ancestors)',
    '- Construct baseline with term frequency prior',
    '- Prepare train/val split and caching',
)
for step in next_steps:
    print(step)