# CAFA 6 Protein Function Prediction

---

This notebook was automatically generated by Alexandria with comprehensive research data.


## 1. Setup & Imports

Import the required libraries, fix the random seeds, and select the compute device.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

# Pin the PyTorch and NumPy global RNGs so repeated runs draw the same numbers
torch.manual_seed(42)
np.random.seed(42)

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## 2. Load Dataset

Loading dataset: **cafa-6-protein-function-prediction**

Competition: `cafa-6-protein-function-prediction`

In [None]:
from pathlib import Path
import pandas as pd
import os

# Setup: directory the competition files are expected to be mounted under
DATA_PATH = Path('/kaggle/input/cafa-6-protein-function-prediction')
print(f'üìÅ Data path: {DATA_PATH}')
print(f'üìÅ Path exists: {DATA_PATH.exists()}')

# List all files.
# `all_files` starts out empty so the filters below still run (and simply find
# nothing) when the dataset is not attached, instead of raising NameError.
all_files = []
if DATA_PATH.exists():
    # sorted() keeps the listing order reproducible between runs
    all_files = sorted(DATA_PATH.glob('**/*'))
    print(f'\nüìä Found {len(all_files)} total files/folders:')
    for f in all_files:
        print(f'  - {f.relative_to(DATA_PATH)}')
else:
    print(f'‚ùå Data path does not exist')

# Identify TSV files (regular files only, so a directory named *.tsv is skipped)
tsv_files = [f for f in all_files if f.is_file() and f.suffix.lower() == '.tsv']
print(f'\nüìù Found {len(tsv_files)} TSV files:')
for f in tsv_files:
    print(f'  - {f.name}')

# Load and inspect TSV files (train/test splits if available).
# A file that cannot be parsed is reported and skipped; it does not stop the cell.
for tsv_file in tsv_files:
    print(f'\nüîç Inspecting file: {tsv_file.name}')
    try:
        df = pd.read_csv(tsv_file, sep='\t')
        print(f'  Shape: {df.shape}')
        print(f'  Columns: {list(df.columns)}')
        print(f'  Sample data:')
        display(df.head())
        print(f'  Info:')
        df.info()
    except Exception as e:
        print(f'  ‚ùå Could not load {tsv_file.name}: {e}')

# Attempt to identify train/test splits by filename.
# These two lists are consumed by the EDA and preprocessing cells further down.
train_files = [f for f in tsv_files if 'train' in f.name.lower()]
test_files = [f for f in tsv_files if 'test' in f.name.lower() or 'val' in f.name.lower()]

if train_files:
    print(f'\nüü¢ Train files:')
    for f in train_files:
        print(f'  - {f.name}')
if test_files:
    print(f'\nüîµ Test/Validation files:')
    for f in test_files:
        print(f'  - {f.name}')

## 3. Exploratory Data Analysis

**Analyzing the competition data structure**

In [None]:
# Exploratory Data Analysis
#
# Re-reads every TSV listed in `train_files` / `test_files` (built by the data
# loading cell) and prints/plots per-table summaries: schema, missing values,
# sequence lengths, label frequencies, GO terms per row, duplicate counts and
# numeric correlations. Any failure is reported with a traceback instead of
# stopping the notebook.
try:
    print('üîß === EXPLORATORY DATA ANALYSIS ===\n')

    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    import numpy as np

    # Helper for displaying multiple dataframes
    from IPython.display import display

    def _cols_containing(frame, *keywords):
        """Return the columns of `frame` whose lower-cased name contains any keyword."""
        return [col for col in frame.columns
                if any(key in str(col).lower() for key in keywords)]

    # Load train/test DataFrames from identified files
    dfs = {}
    for split, files in [('train', train_files), ('test', test_files)]:
        for f in files:
            try:
                df = pd.read_csv(f, sep='\t')
                dfs[f'{split}_{f.stem}'] = df
                print(f'‚úÖ Loaded {split} file: {f.name} | shape: {df.shape}')
            except Exception as e:
                print(f'‚úó Could not load {split} file {f.name}: {e}')

    # Show basic info for each dataframe
    for name, df in dfs.items():
        print(f'\nüìÑ DataFrame: {name}')
        print(f'  Shape: {df.shape}')
        print(f'  Columns: {list(df.columns)}')
        print('  Sample rows:')
        display(df.head())
        print('  Info:')
        df.info()

    # Analyze column types and missing values
    for name, df in dfs.items():
        print(f'\nüîé Missing values in {name}:')
        print(df.isnull().sum())

    # Distribution of sequence lengths (if sequence column exists)
    for name, df in dfs.items():
        for seq_col in _cols_containing(df, 'sequence'):
            print(f'\nüìä Sequence length distribution in {name} [{seq_col}]:')
            # astype(str) keeps a stray non-string value from aborting the whole cell
            seq_lengths = df[seq_col].dropna().astype(str).str.len()
            print(seq_lengths.describe())
            plt.figure(figsize=(8,4))
            sns.histplot(seq_lengths, bins=50, kde=True)
            plt.title(f'Sequence Length Distribution: {name} [{seq_col}]')
            plt.xlabel('Sequence Length')
            plt.ylabel('Count')
            plt.show()

    # Distribution of target labels (if any label columns exist)
    for name, df in dfs.items():
        for label_col in _cols_containing(df, 'label', 'go_', 'function'):
            print(f'\nüìä Label distribution in {name} [{label_col}]:')
            # value_counts sorts by frequency, so head(20) is the 20 most common labels
            top_labels = df[label_col].value_counts(dropna=False).head(20)
            print(top_labels)
            plt.figure(figsize=(8,4))
            sns.barplot(x=top_labels.index.astype(str), y=top_labels.values)
            plt.title(f'Label Distribution: {name} [{label_col}] (Top 20)')
            plt.xlabel('Label')
            plt.ylabel('Count')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.show()

    # If multi-label columns (e.g., GO terms as lists/strings), analyze number of labels per sample
    for name, df in dfs.items():
        for go_col in _cols_containing(df, 'go_', 'function'):
            if df[go_col].dtype == object:
                print(f'\nüìä Number of GO terms per sample in {name} [{go_col}]:')
                # Guess the delimiter: semicolon first, then comma, otherwise a space
                sample = df[go_col].dropna().astype(str)
                if sample.str.contains(';', regex=False).any():
                    splitter = ';'
                elif sample.str.contains(',', regex=False).any():
                    splitter = ','
                else:
                    splitter = ' '
                n_terms = sample.apply(lambda x: len([t for t in x.split(splitter) if t.strip()]))
                print(n_terms.describe())
                plt.figure(figsize=(8,4))
                sns.histplot(n_terms, bins=30, kde=False)
                plt.title(f'GO Terms per Sample: {name} [{go_col}]')
                plt.xlabel('Number of GO Terms')
                plt.ylabel('Count')
                plt.show()

    # Check for duplicate sequences or IDs.
    # Duplicates are counted with duplicated() (rows repeating an earlier value);
    # `total - nunique()` would also count every missing value as a duplicate
    # because nunique() ignores NaN.
    for name, df in dfs.items():
        n_total = df.shape[0]
        for id_col in _cols_containing(df, 'id'):
            n_unique = df[id_col].nunique()
            n_dup = int(df[id_col].duplicated().sum())
            print(f'\nüÜî {name}: {id_col} - {n_unique} unique / {n_total} total ({n_dup} duplicates)')
        for seq_col in _cols_containing(df, 'sequence'):
            n_unique = df[seq_col].nunique()
            n_dup = int(df[seq_col].duplicated().sum())
            print(f'üß¨ {name}: {seq_col} - {n_unique} unique / {n_total} total ({n_dup} duplicates)')

    # Correlation heatmap for numeric columns (if any)
    for name, df in dfs.items():
        num_cols = df.select_dtypes(include=[np.number]).columns
        # A correlation matrix needs at least two numeric columns to be informative
        if len(num_cols) > 1:
            print(f'\nüìà Correlation heatmap for {name}:')
            plt.figure(figsize=(8,6))
            sns.heatmap(df[num_cols].corr(), annot=True, fmt='.2f', cmap='coolwarm')
            plt.title(f'Numeric Feature Correlation: {name}')
            plt.show()

    print('\n‚úÖ Exploratory Data Analysis complete!')

except Exception as e:
    print(f'‚úó Error in Exploratory Data Analysis: {e}')
    import traceback
    traceback.print_exc()

## 4. Data Preprocessing

**Competition:** cafa-6-protein-function-prediction

**Note:** Following research-based implementation strategy

In [None]:
# Data Preprocessing
#
# Loads every TSV in `train_files` / `test_files`, standardises column names,
# removes exact duplicate rows and invalid sequences, normalises GO-term
# columns to a sorted, de-duplicated, semicolon-separated string, plots the
# number of GO terms per row and writes `<name>_cleaned.tsv` copies to a
# writable output directory.
try:
    print('üîß === DATA PREPROCESSING ===\n')

    from pathlib import Path

    # Suppress warnings for cleaner output
    warnings.filterwarnings('ignore')

    # Kaggle mounts /kaggle/input read-only, so cleaned files cannot be saved
    # next to the raw data. Use /kaggle/working when it exists, otherwise the
    # current working directory (e.g. for a local run).
    OUTPUT_PATH = Path('/kaggle/working')
    if not OUTPUT_PATH.is_dir():
        OUTPUT_PATH = Path.cwd()

    def _normalize_go_terms(value):
        """Return the terms in `value` as a sorted, unique, ';'-joined string.

        Commas and spaces are treated as delimiters in addition to semicolons;
        empty tokens are dropped, so an empty input yields ''.
        """
        tokens = str(value).replace(',', ';').replace(' ', ';').split(';')
        return ';'.join(sorted({t.strip() for t in tokens if t.strip()}))

    # Load all train/test TSV files into dataframes.
    # dict.fromkeys drops repeats (a file matching both 'train' and 'val' would
    # otherwise be read twice) while keeping the original order.
    dfs = {}
    for f in dict.fromkeys(train_files + test_files):
        name = f.stem
        print(f'Loading: {name}')
        df = pd.read_csv(f, sep='\t')
        dfs[name] = df
        print(f'  Shape: {df.shape}')

    # Clean and preprocess each dataframe
    for name, df in dfs.items():
        print(f'\nüßπ Preprocessing: {name}')

        # Standardize column names
        df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]

        id_cols = [col for col in df.columns if 'id' in col]
        seq_cols = [col for col in df.columns if 'sequence' in col]

        # Remove rows that are exact duplicates of an earlier row.
        # De-duplicating on an ID column alone is deliberately NOT done: in a
        # long-format table (one row per ID/term pair) it would keep only the
        # first annotation of every ID and silently discard the rest.
        before = df.shape[0]
        df = df.drop_duplicates()
        print(f'  Removed {before - df.shape[0]} duplicate rows')
        for id_col in id_cols:
            n_repeated = int(df[id_col].duplicated().sum())
            print(f'  {id_col}: {n_repeated} rows repeat an earlier value (kept)')

        # NOTE(review): this also drops distinct IDs that share a sequence;
        # confirm that is intended for test tables.
        for seq_col in seq_cols:
            before = df.shape[0]
            df = df.drop_duplicates(subset=[seq_col])
            after = df.shape[0]
            print(f'  Removed {before-after} duplicate sequences')

        # Remove rows with missing or invalid sequences (only the 20 standard
        # amino-acid letters are accepted by the pattern below)
        for seq_col in seq_cols:
            before = df.shape[0]
            df = df[df[seq_col].notnull() & df[seq_col].str.match('^[ACDEFGHIKLMNPQRSTVWY]+$', na=False)]
            after = df.shape[0]
            print(f'  Removed {before-after} rows with invalid/missing sequences')

        # Work on an independent copy so the column assignments below never
        # write into a filtered view of the raw frame
        df = df.copy()

        # Fill missing GO terms with empty string, normalize delimiters to
        # semicolon and remove duplicate GO terms per sample
        go_cols = [col for col in df.columns if 'go' in col]
        for go_col in go_cols:
            df[go_col] = df[go_col].fillna('').apply(_normalize_go_terms)

        # Reset index after cleaning
        df = df.reset_index(drop=True)
        dfs[name] = df

        print(f'  Final shape: {df.shape}')

    # Visualize GO term distribution after cleaning
    for name, df in dfs.items():
        go_cols = [col for col in df.columns if 'go' in col]
        for go_col in go_cols:
            sample = df[go_col].dropna().astype(str)
            n_terms = sample.apply(lambda x: len([t for t in x.split(';') if t.strip()]))
            print(f'\nGO Terms per Sample: {name} [{go_col}]')
            print(n_terms.describe())
            plt.figure(figsize=(8,4))
            sns.histplot(n_terms, bins=30, kde=False)
            plt.title(f'GO Terms per Sample: {name} [{go_col}]')
            plt.xlabel('Number of GO Terms')
            plt.ylabel('Count')
            plt.show()

    # Save cleaned dataframes for downstream tasks
    for name, df in dfs.items():
        out_path = OUTPUT_PATH / f'{name}_cleaned.tsv'
        df.to_csv(out_path, sep='\t', index=False)
        print(f'  Saved cleaned dataframe: {out_path}')

    print('\n‚úÖ Data Preprocessing complete!')

except Exception as e:
    print(f'‚úó Error in Data Preprocessing: {e}')
    import traceback
    traceback.print_exc()

## 5. Model Architecture

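**Approach:** Neural network baseline — a 1D-CNN sequence encoder and a GCN structure encoder whose features are concatenated and fed to a multi-label GO-term classifier.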

In [None]:
# Model Architecture
try:
    print('üîß === MODEL ARCHITECTURE ===\n')
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    # =========================
    # Protein Function Prediction Model
    # =========================
    # Reference: ProteInfer[1], DPFunc[2], TAWFN[3], DeepFRI[4]
    # - Sequence encoder: 1D CNN + optional protein language model embedding
    # - Structure encoder: Graph Convolutional Network (GCN)
    # - Fusion: Adaptive weighted sum of sequence and structure features
    # - Output: Multi-label classification (GO terms)

    # --- Sequence Encoder: 1D CNN ---
    class SequenceCNN(nn.Module):
        """Multi-kernel 1D CNN that turns a token sequence into one feature vector.

        Tokens are embedded, passed through one Conv1d per kernel size, and each
        branch is max-pooled over the sequence axis before the branches are
        concatenated.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_dim: embedding width (input channels of every convolution).
            cnn_channels: output channels of every convolution.
            kernel_sizes: iterable of convolution kernel widths.
            dropout: dropout probability applied to the pooled features.

        Forward input:  integer tensor of shape (batch, seq_len).
        Forward output: float tensor of shape (batch, cnn_channels * len(kernel_sizes)).
        """

        def __init__(self, vocab_size, embed_dim, cnn_channels, kernel_sizes, dropout=0.2):
            super().__init__()
            # Index 0 is declared as the padding token of the embedding table
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
            self.convs = nn.ModuleList([
                nn.Conv1d(embed_dim, cnn_channels, k, padding=k//2)
                for k in kernel_sizes
            ])
            self.dropout = nn.Dropout(dropout)

        def forward(self, x):
            # x: (batch, seq_len)
            x = self.embedding(x)  # (batch, seq_len, embed_dim)
            x = x.transpose(1, 2)  # (batch, embed_dim, seq_len)
            # Pool every branch on its own BEFORE concatenating: with padding=k//2
            # an even kernel produces seq_len + 1 positions, so the raw feature
            # maps of mixed kernel sizes cannot be concatenated along channels.
            # For odd kernels this gives exactly the same values as pooling the
            # concatenated maps.
            # NOTE(review): padded positions are not masked before the max.
            pooled = [
                F.adaptive_max_pool1d(F.relu(conv(x)), 1).squeeze(-1)  # (batch, cnn_channels)
                for conv in self.convs
            ]
            x = torch.cat(pooled, dim=1)  # (batch, cnn_channels * len(kernel_sizes))
            x = self.dropout(x)
            return x

    # --- Structure Encoder: Graph Convolutional Network (GCN) ---
    class GCNLayer(nn.Module):
        """One graph-convolution step: aggregate with `adj`, project, apply ReLU."""

        def __init__(self, in_dim, out_dim):
            super().__init__()
            self.linear = nn.Linear(in_dim, out_dim)

        def forward(self, x, adj):
            """Return relu(linear(adj @ x)).

            x:   (batch, n_nodes, in_dim) node features
            adj: (batch, n_nodes, n_nodes) adjacency, used as given
            """
            # Batched matrix product mixes each node's features with its neighbours'
            aggregated = torch.bmm(adj, x)  # (batch, n_nodes, in_dim)
            return F.relu(self.linear(aggregated))

    class StructureGCN(nn.Module):
        """Stack of GCNLayer blocks followed by mean pooling over the node axis."""

        def __init__(self, in_dim, hidden_dim, n_layers=2, dropout=0.2):
            super().__init__()
            # Only the first layer reads `in_dim`; every later layer maps hidden -> hidden
            input_dims = ([in_dim] + [hidden_dim] * n_layers)[:-1]
            self.layers = nn.ModuleList([GCNLayer(d_in, hidden_dim) for d_in in input_dims])
            self.dropout = nn.Dropout(dropout)

        def forward(self, x, adj):
            """Apply every layer (with dropout after each), then average the nodes."""
            for gcn in self.layers:
                x = self.dropout(gcn(x, adj))
            # Global pooling: (batch, n_nodes, hidden_dim) -> (batch, hidden_dim)
            return x.mean(dim=1)

    # --- Fusion and Output ---
    class ProteinFunctionPredictor(nn.Module):
        """Concatenates sequence (CNN) and structure (GCN) features and emits one logit per class.

        The output is the raw result of the final linear layer; no sigmoid or
        softmax is applied here.
        """

        def __init__(self, vocab_size, seq_embed_dim, cnn_channels, kernel_sizes,
                     gcn_in_dim, gcn_hidden_dim, gcn_layers, n_classes, dropout=0.3):
            super().__init__()
            self.seq_encoder = SequenceCNN(vocab_size, seq_embed_dim, cnn_channels, kernel_sizes, dropout)
            self.struct_encoder = StructureGCN(gcn_in_dim, gcn_hidden_dim, gcn_layers, dropout)
            # Width of the concatenated vector: one CNN block per kernel size plus the GCN output
            seq_width = cnn_channels * len(kernel_sizes)
            fusion_dim = seq_width + gcn_hidden_dim
            self.fusion = nn.Sequential(
                nn.Linear(fusion_dim, fusion_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            )
            self.classifier = nn.Linear(fusion_dim, n_classes)

        def forward(self, seq, struct_feats, adj):
            """Encode both modalities, fuse them, and return (batch, n_classes) logits."""
            modalities = [
                self.seq_encoder(seq),
                self.struct_encoder(struct_feats, adj),
            ]
            joint = self.fusion(torch.cat(modalities, dim=1))
            return self.classifier(joint)

    # =========================
    # Example: Model Instantiation
    # =========================
    # These values should be set according to your data preprocessing pipeline
    vocab_size = 26  # 20 amino acids + special tokens
    seq_embed_dim = 128
    cnn_channels = 64
    kernel_sizes = [3, 5, 7]
    gcn_in_dim = 32   # e.g., residue-level features (set accordingly)
    gcn_hidden_dim = 64
    gcn_layers = 2
    n_classes = 500   # Number of GO terms (set according to your label binarizer)
    dropout = 0.3

    model = ProteinFunctionPredictor(
        vocab_size=vocab_size,
        seq_embed_dim=seq_embed_dim,
        cnn_channels=cnn_channels,
        kernel_sizes=kernel_sizes,
        gcn_in_dim=gcn_in_dim,
        gcn_hidden_dim=gcn_hidden_dim,
        gcn_layers=gcn_layers,
        n_classes=n_classes,
        dropout=dropout
    ).to(device)

    print(model)
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nTotal trainable parameters: {n_params:,}")

    # Dummy input dimensions shared by BOTH optional visualisations below.
    # They are defined outside the try blocks so the torchviz section does not
    # fail with a NameError when torchsummary is not installed.
    batch_size = 2
    seq_len = 512
    n_nodes = 512

    # =========================
    # Visualization: Model Architecture
    # =========================
    # Optional: skipped (with the reason printed) when torchsummary is missing or errors out
    try:
        from torchsummary import summary
        # Example dummy inputs
        seq_input = torch.randint(0, vocab_size, (batch_size, seq_len)).to(device)
        struct_feats = torch.randn(batch_size, n_nodes, gcn_in_dim).to(device)
        adj = torch.eye(n_nodes).unsqueeze(0).repeat(batch_size,1,1).to(device)
        summary(model, [(seq_len,), (n_nodes, gcn_in_dim), (n_nodes, n_nodes)], device=str(device))
    except Exception as e:
        # Report the actual cause instead of hiding it behind a generic message
        print(f"torchsummary not available or failed, skipping summary. ({type(e).__name__}: {e})")

    # =========================
    # Visualize Model Graph (optional)
    # =========================
    # Optional: skipped (with the reason printed) when torchviz/graphviz is missing or errors out
    try:
        from torchviz import make_dot
        dummy_seq = torch.randint(0, vocab_size, (1, seq_len)).to(device)
        dummy_struct = torch.randn(1, n_nodes, gcn_in_dim).to(device)
        dummy_adj = torch.eye(n_nodes).unsqueeze(0).to(device)
        out = model(dummy_seq, dummy_struct, dummy_adj)
        dot = make_dot(out, params=dict(model.named_parameters()))
        dot.format = 'png'
        dot.render('protein_function_model_architecture', view=False)
        print("Model graph saved as 'protein_function_model_architecture.png'")
    except Exception as e:
        print(f"torchviz not available or failed, skipping model graph visualization. ({type(e).__name__}: {e})")

    print('‚úÖ Model Architecture complete!')

except Exception as e:
    print(f'‚úó Error in Model Architecture: {e}')
    import traceback
    traceback.print_exc()

## 6. Implementation & Next Steps

**Note:** This section provides guidance, not complete code. Actual implementation depends on competition task.

In [None]:
print('üìã === IMPLEMENTATION GUIDE ===\n')

print('Competition task determines implementation approach\n')
print('Possible approaches:')
print('  - Classification: Train classifier, predict labels')
print('  - Regression: Train regressor, predict values')
print('  - Generation: Generate required outputs')
print('  - Processing: Transform/extract data')

print('\n‚ö†Ô∏è TODO: Implement competition-specific solution')


## 7. Submission

**Generate submission file in competition format**

In [None]:
print('üì§ === SUBMISSION GENERATION ===\n')

print('‚ö†Ô∏è TODO: Check competition submission format')
print('Typical formats: CSV, Parquet, JSON')

# Generic template (uncomment and modify):
# submission = pd.DataFrame({
#     'id': test_ids,
#     'prediction': predictions  # YOUR PREDICTIONS HERE
# })
# submission.to_csv('submission.csv', index=False)
# print('‚úÖ Submission created!')
