# Data Exploration: Protein Atom Exposure Dataset

This notebook explores the protein atom exposure dataset and prepares it for GNN training.

## Contents
1. Load dataset overview
2. Load depth indexes (ground truth)
3. Examine a single protein graph
4. Visualize protein structure
5. Analyze atom types and residues
6. Feature analysis
7. Load using the PyTorch Geometric dataset
8. Summary and next steps

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from pathlib import Path

import torch
from torch_geometric.data import Data

# Plot styling: seaborn 'whitegrid' theme and a wide default figure size
# that applies to every figure which does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Autoreload (mode 2) so edits to imported modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

## 1. Load Dataset Overview

In [None]:
# Read the CSV listing the sampled proteins into a DataFrame
protein_list_csv = '../dataset/protein_sample_5000.csv'
protein_df = pd.read_csv(protein_list_csv)

n_proteins = len(protein_df)
print(f"Number of proteins: {n_proteins}")
print("\nFirst 10 proteins:")
# Bare final expression: rendered as the cell's rich output
protein_df.head(10)

In [None]:
# Summary statistics for the per-protein atom counts
atom_counts = protein_df['atom_count']
print("Atom Count Statistics:")
print(atom_counts.describe())

# Histogram (left) and box plot (right) of the same column
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

hist_ax.hist(atom_counts, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set(
    xlabel='Number of Atoms',
    ylabel='Frequency',
    title='Distribution of Protein Sizes',
)
hist_ax.grid(True, alpha=0.3)

box_ax.boxplot(atom_counts)
box_ax.set(
    ylabel='Number of Atoms',
    title='Protein Size Box Plot',
)
box_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Load Depth Indexes (Ground Truth)

In [None]:
# Load the depth indexes (ground truth), a mapping keyed by protein ID.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load depth_indexes.pkl when it comes from a trusted source.
with open('../dataset/depth_indexes.pkl', 'rb') as f:
    depth_indexes = pickle.load(f)

print(f"Number of proteins with depth data: {len(depth_indexes)}")

if depth_indexes:
    # Peek at one entry without materialising the full key list.
    first_pdb_id = next(iter(depth_indexes))
    print(f"\nFirst protein ID: {first_pdb_id}")
    # np.shape also handles list-like entries that have no .shape attribute.
    print(f"Depth values shape: {np.shape(depth_indexes[first_pdb_id])}")
else:
    # An empty mapping is reported instead of raising IndexError.
    print("\nNo depth entries found.")

In [None]:
# Analyze depth value distributions across all proteins.
# Only ndarray entries are pooled; other entries are counted and reported
# rather than dropped silently. The comprehension also avoids leaking a
# global loop variable named `depths`.
depth_arrays = [
    values.ravel()
    for values in depth_indexes.values()
    if isinstance(values, np.ndarray)
]
num_skipped = len(depth_indexes) - len(depth_arrays)
if num_skipped:
    print(f"Skipped {num_skipped} entries that are not NumPy arrays")

# Concatenate once instead of extending a Python list element by element.
all_depths = np.concatenate(depth_arrays) if depth_arrays else np.array([])

if all_depths.size == 0:
    # min()/max() raise on an empty array, so stop with a clear message.
    print("No depth values available to analyze.")
else:
    print("Depth Value Statistics:")
    print(f"Mean: {all_depths.mean():.4f}")
    print(f"Std: {all_depths.std():.4f}")
    print(f"Min: {all_depths.min():.4f}")
    print(f"Max: {all_depths.max():.4f}")

    # Histogram (left) and box plot (right) of the pooled values
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    hist_ax.hist(all_depths, bins=100, edgecolor='black', alpha=0.7)
    hist_ax.set_xlabel('Atom Exposure/Depth')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution of Atom Exposure Values')
    hist_ax.grid(True, alpha=0.3)

    box_ax.boxplot(all_depths)
    box_ax.set_ylabel('Atom Exposure/Depth')
    box_ax.set_title('Atom Exposure Box Plot')
    box_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 3. Examine Single Protein Graph

In [None]:
# Use the first protein in the list as the worked example
sample_pdb = protein_df['pdb_id'].iloc[0]
print(f"Examining protein: {sample_pdb}")

# Directory from which this protein's node and edge CSVs are read
protein_dir = f'../dataset/sadic_data/{sample_pdb}'

# Node table; the first CSV column becomes the index
nodes_csv = f'{protein_dir}/{sample_pdb}__graphein__ATOM_nodes.csv'
nodes_df = pd.read_csv(nodes_csv, index_col=0)

n_atoms, n_features = nodes_df.shape
print(f"\nNumber of atoms: {n_atoms}")
print(f"Number of features: {n_features}")

# Preview; the bare final expression is rendered as the cell output
print("\nFirst 5 atoms:")
nodes_df.head()

In [None]:
# Print every node feature column with a 1-based position
print("Node Feature Columns:")
for position, column_name in enumerate(nodes_df.columns, start=1):
    print(f"{position}. {column_name}")

In [None]:
# Edge table for the same protein; the first CSV column becomes the index
edges_csv = f'{protein_dir}/{sample_pdb}__graphein__ATOM_edges.csv'
edges_df = pd.read_csv(edges_csv, index_col=0)

n_edges = len(edges_df)
print(f"Number of edges: {n_edges}")

# Preview; the bare final expression is rendered as the cell output
print("\nFirst 10 edges:")
edges_df.head(10)

In [None]:
# Count edges per 'kind' once and reuse the result for the table and chart
kind_counts = edges_df['kind'].value_counts()
print("Edge Type Distribution:")
print(kind_counts)

# Bar chart of the same counts, drawn on the newly created current figure
plt.figure(figsize=(10, 5))
kind_counts.plot(kind='bar')
plt.title(f'Edge Types in Protein {sample_pdb}')
plt.xlabel('Edge Type')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 4. Visualize Protein Structure

In [None]:
# Extract 3D coordinates, one row per atom
coords = nodes_df[['x_coord', 'y_coord', 'z_coord']].to_numpy()

# Get depth values for this protein, flattened to one value per atom.
if sample_pdb in depth_indexes:
    depths = np.asarray(depth_indexes[sample_pdb]).ravel()
else:
    # Make the fallback visible: an all-zero colouring would otherwise look
    # like real (uniform) exposure data in the figure.
    print(f"Warning: no depth data for {sample_pdb}; coloring all atoms with 0")
    depths = np.zeros(len(nodes_df))

# Fail early with a readable message instead of an opaque matplotlib error
# when the depth array and the node table disagree on the number of atoms.
if len(depths) != len(coords):
    raise ValueError(
        f"Depth array for {sample_pdb} has {len(depths)} values "
        f"but the node table has {len(coords)} atoms"
    )

# 3D scatter plot, one point per atom, coloured by depth
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(
    coords[:, 0],
    coords[:, 1],
    coords[:, 2],
    c=depths,
    cmap='viridis',
    s=20,
    alpha=0.6
)

ax.set_xlabel('X Coordinate')
ax.set_ylabel('Y Coordinate')
ax.set_zlabel('Z Coordinate')
ax.set_title(f'Protein {sample_pdb} - Atoms Colored by Exposure')

plt.colorbar(scatter, ax=ax, label='Atom Exposure/Depth')
plt.tight_layout()
plt.show()

## 5. Analyze Atom Types and Residues

In [None]:
# Frequency tables for the three categorical node columns
atom_type_counts = nodes_df['atom_type'].value_counts()
residue_counts = nodes_df['residue_name'].value_counts()
element_counts = nodes_df['element_symbol'].value_counts()

# Ten most frequent atom types
print("Top 10 Atom Types:")
print(atom_type_counts.head(10))

# Ten most frequent residue names
print("\nTop 10 Residue Types:")
print(residue_counts.head(10))

# Full element table (not truncated)
print("\nElement Distribution:")
print(element_counts)

In [None]:
# One bar chart per categorical column, drawn side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (column, how many top categories to keep or None for all, x label, title)
panel_specs = [
    ('atom_type', 15, 'Atom Type', 'Top 15 Atom Types'),
    ('residue_name', 15, 'Residue Type', 'Top 15 Residue Types'),
    ('element_symbol', None, 'Element', 'Element Distribution'),
]

for panel_ax, (column, top_n, x_label, panel_title) in zip(axes, panel_specs):
    counts = nodes_df[column].value_counts()
    if top_n is not None:
        counts = counts.head(top_n)
    counts.plot(kind='bar', ax=panel_ax)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Count')
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 6. Feature Analysis

In [None]:
# Columns with a numeric dtype; reused by the correlation cell below
numeric_cols = nodes_df.select_dtypes(include=[np.number]).columns
print(f"Number of numeric features: {len(numeric_cols)}")

# Null count per numeric column, keeping only columns that have any nulls
null_counts = nodes_df[numeric_cols].isnull().sum()
missing = null_counts[null_counts > 0]
print(f"\nFeatures with missing values: {len(missing)}")
if not missing.empty:
    print(missing)

In [None]:
# Correlation of features with depth
if sample_pdb in depth_indexes:
    feature_df = nodes_df[numeric_cols].copy()
    # Read the target straight from depth_indexes so this cell does not depend
    # on whichever cell last assigned the global `depths`.
    feature_df['depth'] = np.asarray(depth_indexes[sample_pdb]).ravel()

    # Correlation of every numeric feature with depth. The target's
    # self-correlation is dropped explicitly, and undefined (NaN) entries
    # are removed so they cannot occupy slots in the ranking.
    correlations = feature_df.corr()['depth'].drop('depth').dropna()

    # Rank by magnitude: sorting by signed value would push strong negative
    # correlations to the bottom and keep them out of the "top" lists.
    strongest_first = np.argsort(-correlations.abs().to_numpy(), kind='stable')
    correlations = correlations.iloc[strongest_first]

    if correlations.empty:
        print("No numeric feature has a defined correlation with atom exposure.")
    else:
        print("Top 10 features correlated with atom exposure:")
        print(correlations.head(10))

        # Horizontal bars keep their sign, so direction stays visible
        plt.figure(figsize=(10, 8))
        correlations.head(20).plot(kind='barh')
        plt.xlabel('Correlation with Atom Exposure')
        plt.ylabel('Feature')
        plt.title('Top 20 Features Correlated with Atom Exposure')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

## 7. Load Using PyTorch Geometric Dataset

In [None]:
from src.data.dataset import ProteinAtomDataset

# Instantiate the project's dataset class for the 'train' split
dataset = ProteinAtomDataset(root='../dataset/', split='train')
print(f"Dataset size: {len(dataset)}")

# Inspect the first graph; each field is read lazily, in display order
sample = dataset[0]
sample_fields = [
    ("PDB ID", lambda graph: graph.pdb_id),
    ("Number of atoms", lambda graph: graph.num_nodes),
    ("Number of edges", lambda graph: graph.num_edges),
    ("Node features shape", lambda graph: graph.x.shape),
    ("Edge features shape", lambda graph: graph.edge_attr.shape),
    ("Target shape", lambda graph: graph.y.shape),
]

print("\nSample graph:")
for field_label, read_field in sample_fields:
    print(f"  {field_label}: {read_field(sample)}")

In [None]:
def _graph_summary(graph):
    """Return one row of per-graph statistics for the summary table."""
    return {
        'pdb_id': graph.pdb_id,
        'num_nodes': graph.num_nodes,
        'num_edges': graph.num_edges,
        'avg_degree': graph.num_edges / graph.num_nodes,
        'num_features': graph.x.shape[1],
        'target_mean': graph.y.mean().item(),
        'target_std': graph.y.std().item()
    }


# Summarise at most the first ten graphs of the dataset
num_samples = min(10, len(dataset))
sample_stats = [_graph_summary(dataset[idx]) for idx in range(num_samples)]

# Build the table once from the list of records
stats_df = pd.DataFrame(sample_stats)
print("\nSample Statistics:")
stats_df

## 8. Summary and Next Steps

### Key Findings:
- Dataset contains 5,000 protein structures
- Protein sizes range from hundreds to thousands of atoms
- Rich feature set with 80+ features per atom
- Multiple edge types representing different bond types
- Atom exposure values show varied distribution

### Next Steps:
1. Implement data preprocessing and normalization
2. Train baseline GNN model
3. Experiment with different architectures (GCN, GAT, GIN)
4. Analyze model predictions and errors
5. Optimize hyperparameters