In [None]:
"""
1. Setup and Configuration

Load required libraries and set working directory.
"""
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

# Configuration: expand the home-relative working directory once, then
# derive every input path from it.
workdir = os.path.expanduser("~/Desktop/work/protein_linkers/input_2")
elm_path = os.path.join(workdir, "elm", "elm_results.json")
fasta_path = os.path.join(workdir, "proteins.fa")

In [None]:
"""
2. Load Linker + Domain Data

Load the three clustered linker dictionaries (short, medium, long) that were
generated by the clustering analysis, plus the filtered proteins dictionary
with all domains and linkers.
"""

def _load_json(json_path):
    """
    Read one JSON file and return the decoded object.

    The file is opened as UTF-8 explicitly so that decoding does not depend
    on the platform's default locale encoding.

    Parameters:
    -----------
    json_path : str
        Path to the JSON file

    Returns:
    --------
    object
        Whatever json.load() produces for the file content
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Input files, all located directly inside the working directory
short_linkers_path = os.path.join(workdir, 'short_linkers.json')
medium_linkers_path = os.path.join(workdir, 'medium_linkers.json')
long_linkers_path = os.path.join(workdir, 'long_linkers.json')
filtered_proteins_path = os.path.join(workdir, 'filtered_proteins.json')

# Load short, medium and long linkers
short_linkers = _load_json(short_linkers_path)
medium_linkers = _load_json(medium_linkers_path)
long_linkers = _load_json(long_linkers_path)

# Load filtered proteins (contains all domains and linkers after outlier removal)
filtered_proteins = _load_json(filtered_proteins_path)


In [None]:
"""
3. Load Protein Sequences

Load the FASTA file to extract linker sequences for ELM analysis.
"""

def load_fasta(fasta_path):
    """
    Parse a FASTA file into a mapping of header text to sequence.

    Every line is stripped of surrounding whitespace. A line starting with
    '>' opens a new record whose key is the rest of that line; all following
    non-header lines are concatenated to form the record's sequence.

    Parameters:
    -----------
    fasta_path : str
        Path to FASTA file

    Returns:
    --------
    dict
        Dictionary with the header text (everything after '>') as keys and
        the joined sequence lines as values
    """
    records = {}
    header = None
    chunks = []

    with open(fasta_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            # Sequence (or blank) line: accumulate and move on
            if not text.startswith('>'):
                chunks.append(text)
                continue

            # Header line: store the record collected so far, if it has a
            # non-empty header, before starting the next one
            if header:
                records[header] = ''.join(chunks)
            header, chunks = text[1:], []

    # The final record has no following header to trigger its storage
    if header:
        records[header] = ''.join(chunks)

    return records


# Load sequences
sequences_dict = load_fasta(fasta_path)

# Take the reported file name from fasta_path so the message follows the
# configuration; the check mark replaces a mis-encoded character sequence.
print(f"✓ Loaded {len(sequences_dict)} protein sequences from {os.path.basename(fasta_path)}")

In [34]:
"""
elm_dict = {
    'domains': {protein_domain_id: region_info, ...},
    'n-terminus': {protein_linker_id: region_info, ...},
    'c-terminus': {protein_linker_id: region_info, ...},
    'inner': {protein_linker_id: region_info, ...}
}

where region_info = {
    'region_id': str,           # e.g., 'Q9Y6K9_linker_2_inner'
    'region_type': str,         # 'domain', 'n-terminus', 'c-terminus', or 'inner'
    'sequence_length': int,     # length of the region in amino acids
    'motifs': {
        'MOTIF_NAME': {
            'start': int,       # start position within the region (1-based)
            'end': int          # end position within the region (1-based)
        },
        ...
    },
    'motif_count': int          # total number of motifs found in this region
}
"""

# Read the ELM results as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(elm_path, 'r', encoding='utf-8') as f:
    elm_dict = json.load(f)

print("ELM DICTIONARY SUMMARY")
print("=" * 60)
for category, regions in elm_dict.items():
    print(f"\n{category.upper()}:\n")
    print(f"  Number of regions: {len(regions)}\n")

    # Category total is the sum of the stored per-region 'motif_count' values
    total_motifs = sum(region_info['motif_count'] for region_info in regions.values())
    print(f"  Total motif instances: {total_motifs}\n")

    # Show the first region (in file order) as an example; an empty category
    # has nothing to show
    if regions:
        example_id, example = next(iter(regions.items()))
        print(f"  Example: {example_id}\n")
        print(f"    - Length: {example['sequence_length']} aa\n")
        print(f"    - Motif count: {example['motif_count']}\n")
        if example['motifs']:
            first_motif = next(iter(example['motifs']))
            print(f"    - First motif: {first_motif}\n")


ELM DICTIONARY SUMMARY

DOMAINS:

  Number of regions: 591

  Total motif instances: 19094

  Example: P69905_domain_1_Hemoglobin and related oxygen transporters

    - Length: 142 aa

    - Motif count: 24

    - First motif: CLV_PCSK_SKI1_1


N-TERMINUS:

  Number of regions: 30

  Total motif instances: 332

  Example: Q9H3U1_linker_1_n-terminus

    - Length: 17 aa

    - Motif count: 9

    - First motif: CLV_PCSK_SKI1_1


C-TERMINUS:

  Number of regions: 32

  Total motif instances: 177

  Example: P00533_linker_2_c-terminus

    - Length: 6 aa

    - Motif count: 3

    - First motif: DEG_Cend_KLHDC2_1


INNER:

  Number of regions: 85

  Total motif instances: 1297

  Example: Q9Y6K9_linker_2_inner

    - Length: 16 aa

    - Motif count: 3

    - First motif: CLV_PCSK_KEX2_1

