# Codon Usage Bias Analysis Across Species

## Introduction and Setup

**Codon usage bias** refers to the phenomenon where synonymous codons (codons encoding the same amino acid) are used with different frequencies. This bias varies across:
- Different species
- Different genes within the same organism
- Highly vs. lowly expressed genes

Why is this important?
1. Gene expression optimization
2. Heterologous protein expression
3. Evolutionary analysis
4. Gene origin prediction
5. Synthetic biology applications

In [None]:
# Library
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# BioPython
from Bio import SeqIO

In [None]:
# Global plotting configuration for the notebook: select the matplotlib
# style sheet and the default seaborn colour palette used by later figures.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

## Codons

### Codon Table

In [1]:
# Standard genetic code: maps each DNA codon to its one-letter amino acid
# ('*' marks a stop codon).
# Codons are enumerated with the bases in T, C, A, G order at every position
# (first base varies slowest), and paired with the amino-acid string written
# in that same order, 16 codons per line (one line per first base).
CODON_TABLE = dict(zip(
    (
        first + second + third
        for first in 'TCAG'
        for second in 'TCAG'
        for third in 'TCAG'
    ),
    'FFLLSSSSYY**CC*W'      # T..
    'LLLLPPPPHHQQRRRR'      # C..
    'IIIMTTTTNNKKSSRR'      # A..
    'VVVVAAAADDEEGGGG',     # G..
))

In [2]:
from collections import defaultdict

# Group codons by amino acids
def get_synonymous_codons():
    """
    Group the codons of CODON_TABLE by the amino acid they encode.

    Stop codons ('*') are left out. Amino acids, and the codons within
    each amino acid, appear in the order they are met in CODON_TABLE.

    Returns:
    --------
    dict: one-letter amino acid -> list of its synonymous codons
    """
    grouped = {}
    for triplet, residue in CODON_TABLE.items():
        if residue == '*':
            # Stop codons do not encode an amino acid
            continue
        grouped.setdefault(residue, []).append(triplet)
    return grouped

In [4]:
# Build the amino acid -> synonymous codons lookup once; the RSCU, ENC and
# CAI functions below read this module-level mapping.
SYNONYMOUS_CODONS = get_synonymous_codons()
# Bare expression so the notebook displays the mapping as the cell output
SYNONYMOUS_CODONS

{'F': ['TTT', 'TTC'],
 'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
 'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
 'Y': ['TAT', 'TAC'],
 'C': ['TGT', 'TGC'],
 'W': ['TGG'],
 'P': ['CCT', 'CCC', 'CCA', 'CCG'],
 'H': ['CAT', 'CAC'],
 'Q': ['CAA', 'CAG'],
 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
 'I': ['ATT', 'ATC', 'ATA'],
 'M': ['ATG'],
 'T': ['ACT', 'ACC', 'ACA', 'ACG'],
 'N': ['AAT', 'AAC'],
 'K': ['AAA', 'AAG'],
 'V': ['GTT', 'GTC', 'GTA', 'GTG'],
 'A': ['GCT', 'GCC', 'GCA', 'GCG'],
 'D': ['GAT', 'GAC'],
 'E': ['GAA', 'GAG'],
 'G': ['GGT', 'GGC', 'GGA', 'GGG']}

### Count Codons

In [5]:
from collections import Counter

# Codon Counting Function
def count_codons(sequence):
    """
    Count sense-codon occurrences in a coding sequence.

    The sequence is upper-cased and RNA 'U' is converted to DNA 'T'. It is
    read in frame from position 0; an incomplete codon at the 3' end (one
    or two trailing bases) is ignored. Triplets that are not in CODON_TABLE
    (e.g. containing 'N') and stop codons are not counted.

    Parameters:
    -----------
    sequence: str
        DNA or RNA coding sequence

    Returns:
    --------
    Counter: codon -> number of occurrences
    """
    sequence = sequence.upper().replace('U', 'T')

    # Drop only the trailing partial codon: keep the longest prefix whose
    # length is a multiple of 3 (not the first `len % 3` characters).
    usable_length = len(sequence) - len(sequence) % 3

    # Split into in-frame triplets
    codons = (sequence[i:i + 3] for i in range(0, usable_length, 3))

    # Unknown triplets default to '*', so they are skipped like stop codons
    return Counter(c for c in codons if CODON_TABLE.get(c, '*') != '*')


### Calculate Relative Synonymous Codon Usage (RSCU)

In [6]:
def calculate_rscu(codon_counts):
    """
    Compute Relative Synonymous Codon Usage (RSCU) for every codon.

    RSCU = observed count of a codon / count expected if all synonymous
    codons of its amino acid were used equally.
    RSCU = 1.0: no bias
    RSCU > 1.0: codon used more often than expected
    RSCU < 1.0: codon used less often than expected

    Amino acids that do not occur in `codon_counts` are skipped, so their
    codons are absent from the result.

    Parameters:
    -----------
    codon_counts: Counter
        Codon Frequencies

    Returns:
    --------
    dict: RSCU values for each codon
    """
    rscu_values = {}

    for family in SYNONYMOUS_CODONS.values():
        # Observed counts of each codon in this synonymous family
        family_counts = [codon_counts.get(codon, 0) for codon in family]
        family_total = sum(family_counts)

        if family_total == 0:
            continue

        # Count each codon would have under perfectly uniform usage
        expected = family_total / len(family)
        for codon, observed in zip(family, family_counts):
            if expected > 0:
                rscu_values[codon] = observed / expected
            else:
                rscu_values[codon] = 0

    return rscu_values

### Calculate Effective Number of Codons (ENC)

In [8]:
def calculate_enc(codon_counts):
    """
    Calculate Effective Number of Codons (ENC)

    ENC measures the overall codon usage bias in a gene
    Range: 20 (maximum bias) to 61 (no bias)

    ENC = 2 + 9/F2 + 1/F3 + 5/F4 + 3/F6, where Fk is the mean homozygosity
    (sum of squared codon proportions) over the amino acids with k
    synonymous codons. Amino acids that do not occur in the gene are left
    out of the class mean rather than counted as homozygosity 0, which
    would otherwise inflate ENC.

    If isoleucine (the only three-fold amino acid) is absent, F3 is
    estimated as the mean of F2 and F4. If any other degeneracy class has
    no observed amino acid, ENC cannot be computed and 61 is returned.

    NOTE(review): homozygosity here is the plain sum of squared
    proportions, without a small-sample correction — confirm whether a
    corrected estimator is wanted for short genes.

    Parameters:
    -----------
    codon_counts: Counter
        Codon Frequencies

    Returns:
    --------
    float: ENC Value
    """

    # Group amino acids by degeneracy
    two_fold = ['D', 'E', 'F', 'H', 'K', 'N', 'Q', 'Y', 'C']
    three_fold = ['I']
    four_fold = ['A', 'G', 'P', 'T', 'V']
    six_fold = ['L', 'R', 'S']

    def homozygosity(codons):
        """Homozygosity of one synonymous family, or None if it is unused"""
        total = sum(codon_counts.get(c, 0) for c in codons)
        if total == 0:
            return None
        return sum((codon_counts.get(c, 0) / total) ** 2 for c in codons)

    def class_mean(amino_acids):
        """Mean homozygosity over the observed amino acids of a class"""
        values = [
            homozygosity(SYNONYMOUS_CODONS[aa])
            for aa in amino_acids
            if aa in SYNONYMOUS_CODONS
        ]
        observed = [v for v in values if v is not None]
        return np.mean(observed) if observed else None

    # Calculate F values for each degeneracy class
    F2 = class_mean(two_fold)
    F3 = class_mean(three_fold)
    F4 = class_mean(four_fold)
    F6 = class_mean(six_fold)

    # Isoleucine missing: estimate F3 from the neighbouring classes
    if F3 is None and F2 is not None and F4 is not None:
        F3 = (F2 + F4) / 2

    # A whole class without data: fall back to the "no bias" value.
    # Observed homozygosities are always > 0, so the divisions are safe.
    if any(f is None for f in (F2, F3, F4, F6)):
        return 61

    # Calculate ENC
    enc = 2 + 9 / F2 + 1 / F3 + 5 / F4 + 3 / F6

    return min(enc, 61)  # Cap at 61

### Calculate Codon Adaptation Index (CAI)

In [9]:
def calculate_cai(codon_counts, reference_counts):
    """
    Calculate Codon Adaptation Index (CAI)

    CAI measures how similar a gene's codon usage is to highly expressed genes
    Range: 0 to 1 (higher = more similar to reference set)

    Each codon gets a relative adaptiveness w = (reference count of the
    codon) / (reference count of the most used synonymous codon). CAI is
    the geometric mean of w over all codon occurrences in the gene,
    computed as exp of the count-weighted mean of log(w), so no
    per-occurrence list is built and non-integer counts are accepted.

    Codons whose amino acid never occurs in the reference set have no w
    value and are ignored. Single-codon amino acids (M, W) are included
    with w = 1 when present in the reference set.

    Parameters:
    -----------
    codon_counts: Counter
        Codon Frequencies in Gene
    reference_counts: Counter
        Codon Frequencies in reference set

    Returns:
    --------
    float: CAI Value (0 if no codon of the gene can be scored)
    """

    # Calculate relative adaptiveness (w) for each codon
    w_values = {}
    for codons in SYNONYMOUS_CODONS.values():
        max_usage = max(reference_counts.get(c, 0) for c in codons)
        if max_usage == 0:
            continue
        for codon in codons:
            w_values[codon] = reference_counts.get(codon, 0) / max_usage

    # Pair each scorable codon's w with its number of occurrences
    scored = [
        (w_values[codon], count)
        for codon, count in codon_counts.items()
        if codon in w_values and count > 0
    ]

    if not scored:
        return 0

    weights, occurrences = zip(*scored)

    # The small offset keeps log() finite for codons with w == 0
    log_w = np.log(np.array(weights) + 1e-10)

    # Geometric mean of w over all occurrences
    cai = np.exp(np.average(log_w, weights=occurrences))
    return cai

## Multi-Species Analysis