# RNA-seq Analysis Pipeline: *Picosynechococcus sp.* PCC 11901

**Project**: Transcriptomic analysis of PCC 11901 under various nutrient, environmental, and circadian conditions

**Experimental Design**:
- 20 conditions × 3 biological replicates = 60 samples
- Group 1: Nutrient conditions (Control: U4,5,6)
- Group 2: Environmental conditions (Control: U1,2,3)
- Group 3: Circadian rhythm (T1-T4 timepoints)

**Pipeline Overview**:
1. Quality Control (FastQC + MultiQC)
2. Trimming (fastp)
3. Lane Merging
4. rRNA Removal (SortMeRNA)
5. Quantification (Salmon)
6. Alignment for Visualization (Bowtie2 → bigWig)
7. Differential Expression (DESeq2 via rpy2)
8. Visualization

---

## 0. Setup and Configuration

In [31]:
# === Standard Library ===
import os
import subprocess
import glob
import re
from pathlib import Path
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# === Fix for rpy2 on Apple Silicon ===
# Force R_HOME to conda env's R before importing rpy2
# This prevents conflicts with System R
if 'CONDA_PREFIX' in os.environ:
    os.environ['R_HOME'] = os.path.join(os.environ['CONDA_PREFIX'], 'lib', 'R')

# === Data Science ===
import numpy as np
import pandas as pd
from scipy import stats

# === Visualization ===
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# === Pathway Enrichment ===
import gseapy as gp

# === Progress & Parallelization ===
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

# === R Integration ===
# NOTE: If you see an architecture mismatch error (ARM64 vs x86_64),
# the conda environment has mixed architectures. Recreate it with:
#   conda deactivate
#   conda env remove -n pcc11901_rnaseq
#   CONDA_SUBDIR=osx-64 conda env create -f environment.yml
#   conda activate pcc11901_rnaseq
try:
    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    from rpy2.robjects.conversion import localconverter
    RPY2_AVAILABLE = True
    print("[OK] rpy2 imported successfully")
except ImportError as e:
    RPY2_AVAILABLE = False
    print(f"[WARNING] rpy2 import failed: {e}")
    print("R-based analysis (DESeq2) will not be available.")
    print("If you need rpy2, recreate the conda environment with consistent architecture.")

# Set plotting defaults
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('colorblind')
pd.set_option('display.max_columns', 50)

print("\nPython packages imported successfully!")

[OK] rpy2 imported successfully

Python packages imported successfully!


In [32]:
# === Project Configuration ===

# Project root: the directory the notebook was launched from (adjust if needed)
BASE_DIR = Path.cwd()

# Logical name -> folder under BASE_DIR for every input and output location.
# 'data' and 'genome' are inputs; the numbered folders are pipeline outputs.
DIRS = {
    key: BASE_DIR / folder
    for key, folder in [
        ('data', 'Data'),
        ('genome', 'PCC_11901_annotated genome'),
        ('qc', '01_QC'),
        ('trimmed', '02_trimmed'),
        ('merged', '03_merged'),
        ('rrna_filtered', '04_rRNA_filtered'),
        ('salmon', '05_salmon'),
        ('alignment', '06_alignment'),
        ('deseq2', '07_deseq2'),
        ('functional', '08_functional'),
        ('figures', '09_figures'),
        ('logs', 'logs'),
    ]
}

# Reference files (all live in the genome directory)
GENOME_FASTA = DIRS['genome'] / 'GCF_005577135.1_ASM557713v1_genomic.fna'
GTF_FILE = DIRS['genome'] / 'genomic.gtf'
TRANSCRIPTOME_FASTA = DIRS['genome'] / 'transcriptome.fa'  # Produced by the gffread step

# Hardware settings (M2 Max optimized)
N_THREADS = 8          # General parallelization
SORTMERNA_THREADS = 4  # Memory-intensive (~15GB/thread)
SALMON_THREADS = 10

# Create every output directory; the two input directories are never created here
for name, path in DIRS.items():
    if name in ('data', 'genome'):
        continue
    path.mkdir(parents=True, exist_ok=True)

# Fixed subdirectories expected by the later pipeline steps
for parent, children in [
    ('qc', ['fastqc_raw', 'fastqc_trimmed', 'multiqc_reports']),
    ('rrna_filtered', ['non_rRNA', 'rRNA']),
    ('salmon', ['index', 'quants']),
    ('alignment', ['bowtie2_index', 'bam', 'bigwig']),
    ('figures', ['qc_plots', 'volcano_plots', 'heatmaps']),
]:
    for child in children:
        (DIRS[parent] / child).mkdir(exist_ok=True)

print(f"Base directory: {BASE_DIR}")
print("Output directories created.")

Base directory: /Users/felix/Library/CloudStorage/OneDrive-SharedLibraries-MacquarieUniversity/Australian Genome Foundry - AWS cloud infrastructure/11_Esther_Cyano_transcriptomics
Output directories created.


In [33]:
# === Publication-Quality Plot Settings ===
# Layout defaults following Nature/Science style guidelines:
# white background, black axis lines, outside ticks, no grid.

PUBLICATION_LAYOUT = {
    'font': {'family': "Arial, Helvetica, sans-serif", 'size': 12, 'color': "black"},
    'title': {
        'font': {'size': 14, 'family': "Arial, Helvetica, sans-serif"},
        'x': 0.5,
        'xanchor': 'center',
    },
    'plot_bgcolor': 'white',
    'paper_bgcolor': 'white',
    # Both axes share one style; the literal is evaluated once per axis so the
    # two entries are independent dict objects.
    **{
        axis: {
            'showline': True, 'linewidth': 1.5, 'linecolor': 'black',
            'showgrid': False, 'zeroline': False,
            'ticks': 'outside', 'tickwidth': 1.5, 'ticklen': 5,
            'title_font': {'size': 12}, 'tickfont': {'size': 10},
        }
        for axis in ('xaxis', 'yaxis')
    },
    'legend': {
        'font': {'size': 10},
        'bgcolor': 'rgba(255,255,255,0.8)',
        'bordercolor': 'black',
        'borderwidth': 1,
    },
    'margin': {'l': 60, 'r': 20, 't': 50, 'b': 60},
}

# Colorblind-safe palettes (Nature-recommended)
COLORS_CATEGORICAL = [
    '#0072B2', '#D55E00', '#009E73',
    '#CC79A7', '#F0E442', '#56B4E9',
]
COLORS_DIVERGING = 'RdBu_r'
COLORS_UPDOWN = {
    'Up': '#D55E00',
    'Down': '#0072B2',
    'NS': '#999999',
}

print("Publication styling defined.")

Publication styling defined.


In [34]:
# === Sample Metadata ===

# Experimental design, one row per condition: (group, condition, first sample number).
# Each condition has three biological replicates with consecutive sample numbers,
# e.g. (1, 'Control_MAD', 4) expands to U4, U5, U6 as replicates 1, 2, 3.
_DESIGN_TABLE = [
    # Group 1 - Nutrients (Control: U4,5,6)
    (1, 'Control_MAD', 4),
    (1, 'Glycerol_0.75pct', 46),
    (1, 'Low_Nitrogen', 22),
    (1, 'High_Nitrogen', 25),
    (1, 'Low_Phosphate', 28),
    (1, 'High_Phosphate', 31),
    (1, 'Ammonia', 40),
    (1, 'Urea', 43),

    # Group 2 - Environmental (Control: U1,2,3)
    (2, 'Control_MAD', 1),
    (2, 'High_NaCl_9pct', 34),
    (2, 'H2O2_0.005pct', 37),
    (2, 'Atmospheric_CO2', 7),
    (2, 'High_CO2_8pct', 10),
    (2, 'High_Temp_38C', 13),
    (2, 'Low_Light_15uE', 16),
    (2, 'High_Light', 19),

    # Group 3 - Circadian
    (3, 'T1_Light', 49),
    (3, 'T2_Dark', 52),
    (3, 'T3_Light', 55),
    (3, 'T4_Dark', 58),
]

# Expand the table to one entry per sample, keyed by sample ID ('U<number>')
sample_info = {
    f"U{first_num + offset}": {
        'group': group,
        'condition': condition,
        'replicate': offset + 1,
    }
    for group, condition, first_num in _DESIGN_TABLE
    for offset in range(3)
}

# One row per sample, with the sample ID as a regular column
metadata = (
    pd.DataFrame.from_dict(sample_info, orient='index')
    .rename_axis('sample_id')
    .reset_index()
)

# Order rows numerically (U1, U2, ... U60) rather than by design-table order
metadata['sample_num'] = (
    metadata['sample_id'].str.extract(r'U(\d+)', expand=False).astype(int)
)
metadata = metadata.sort_values('sample_num').reset_index(drop=True)

print(f"Total samples: {len(metadata)}")
print("\nGroup summary:")
print(metadata.groupby('group')['condition'].nunique())
metadata.head(10)

Total samples: 60

Group summary:
group
1    8
2    8
3    4
Name: condition, dtype: int64


Unnamed: 0,sample_id,group,condition,replicate,sample_num
0,U1,2,Control_MAD,1,1
1,U2,2,Control_MAD,2,2
2,U3,2,Control_MAD,3,3
3,U4,1,Control_MAD,1,4
4,U5,1,Control_MAD,2,5
5,U6,1,Control_MAD,3,6
6,U7,2,Atmospheric_CO2,1,7
7,U8,2,Atmospheric_CO2,2,8
8,U9,2,Atmospheric_CO2,3,9
9,U10,2,High_CO2_8pct,1,10


In [35]:
# === Helper Functions ===

def run_command(cmd, description="Running command", log_file=None, check=True):
    """
    Execute a shell command, optionally sending its output to a log file.

    Parameters:
    -----------
    cmd : str
        Command line, interpreted by the shell
    description : str
        Label printed before the command starts
    log_file : Path, optional
        If given, the file is overwritten with the command's stdout and
        stderr (interleaved). If omitted, both streams are captured as text
        on the returned object instead.
    check : bool
        When True, a non-zero exit status raises subprocess.CalledProcessError

    Returns:
    --------
    subprocess.CompletedProcess
    """
    print(f"  {description}...")

    # No log requested: capture both streams in memory as text
    if not log_file:
        return subprocess.run(
            cmd, shell=True, capture_output=True, text=True, check=check
        )

    # Log requested: stream stdout and stderr together into the file
    with open(log_file, 'w') as log_handle:
        return subprocess.run(
            cmd,
            shell=True,
            stdout=log_handle,
            stderr=subprocess.STDOUT,
            check=check,
        )


def get_fastq_files(sample_id, directory, pattern='merged'):
    """
    Locate the FASTQ files belonging to one sample.

    Parameters:
    -----------
    sample_id : str
        Sample ID (e.g., 'U1')
    directory : Path
        Directory to look in
    pattern : str
        'merged' builds the two expected merged file paths without touching
        the filesystem; any other value (e.g. 'raw') globs the directory for
        lane-split files named '<sample_id>-*.fastq.gz'

    Returns:
    --------
    dict with 'R1' and 'R2' keys. For 'merged' each value is a single Path;
    otherwise each value is a sorted list of matching Paths.
    """
    if pattern != 'merged':
        # Raw files (4 per sample); the '-' after the ID keeps 'U1' from matching 'U10-...'
        lane_files = list(directory.glob(f"{sample_id}-*.fastq.gz"))
        return {
            'R1': sorted(f for f in lane_files if '_R1_' in f.name),
            'R2': sorted(f for f in lane_files if '_R2_' in f.name),
        }

    return {
        read: directory / f"{sample_id}_{read}.fastq.gz"
        for read in ('R1', 'R2')
    }


def check_tool(tool_name):
    """
    Check if a tool is available in PATH.

    Prints an '[OK]' line with the resolved location, or a '[MISSING]' line.

    The lookup uses shutil.which instead of running `which` through a shell,
    so the tool name is never interpreted as shell syntax (a name containing
    ';' or spaces can no longer produce a false '[OK]').

    Parameters:
    -----------
    tool_name : str
        Executable name to look up (e.g., 'salmon')

    Returns:
    --------
    bool : True if the executable was found, False otherwise
    """
    import shutil  # local import keeps this helper self-contained

    location = shutil.which(tool_name)
    if location is None:
        print(f"  [MISSING] {tool_name}")
        return False

    print(f"  [OK] {tool_name}: {location}")
    return True


print("Helper functions defined.")

Helper functions defined.


In [21]:
# === Verify Tool Installation ===
# Looks up every external command-line tool the pipeline calls and reports
# which ones are not on PATH.

print("Checking required tools...\n")

tools = [
    'fastqc', 'multiqc', 'fastp',  # QC
    'sortmerna',                     # rRNA removal
    'gffread',                       # Transcriptome extraction
    'salmon',                        # Quantification
    'bowtie2', 'samtools',           # Alignment
    'bamCoverage',                   # deepTools
]

# check_tool prints one status line per tool; keep the ones it could not find
missing = [tool for tool in tools if not check_tool(tool)]

if missing:
    print(f"\n[WARNING] Missing tools: {missing}")
    # Activating the environment does not install anything, so say what to do
    # in each case rather than "Install with: conda activate ...".
    print("Make sure the conda environment is active: conda activate pcc11901_rnaseq")
    print("If tools are still missing, recreate the environment from environment.yml.")
else:
    print("\nAll tools available!")

Checking required tools...

  [OK] fastqc: /opt/miniconda3/envs/pcc11901_rnaseq/bin/fastqc
  [OK] multiqc: /opt/miniconda3/envs/pcc11901_rnaseq/bin/multiqc
  [OK] fastp: /opt/miniconda3/envs/pcc11901_rnaseq/bin/fastp
  [OK] sortmerna: /opt/miniconda3/envs/pcc11901_rnaseq/bin/sortmerna
  [OK] gffread: /opt/miniconda3/envs/pcc11901_rnaseq/bin/gffread
  [OK] salmon: /opt/miniconda3/envs/pcc11901_rnaseq/bin/salmon
  [OK] bowtie2: /opt/miniconda3/envs/pcc11901_rnaseq/bin/bowtie2
  [OK] samtools: /opt/miniconda3/envs/pcc11901_rnaseq/bin/samtools
  [OK] bamCoverage: /opt/miniconda3/envs/pcc11901_rnaseq/bin/bamCoverage

All tools available!


---
## Phase 1: Pre-processing

### 1.1 Quality Control of Raw Reads

In [None]:
# === FastQC on Raw Reads ===
#
# Shell command equivalent:
# fastqc -t 8 -o 01_QC/fastqc_raw/ Data/*.fastq.gz
#
# This step checks:
# - Per-base sequence quality
# - Per-sequence quality scores
# - GC content distribution
# - Sequence duplication levels
# - Overrepresented sequences (adapters)
# - Adapter content

RUN_FASTQC_RAW = False  # Set to True to run

if RUN_FASTQC_RAW:
    output_dir = DIRS['qc'] / 'fastqc_raw'
    # Sorted so the command line (and its printed preview) is reproducible
    input_files = sorted(DIRS['data'].glob('*.fastq.gz'))

    if not input_files:
        # Same guard as the post-filter FastQC cell: never invoke fastqc with
        # an empty file list (without file arguments it does not run in batch mode).
        print(f"[WARNING] No FASTQ files found in {DIRS['data']}")
    else:
        print(f"Running FastQC on {len(input_files)} files...")
        print(f"Output: {output_dir}")

        # Build file list with proper quoting for paths with spaces
        file_list = ' '.join([f'"{f}"' for f in input_files])
        cmd = f'fastqc -t {N_THREADS} -o "{output_dir}" {file_list}'
        print(f"\nCommand:\n{cmd[:200]}...\n")

        # Run FastQC; failures are reported rather than raised
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"[ERROR] FastQC failed:\n{result.stderr}")
        else:
            print("FastQC complete!")
else:
    print("Skipping FastQC on raw reads (set RUN_FASTQC_RAW = True to run)")
    print(f"\nTo run manually:")
    print(f'fastqc -t {N_THREADS} -o "{DIRS["qc"]}/fastqc_raw/" "{DIRS["data"]}"/*.fastq.gz')

### 1.2 Trimming with fastp

In [None]:
# === Trimming with fastp ===
#
# fastp automatically:
# - Detects and removes adapters
# - Trims low-quality bases from both ends
# - Filters reads below quality threshold
# - Generates HTML/JSON QC reports
#
# For each sample, we process the 4 raw FASTQ files (2 lanes × 2 reads)
# and output trimmed files ready for lane merging.

RUN_FASTP = False  # Set to True to run

def run_fastp_sample(sample_id):
    """
    Run fastp on a single sample, one lane (L001, L002) at a time.

    For each lane the raw R1/R2 pair from the data directory is trimmed into
    '<sample>_<lane>_R1/R2.fastq.gz' in the trimmed directory, together with
    a per-lane JSON and HTML report.

    Parameters:
    -----------
    sample_id : str
        Sample ID (e.g., 'U1')

    Returns:
    --------
    str : the sample_id, once both lanes have been processed

    Raises:
    -------
    FileNotFoundError
        If a raw R1 or R2 file for one of the lanes cannot be found
    RuntimeError
        If fastp exits with a non-zero status; the message carries fastp's
        stderr so the cause is visible even when run through joblib
    """
    raw_files = get_fastq_files(sample_id, DIRS['data'], pattern='raw')

    # Process each lane separately
    for lane in ['L001', 'L002']:
        lane_inputs = {}
        for read in ('R1', 'R2'):
            candidates = [f for f in raw_files[read] if lane in f.name]
            if not candidates:
                # Explicit error instead of an IndexError from candidates[0]
                raise FileNotFoundError(
                    f"No raw {read} FASTQ found for sample {sample_id}, "
                    f"lane {lane} in {DIRS['data']}"
                )
            lane_inputs[read] = candidates[0]
        r1_in = lane_inputs['R1']
        r2_in = lane_inputs['R2']

        r1_out = DIRS['trimmed'] / f"{sample_id}_{lane}_R1.fastq.gz"
        r2_out = DIRS['trimmed'] / f"{sample_id}_{lane}_R2.fastq.gz"

        json_report = DIRS['trimmed'] / f"{sample_id}_{lane}.fastp.json"
        html_report = DIRS['trimmed'] / f"{sample_id}_{lane}.fastp.html"

        cmd = f"""
fastp \\
    -i "{r1_in}" \\
    -I "{r2_in}" \\
    -o "{r1_out}" \\
    -O "{r2_out}" \\
    --detect_adapter_for_pe \\
    --correction \\
    --qualified_quality_phred 20 \\
    --length_required 50 \\
    --thread {N_THREADS} \\
    --json "{json_report}" \\
    --html "{html_report}"
"""
        try:
            subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as err:
            # The captured stderr would otherwise be lost with the exception
            raise RuntimeError(
                f"fastp failed for {sample_id} {lane} "
                f"(exit code {err.returncode}):\n{err.stderr}"
            ) from err

    return sample_id


# Trim every sample in the metadata table, or print a reference command.
if RUN_FASTP:
    samples = metadata['sample_id'].tolist()
    print(f"Running fastp on {len(samples)} samples...")
    print(f"Output: {DIRS['trimmed']}")

    # Run in parallel (but fastp itself uses threads, so limit parallel jobs)
    results = Parallel(n_jobs=2)(
        delayed(run_fastp_sample)(s) for s in tqdm(samples, desc="Trimming")
    )

    print(f"\nCompleted trimming for {len(results)} samples.")
else:
    print("Skipping fastp trimming (set RUN_FASTP = True to run)")
    print("\nExample command for single sample:")
    # Kept in step with the options run_fastp_sample passes, so a manual run
    # produces the same trimming as the pipeline.
    print(f"""
fastp \\
    -i Data/U1-AMO17076A1-22FLL5LT1_S1_L001_R1_001.fastq.gz \\
    -I Data/U1-AMO17076A1-22FLL5LT1_S1_L001_R2_001.fastq.gz \\
    -o 02_trimmed/U1_L001_R1.fastq.gz \\
    -O 02_trimmed/U1_L001_R2.fastq.gz \\
    --detect_adapter_for_pe \\
    --correction \\
    --qualified_quality_phred 20 \\
    --length_required 50 \\
    --thread {N_THREADS} \\
    --json 02_trimmed/U1_L001.fastp.json \\
    --html 02_trimmed/U1_L001.fastp.html
""")

### 1.3 Lane Merging

In [None]:
# === Lane Merging ===
#
# Merge L001 and L002 files for each sample.
# Result: 120 files (60 samples × 2 reads)
#
# Shell equivalent:
# cat U1_L001_R1.fastq.gz U1_L002_R1.fastq.gz > U1_R1.fastq.gz

RUN_MERGE = False  # Set to True to run

def merge_lanes(sample_id):
    """
    Merge the trimmed L001 and L002 files of one sample, for R1 and R2.

    The lane files are concatenated byte-for-byte (gzip files can be joined
    directly), which gives the same output as `cat L001 L002 > merged`.
    Unlike the shell redirect, nothing is written when an input is missing,
    and the merged file only appears under its final name once it is
    complete, so a failed merge cannot leave a truncated FASTQ behind for
    later steps to pick up.

    Parameters:
    -----------
    sample_id : str
        Sample ID (e.g., 'U1')

    Returns:
    --------
    str : the sample_id, once both reads have been merged

    Raises:
    -------
    FileNotFoundError
        If a trimmed lane file for this sample does not exist
    """
    import shutil  # local import keeps this function self-contained

    for read in ['R1', 'R2']:
        lane_files = [
            DIRS['trimmed'] / f"{sample_id}_{lane}_{read}.fastq.gz"
            for lane in ('L001', 'L002')
        ]
        merged = DIRS['merged'] / f"{sample_id}_{read}.fastq.gz"

        absent = [str(p) for p in lane_files if not p.is_file()]
        if absent:
            raise FileNotFoundError(
                f"Cannot merge {sample_id} {read}; missing trimmed file(s): {absent}"
            )

        # Write under a temporary name that '*.fastq.gz' globs will not match
        partial = merged.with_name(merged.name + '.tmp')
        try:
            with open(partial, 'wb') as out_handle:
                for lane_file in lane_files:
                    with open(lane_file, 'rb') as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)
            partial.replace(merged)
        finally:
            # Only still present if the copy failed part-way
            if partial.exists():
                partial.unlink()

    return sample_id


# Merge lanes for every sample in the metadata table, or print reference commands.
if not RUN_MERGE:
    print("Skipping lane merging (set RUN_MERGE = True to run)")
    print("\nExample commands:")
    print("cat 02_trimmed/U1_L001_R1.fastq.gz 02_trimmed/U1_L002_R1.fastq.gz > 03_merged/U1_R1.fastq.gz")
    print("cat 02_trimmed/U1_L001_R2.fastq.gz 02_trimmed/U1_L002_R2.fastq.gz > 03_merged/U1_R2.fastq.gz")
else:
    samples = metadata['sample_id'].tolist()
    print(f"Merging lanes for {len(samples)} samples...")

    # One joblib task per sample, up to N_THREADS at a time
    merge_jobs = (delayed(merge_lanes)(s) for s in tqdm(samples, desc="Merging"))
    results = Parallel(n_jobs=N_THREADS)(merge_jobs)

    print(f"\nMerged files saved to: {DIRS['merged']}")

### 1.4 rRNA Removal with SortMeRNA

In [None]:
# === rRNA Removal with SortMeRNA ===
#
# SortMeRNA filters out ribosomal RNA reads using SILVA 138.1 database.
# This is critical for bacterial RNA-seq even after rRNA depletion during library prep.
#
# IMPORTANT: Memory-intensive! Use max 3-4 threads (~15GB RAM each)
# IMPORTANT: This step takes ~45-60 min PER SAMPLE. Running 60 samples = ~2.5 days!
#            DO NOT run in the notebook kernel - use the generated shell script instead.
#
# Database: SILVA 138.1 NR99 (SSU + LSU) - downloaded to Data/silva_db/
# The sequences have been converted from RNA (U) to DNA (T) for compatibility.
#
# This cell only checks for the database and writes logs/run_sortmerna_all.sh;
# it never runs SortMeRNA itself.

GENERATE_SORTMERNA_SCRIPT = True  # Generate shell script for external execution

# === SILVA 138.1 Database Configuration ===
# Using custom downloaded SILVA database (more comprehensive than conda-bundled)
SILVA_DIR = DIRS['data'] / 'silva_db'
REF_SSU = SILVA_DIR / 'SILVA_138.1_SSURef_NR99_tax_silva.fasta'  # 16S/18S
REF_LSU = SILVA_DIR / 'SILVA_138.1_LSURef_NR99_tax_silva.fasta'  # 23S/28S

# Verify database exists
if SILVA_DIR.exists():
    print(f"[OK] SILVA database directory: {SILVA_DIR}")
    if REF_SSU.exists():
        print(f"  [OK] SSU (16S/18S): {REF_SSU.name}")
    else:
        print(f"  [MISSING] SSU: {REF_SSU}")
    if REF_LSU.exists():
        print(f"  [OK] LSU (23S/28S): {REF_LSU.name}")
    else:
        print(f"  [MISSING] LSU: {REF_LSU}")
else:
    print(f"[WARNING] SILVA database not found at {SILVA_DIR}")
    print("Download with:")
    print("  mkdir -p Data/silva_db && cd Data/silva_db")
    print("  curl -O https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz")
    print("  curl -O https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz")
    print("  gunzip *.gz")
    # '/^>/!' restricts the substitution to sequence lines, so a 'U' in the
    # FASTA header lines is left untouched.
    print("  sed -i '' '/^>/!s/U/T/g' *.fasta  # Convert RNA to DNA (sequence lines only)")

if GENERATE_SORTMERNA_SCRIPT:
    samples = metadata['sample_id'].tolist()
    script_path = DIRS['logs'] / "run_sortmerna_all.sh"

    # The script text contains a non-ASCII '×', so fix the encoding explicitly
    with open(script_path, "w", encoding="utf-8") as f:
        f.write("#!/bin/bash\n")
        f.write("# ==============================================\n")
        f.write("# SortMeRNA Batch Script - PCC 11901 RNA-seq\n")
        f.write("# ==============================================\n")
        f.write("# Using SILVA 138.1 NR99 database (SSU + LSU)\n")
        f.write("# IMPORTANT: Run this in a separate terminal, NOT in Jupyter!\n")
        f.write("# Estimated runtime: ~45-60 hours (60 samples × ~1 hour each)\n")
        f.write("#\n")
        f.write("# Usage:\n")
        f.write("#   conda activate pcc11901_rnaseq\n")
        # Quoted: the project path may contain spaces
        f.write(f"#   bash \"{script_path}\"\n")
        f.write("#\n")
        f.write("# To resume after interruption, comment out completed samples.\n")
        f.write("# ==============================================\n\n")
        # pipefail is required because every sortmerna call is piped into tee:
        # with plain 'set -e' the pipeline's status is tee's, so a failing
        # sortmerna would not stop the script.
        f.write("set -eo pipefail  # Exit on error, including failures inside a pipeline\n\n")

        # SILVA database paths
        f.write("# SILVA 138.1 database paths\n")
        f.write(f"REF_SSU=\"{REF_SSU}\"\n")
        f.write(f"REF_LSU=\"{REF_LSU}\"\n\n")

        # Verify database exists
        f.write("# Verify database exists\n")
        f.write("if [ ! -f \"$REF_SSU\" ]; then\n")
        f.write("    echo \"ERROR: SSU database not found at $REF_SSU\"\n")
        f.write("    exit 1\n")
        f.write("fi\n")
        f.write("if [ ! -f \"$REF_LSU\" ]; then\n")
        f.write("    echo \"ERROR: LSU database not found at $REF_LSU\"\n")
        f.write("    exit 1\n")
        f.write("fi\n\n")

        f.write(f"echo \"Starting SortMeRNA processing for {len(samples)} samples...\"\n")
        f.write("echo \"Database: SILVA 138.1 NR99\"\n")
        f.write("echo \"Start time: $(date)\"\n\n")

        # One self-contained section per sample, in metadata order
        for i, sample in enumerate(samples, 1):
            r1 = DIRS['merged'] / f"{sample}_R1.fastq.gz"
            r2 = DIRS['merged'] / f"{sample}_R2.fastq.gz"
            aligned_prefix = DIRS['rrna_filtered'] / 'rRNA' / sample
            other_prefix = DIRS['rrna_filtered'] / 'non_rRNA' / sample
            workdir = DIRS['rrna_filtered'] / f'workdir_{sample}'
            log_file = DIRS['logs'] / f'{sample}_sortmerna.log'

            f.write(f"# === Sample {i}/{len(samples)}: {sample} ===\n")
            f.write(f"echo \"[{i}/{len(samples)}] Processing {sample}...\"\n")
            # Start from an empty work directory so leftovers from an
            # interrupted run cannot interfere with this one
            # (NOTE(review): SortMeRNA is expected to object to a non-empty
            # work directory - confirm for the installed version).
            f.write(f"rm -rf \"{workdir}\"\n")
            f.write(f"mkdir -p \"{workdir}\"\n")
            f.write(f"sortmerna \\\n")
            f.write(f"    --ref \"$REF_SSU\" \\\n")
            f.write(f"    --ref \"$REF_LSU\" \\\n")
            f.write(f"    --reads \"{r1}\" --reads \"{r2}\" \\\n")
            f.write(f"    --paired_in --out2 \\\n")
            f.write(f"    --aligned \"{aligned_prefix}\" \\\n")
            f.write(f"    --other \"{other_prefix}\" \\\n")
            f.write(f"    --fastx \\\n")
            f.write(f"    --threads {SORTMERNA_THREADS} \\\n")
            f.write(f"    --workdir \"{workdir}\" \\\n")
            f.write(f"    2>&1 | tee \"{log_file}\"\n")
            f.write(f"rm -rf \"{workdir}\"\n")
            f.write(f"echo \"  Completed {sample} at $(date)\"\n\n")

        f.write("echo \"==============================================\"\n")
        f.write("echo \"SortMeRNA processing complete!\"\n")
        f.write("echo \"End time: $(date)\"\n")
        f.write("echo \"==============================================\"\n")

    # Make executable
    os.chmod(script_path, 0o755)

    print(f"\nGenerated SortMeRNA batch script: {script_path}")
    print(f"\nTo run (in a separate terminal):")
    print(f"  conda activate pcc11901_rnaseq")
    # Quoted so the command can be pasted even when the path contains spaces
    print(f'  bash "{script_path}"')
    print(f"\nEstimated runtime: ~45-60 hours for {len(samples)} samples")
    print("TIP: Use 'screen' or 'tmux' to keep it running if you disconnect.")
else:
    print("Skipping SortMeRNA script generation (set GENERATE_SORTMERNA_SCRIPT = True)")
    print("\nExample command for single sample:")
    print(f"""
sortmerna \\
    --ref "{REF_SSU}" \\
    --ref "{REF_LSU}" \\
    --reads 03_merged/U1_R1.fastq.gz --reads 03_merged/U1_R2.fastq.gz \\
    --paired_in --out2 \\
    --aligned 04_rRNA_filtered/rRNA/U1 \\
    --other 04_rRNA_filtered/non_rRNA/U1 \\
    --fastx --threads {SORTMERNA_THREADS}
""")

### 1.5 Post-Processing QC

In [None]:
# === FastQC on Trimmed/Filtered Reads ===
#
# Second FastQC pass, this time over the rRNA-filtered reads, so the reports
# can be compared with the raw-read reports.

RUN_FASTQC_TRIMMED = False  # Set to True to run

if not RUN_FASTQC_TRIMMED:
    print("Skipping post-filter FastQC (set RUN_FASTQC_TRIMMED = True to run)")
else:
    output_dir = DIRS['qc'] / 'fastqc_trimmed'
    input_dir = DIRS['rrna_filtered'] / 'non_rRNA'

    # Filtered reads may carry either extension; '.fq.gz' matches are listed first
    input_files = [*input_dir.glob('*.fq.gz'), *input_dir.glob('*.fastq.gz')]

    if input_files:
        print(f"Running FastQC on {len(input_files)} filtered files...")
        print(f"Output: {output_dir}")

        # Each path is double-quoted because the project path can contain spaces
        file_list = ' '.join(f'"{f}"' for f in input_files)
        cmd = f'fastqc -t {N_THREADS} -o "{output_dir}" {file_list}'

        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        if result.returncode == 0:
            print("FastQC complete!")
        else:
            print(f"[ERROR] FastQC failed:\n{result.stderr}")
    else:
        print(f"[WARNING] No FASTQ files found in {input_dir}")

In [None]:
# === MultiQC Summary Report ===
#
# Collects the FastQC output folders and the fastp reports in the trimmed
# directory into one interactive HTML report.

RUN_MULTIQC = False  # Set to True to run

if not RUN_MULTIQC:
    print("Skipping MultiQC (set RUN_MULTIQC = True to run)")
    print(f'\nTo run manually:\nmultiqc "{DIRS["qc"]}/" -o "{DIRS["qc"]}/multiqc_reports/"')
else:
    output_dir = DIRS['qc'] / 'multiqc_reports'

    # Directories MultiQC scans for reports, each quoted for paths with spaces
    scan_dirs = [
        f"{DIRS['qc']}/fastqc_raw",
        f"{DIRS['qc']}/fastqc_trimmed",
        f"{DIRS['trimmed']}",
    ]
    cmd = ' '.join(
        ['multiqc']
        + [f'"{d}"' for d in scan_dirs]
        + [f'-o "{output_dir}"', '--filename multiqc_report', '--force']
    )

    print("Running MultiQC...")
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"Report saved to: {output_dir}/multiqc_report.html")
    else:
        print(f"[ERROR] MultiQC failed:\n{result.stderr}")

---
## Phase 2: Quantification and Alignment

### 2.1 Extract Transcriptome (gffread)

In [None]:
# === Extract Transcriptome with gffread ===
#
# Writes the transcript sequences described by the GTF, cut from the genome
# FASTA, into TRANSCRIPTOME_FASTA. Salmon indexing needs this file.
#
# Shell command:
# gffread -w transcriptome.fa -g genome.fna annotation.gtf

RUN_GFFREAD = False  # Set to True to run

if not RUN_GFFREAD:
    print("Skipping transcriptome extraction (set RUN_GFFREAD = True to run)")
    print(f"\nTo run manually:")
    print(f'gffread -w "{TRANSCRIPTOME_FASTA}" -g "{GENOME_FASTA}" "{GTF_FILE}"')
else:
    # Paths are quoted because the genome directory name contains a space
    cmd = ' '.join([
        'gffread',
        f'-w "{TRANSCRIPTOME_FASTA}"',
        f'-g "{GENOME_FASTA}"',
        f'"{GTF_FILE}"',
    ])

    print("Extracting transcriptome...")
    print(f"Command: gffread -w transcriptome.fa -g genome.fna annotation.gtf")

    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"[ERROR] gffread failed:\n{result.stderr}")
    elif TRANSCRIPTOME_FASTA.exists():
        # Every FASTA record starts with one '>' header line
        with open(TRANSCRIPTOME_FASTA, 'r') as f:
            n_transcripts = sum(line.startswith('>') for line in f)
        print(f"\nTranscriptome created: {TRANSCRIPTOME_FASTA}")
        print(f"Number of transcripts: {n_transcripts}")
    else:
        print("[ERROR] Transcriptome extraction failed - file not created!")

### 2.2 Salmon Index and Quantification

In [None]:
# === Build Salmon Index ===
#
# Using decoy-aware mapping for better accuracy:
# - Target: transcriptome sequences
# - Decoy: whole genome (prevents false quantification of intergenic reads)
#
# Shell commands:
# grep "^>" genome.fna | cut -d " " -f 1 | sed 's/>//g' > decoys.txt
# cat transcriptome.fa genome.fna > gentrome.fa
# salmon index -t gentrome.fa -d decoys.txt -i salmon_index -p 8
#
# The three steps depend on each other, so the build stops at the first
# step that fails instead of indexing incomplete inputs.

RUN_SALMON_INDEX = False  # Set to True to run

SALMON_INDEX = DIRS['salmon'] / 'index'
GENTROME = DIRS['genome'] / 'gentrome.fa'
DECOYS = DIRS['genome'] / 'decoys.txt'

if RUN_SALMON_INDEX:
    print("Building Salmon index with decoy-aware mapping...\n")

    # Step 1 command: decoy names = first word of every genome FASTA header
    cmd_decoys = f'grep "^>" "{GENOME_FASTA}" | cut -d " " -f 1 | sed "s/>//g" > "{DECOYS}"'
    # Step 2 command: gentrome = transcriptome followed by the genome
    cmd_gentrome = f'cat "{TRANSCRIPTOME_FASTA}" "{GENOME_FASTA}" > "{GENTROME}"'
    # Step 3 command: index the gentrome, marking the genome sequences as decoys
    cmd_index = f'''salmon index \
        -t "{GENTROME}" \
        -d "{DECOYS}" \
        -i "{SALMON_INDEX}" \
        -p {SALMON_THREADS}'''

    if not TRANSCRIPTOME_FASTA.exists():
        # The transcriptome is generated by the gffread step, not shipped
        print(f"[ERROR] Transcriptome not found: {TRANSCRIPTOME_FASTA}")
        print("Run the gffread transcriptome extraction step first.")
    else:
        # (banner, command, label for error message, success message)
        index_steps = [
            ("Step 1: Creating decoys list...", cmd_decoys,
             "Creating decoys", f"  Decoys file: {DECOYS}"),
            ("\nStep 2: Creating gentrome...", cmd_gentrome,
             "Creating gentrome", f"  Gentrome file: {GENTROME}"),
            ("\nStep 3: Building Salmon index (this may take a few minutes)...", cmd_index,
             "Salmon index", f"\nSalmon index created: {SALMON_INDEX}"),
        ]
        for banner, step_cmd, label, done_msg in index_steps:
            print(banner)
            result = subprocess.run(step_cmd, shell=True, capture_output=True, text=True)
            if result.returncode != 0:
                print(f"[ERROR] {label} failed:\n{result.stderr}")
                print("Aborting index build: the remaining steps depend on this one.")
                break
            print(done_msg)
else:
    print("Skipping Salmon index build (set RUN_SALMON_INDEX = True to run)")
    print("\nCommands to run manually:")
    print(f'grep "^>" "{GENOME_FASTA}" | cut -d " " -f 1 | sed "s/>//g" > "{DECOYS}"')
    print(f'cat "{TRANSCRIPTOME_FASTA}" "{GENOME_FASTA}" > "{GENTROME}"')
    print(f'salmon index -t "{GENTROME}" -d "{DECOYS}" -i "{SALMON_INDEX}" -p {SALMON_THREADS}')

In [None]:
# === Salmon Quantification ===
#
# Quantify transcript abundance for each sample.
# Salmon auto-detects library type (strandedness).
#
# Shell command:
# salmon quant -i salmon_index -l A \
#     -1 U1_R1.fastq.gz -2 U1_R2.fastq.gz \
#     -o quants/U1 --validateMappings --gcBias --seqBias -p 10

RUN_SALMON_QUANT = False  # Set to True to run

def run_salmon_quant(sample_id):
    """
    Run Salmon quantification on a single sample.

    Reads the rRNA-filtered pair '<sample>_fwd.fq.gz' / '<sample>_rev.fq.gz'
    from 04_rRNA_filtered/non_rRNA/ and writes Salmon's output to
    05_salmon/quants/<sample>/.

    Problems are reported with an '[ERROR]' line instead of being silently
    dropped; no exception is raised, so a loop over many samples keeps going.

    Parameters:
    -----------
    sample_id : str
        Sample ID (e.g., 'U1')

    Returns:
    --------
    str : the sample_id (returned whether or not Salmon succeeded)
    """
    # Input files (from SortMeRNA output)
    r1 = DIRS['rrna_filtered'] / 'non_rRNA' / f"{sample_id}_fwd.fq.gz"
    r2 = DIRS['rrna_filtered'] / 'non_rRNA' / f"{sample_id}_rev.fq.gz"

    # Output directory
    output_dir = DIRS['salmon'] / 'quants' / sample_id

    # Fail early with a clear message when the filtered reads are not there
    absent = [str(p) for p in (r1, r2) if not p.exists()]
    if absent:
        print(f"[ERROR] Salmon skipped for {sample_id}; missing input(s): {absent}")
        return sample_id

    cmd = f"""
salmon quant \\
    -i "{SALMON_INDEX}" \\
    -l A \\
    -1 "{r1}" \\
    -2 "{r2}" \\
    -o "{output_dir}" \\
    --validateMappings \\
    --gcBias \\
    --seqBias \\
    -p {SALMON_THREADS}
"""
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        # Same reporting style as the other pipeline cells
        print(f"[ERROR] Salmon quant failed for {sample_id}:\n{result.stderr}")
    return sample_id


# Run Salmon on every sample listed in the metadata table, or, when the cell
# is disabled, print an example command that mirrors what would be run.
if RUN_SALMON_QUANT:
    samples = metadata['sample_id'].tolist()
    print(f"Running Salmon quantification on {len(samples)} samples...")
    print(f"Output: {DIRS['salmon']}/quants/")
    
    # Run sequentially (Salmon already uses multiple threads)
    for sample in tqdm(samples, desc="Salmon quant"):
        run_salmon_quant(sample)
    
    print("\nSalmon quantification complete!")
else:
    print("Skipping Salmon quantification (set RUN_SALMON_QUANT = True to run)")
    print("\nExample command for single sample:")
    # Keep the flags in sync with run_salmon_quant (which also passes --seqBias).
    print(f"""
salmon quant \\
    -i "{SALMON_INDEX}" \\
    -l A \\
    -1 04_rRNA_filtered/non_rRNA/U1_fwd.fq.gz \\
    -2 04_rRNA_filtered/non_rRNA/U1_rev.fq.gz \\
    -o 05_salmon/quants/U1 \\
    --validateMappings --gcBias --seqBias -p {SALMON_THREADS}
""")

### 2.3 Bowtie2 Alignment (for Visualization)

In [None]:
# === Build Bowtie2 Index ===
#
# Build genome index for read alignment.
# This is needed to create BAM files for IGV visualization.
#
# Shell command:
# bowtie2-build genome.fna bowtie2_index/pcc11901

RUN_BOWTIE2_INDEX = False  # Set to True to run

# Index prefix shared by the index build and the alignment step.
BOWTIE2_INDEX = DIRS['alignment'] / 'bowtie2_index' / 'pcc11901'

if not RUN_BOWTIE2_INDEX:
    # Disabled: only show the command so it can be run by hand.
    print("Skipping Bowtie2 index (set RUN_BOWTIE2_INDEX = True to run)")
    print(f'\nTo run manually:\nbowtie2-build --threads {N_THREADS} "{GENOME_FASTA}" "{BOWTIE2_INDEX}"')
else:
    print("Building Bowtie2 index...")

    # The echoed command uses short placeholder paths; the executed one uses the real ones.
    print(f"Command: bowtie2-build --threads {N_THREADS} genome.fna bowtie2_index/pcc11901\n")
    cmd = f'bowtie2-build --threads {N_THREADS} "{GENOME_FASTA}" "{BOWTIE2_INDEX}"'

    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"Bowtie2 index created: {BOWTIE2_INDEX}")
    else:
        print(f"[ERROR] Bowtie2 index failed:\n{result.stderr}")

In [None]:
# === Bowtie2 Alignment ===
#
# Align reads to genome and create sorted BAM files.
#
# Shell command:
# bowtie2 -x bowtie2_index/pcc11901 -1 R1.fq.gz -2 R2.fq.gz -p 8 | \
#     samtools view -bS - | samtools sort -o sample.bam
# samtools index sample.bam

RUN_BOWTIE2_ALIGN = False  # Set to True to run

def run_bowtie2_align(sample_id):
    """
    Align reads with Bowtie2 and create a sorted, indexed BAM.

    The paired rRNA-filtered reads of ``sample_id`` are aligned against
    ``BOWTIE2_INDEX``; Bowtie2's stderr goes to ``<logs>/<sample_id>_bowtie2.log``
    and the alignments are piped through ``samtools view`` and
    ``samtools sort`` into ``<alignment>/bam/<sample_id>.bam``, which is then
    indexed.

    Parameters
    ----------
    sample_id : str
        Sample identifier used to build the input, log and BAM paths.

    Returns
    -------
    str
        ``sample_id``.

    Raises
    ------
    subprocess.CalledProcessError
        If any stage of the pipeline (bowtie2, samtools view/sort/index)
        exits with a non-zero status.
    """
    r1 = DIRS['rrna_filtered'] / 'non_rRNA' / f"{sample_id}_fwd.fq.gz"
    r2 = DIRS['rrna_filtered'] / 'non_rRNA' / f"{sample_id}_rev.fq.gz"

    bam_file = DIRS['alignment'] / 'bam' / f"{sample_id}.bam"
    # samtools sort does not create missing parent directories.
    bam_file.parent.mkdir(parents=True, exist_ok=True)

    # `set -e -o pipefail` makes the script stop at, and report, the first
    # failing stage. Without it the exit status seen by check=True is only that
    # of the final `samtools index`, so a bowtie2 failure could go unnoticed.
    cmd = f"""
set -eo pipefail
bowtie2 \\
    -x "{BOWTIE2_INDEX}" \\
    -1 "{r1}" \\
    -2 "{r2}" \\
    -p {N_THREADS} \\
    2> "{DIRS['logs']}/{sample_id}_bowtie2.log" | \\
samtools view -bS - | \\
samtools sort -@ 4 -o "{bam_file}" -

samtools index "{bam_file}"
"""
    # pipefail is a bash option; the default /bin/sh may not support it.
    subprocess.run(cmd, shell=True, check=True, executable="/bin/bash")
    return sample_id


if not RUN_BOWTIE2_ALIGN:
    # Disabled: print an example pipeline for a single sample instead.
    print("Skipping Bowtie2 alignment (set RUN_BOWTIE2_ALIGN = True to run)")
    print("\nExample command:")
    print(f"""
bowtie2 -x "{BOWTIE2_INDEX}" \\
    -1 04_rRNA_filtered/non_rRNA/U1_fwd.fq.gz \\
    -2 04_rRNA_filtered/non_rRNA/U1_rev.fq.gz \\
    -p {N_THREADS} | samtools view -bS - | samtools sort -o 06_alignment/bam/U1.bam -
samtools index 06_alignment/bam/U1.bam
""")
else:
    # Align every sample listed in the metadata table, one after another.
    samples = metadata['sample_id'].tolist()
    print(f"Running Bowtie2 alignment on {len(samples)} samples...")

    for sample in tqdm(samples, desc="Aligning"):
        run_bowtie2_align(sample)

    print(f"\nBAM files saved to: {DIRS['alignment']}/bam/")

In [None]:
# === Create bigWig Coverage Tracks ===
#
# Convert BAM to normalized bigWig for IGV visualization.
# Using CPM (counts per million) normalization.
#
# Shell command:
# bamCoverage -b sample.bam -o sample.bw --normalizeUsing CPM -p 8

RUN_BIGWIG = False  # Set to True to run

def create_bigwig(sample_id):
    """
    Create a CPM-normalized bigWig from a sample's BAM file.

    Runs ``bamCoverage`` on ``<alignment>/bam/<sample_id>.bam`` and writes
    ``<alignment>/bigwig/<sample_id>.bw``.

    Parameters
    ----------
    sample_id : str
        Sample identifier used to build the BAM and bigWig paths.

    Returns
    -------
    str
        ``sample_id``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``bamCoverage`` exits with a non-zero status. Its captured stderr
        is printed before the exception is re-raised.
    """
    bam_file = DIRS['alignment'] / 'bam' / f"{sample_id}.bam"
    bigwig_file = DIRS['alignment'] / 'bigwig' / f"{sample_id}.bw"
    # Make sure the output directory exists before the tool tries to write to it.
    bigwig_file.parent.mkdir(parents=True, exist_ok=True)

    cmd = f"""
bamCoverage \\
    -b "{bam_file}" \\
    -o "{bigwig_file}" \\
    --normalizeUsing CPM \\
    -p {N_THREADS}
"""
    try:
        subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        # The output is captured, so the tool's message must be surfaced
        # explicitly or the traceback alone gives no hint of the cause.
        print(f"[ERROR] bamCoverage failed for {sample_id}:\n{err.stderr}")
        raise
    return sample_id


if not RUN_BIGWIG:
    # Disabled: show the single-sample command for manual use.
    print("Skipping bigWig creation (set RUN_BIGWIG = True to run)")
    print(f"\nTo run manually:\nbamCoverage -b 06_alignment/bam/U1.bam -o 06_alignment/bigwig/U1.bw --normalizeUsing CPM -p {N_THREADS}")
else:
    # One coverage track per sample listed in the metadata table.
    samples = metadata['sample_id'].tolist()
    print(f"Creating bigWig files for {len(samples)} samples...")

    for sample in tqdm(samples, desc="bigWig"):
        create_bigwig(sample)

    print(f"\nbigWig files saved to: {DIRS['alignment']}/bigwig/")

---
## Phase 3: Differential Expression Analysis

### 3.1 Import Salmon Counts with tximport

In [None]:
# === Setup R Environment via rpy2 ===

# Bind the R packages used by the differential-expression cells, or explain
# that R integration is unavailable when rpy2 could not be imported.
if not RPY2_AVAILABLE:
    print("[SKIPPED] rpy2 not available - R integration disabled")
    print("DESeq2 analysis will need to be run directly in R")
else:
    # Import R packages
    base = importr('base')
    # NOTE(review): this rebinds the module-level name `stats`, which the
    # setup cell imported with `from scipy import stats`. After this cell runs,
    # `stats` refers to the R package handle, not scipy.stats - confirm that no
    # later cell expects scipy's `stats`, or bind this under a different name.
    stats = importr('stats')
    
    # Load Bioconductor packages
    # A failure here is reported but not re-raised, so later cells that need
    # these packages will fail on their own if loading did not succeed.
    try:
        tximport = importr('tximport')
        deseq2 = importr('DESeq2')
        print("[OK] tximport and DESeq2 loaded")
    except Exception as e:
        print(f"[ERROR] Could not load R packages: {e}")
        print("Install with: R -e 'BiocManager::install(c(\"tximport\", \"DESeq2\"))'")

In [None]:
# === Load Salmon Counts ===
#
# Use tximport to aggregate transcript-level counts to gene level.
# This is the recommended way to import Salmon output into DESeq2.

LOAD_COUNTS = False  # Set to True after Salmon quant is complete

if LOAD_COUNTS:
    # --- 1. Build the transcript -> gene table that tximport needs ---
    print("Creating transcript-to-gene mapping...")
    
    # Parse GTF to get transcript -> gene mapping
    # Note: NCBI RefSeq GTF files may use different attribute formats,
    # so several spellings of each attribute are tried in turn.
    tx2gene = []
    with open(GTF_FILE, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 9:
                continue
            
            # Only CDS, exon and transcript features are considered
            if fields[2] not in ('CDS', 'exon', 'transcript'):
                continue
            
            attrs = fields[8]
            
            # transcript_id: quoted GTF style first, then key=value style
            tx_match = (
                re.search(r'transcript_id "([^"]+)"', attrs)
                or re.search(r'transcript_id=([^;]+)', attrs)
            )
            
            # gene_id: quoted style, key=value style, then locus_tag as fallback
            gene_match = (
                re.search(r'gene_id "([^"]+)"', attrs)
                or re.search(r'gene_id=([^;]+)', attrs)
                or re.search(r'locus_tag "([^"]+)"', attrs)
            )
            
            if tx_match and gene_match:
                tx2gene.append({
                    'transcript_id': tx_match.group(1),
                    'gene_id': gene_match.group(1)
                })
    
    tx2gene_df = pd.DataFrame(tx2gene).drop_duplicates()
    
    # === VALIDATION: Check tx2gene parsing succeeded ===
    if len(tx2gene_df) == 0:
        print("\n[ERROR] tx2gene parsing failed! No transcript-gene mappings found.")
        print("Debugging info - first 5 lines of GTF attributes:")
        # Show exactly five attribute strings and stop reading, independent of
        # how many '#' header lines the file starts with.
        shown = 0
        with open(GTF_FILE, 'r') as f:
            for line in f:
                if line.startswith('#'):
                    continue
                fields = line.strip().split('\t')
                if len(fields) >= 9:
                    print(f"  {fields[8][:200]}...")
                    shown += 1
                    if shown == 5:
                        break
        raise ValueError(
            "tx2gene parsing failed! Check if your GTF uses 'gene_id'/'transcript_id' "
            "or alternative formats like 'gene', 'locus_tag', etc. "
            "Preview the GTF attributes above to debug."
        )
    
    # tximport reads the mapping from disk; column order is transcript, gene.
    tx2gene_file = DIRS['deseq2'] / 'tx2gene.csv'
    tx2gene_df.to_csv(tx2gene_file, index=False)
    print(f"  tx2gene mapping: {len(tx2gene_df)} transcripts -> {tx2gene_df['gene_id'].nunique()} genes")
    
    # --- 2. Locate the per-sample Salmon outputs ---
    samples = metadata['sample_id'].tolist()
    quant_files = [str(DIRS['salmon'] / 'quants' / s / 'quant.sf') for s in samples]
    
    # Check that files exist
    missing = [f for f in quant_files if not Path(f).exists()]
    if missing:
        print(f"[WARNING] Missing quant files: {len(missing)}")
        print(f"  First missing: {missing[0]}")
    else:
        print(f"  Found all {len(quant_files)} quant.sf files")
    
    # --- 3. Import with tximport via R ---
    # `ro` only exists when rpy2 imported successfully; fail with a clear
    # message instead of a NameError.
    if not RPY2_AVAILABLE:
        raise RuntimeError(
            "rpy2 is not available, so tximport cannot be run from this notebook. "
            f"The tx2gene table was written to {tx2gene_file}; run tximport directly in R."
        )
    
    print("\nRunning tximport...")
    
    ro.r(f'''
    library(tximport)
    
    # Read tx2gene
    tx2gene <- read.csv("{tx2gene_file}")
    
    # Sample files
    files <- c({', '.join([f'"{f}"' for f in quant_files])})
    names(files) <- c({', '.join([f'"{s}"' for s in samples])})
    
    # Import
    txi <- tximport(files, type="salmon", tx2gene=tx2gene)
    
    # Save counts
    counts <- as.data.frame(txi$counts)
    write.csv(counts, "{DIRS['deseq2']}/counts_matrix.csv")
    ''')
    
    # --- 4. Load counts into Python ---
    counts_df = pd.read_csv(DIRS['deseq2'] / 'counts_matrix.csv', index_col=0)
    print(f"\nCounts matrix: {counts_df.shape[0]} genes × {counts_df.shape[1]} samples")
    # A bare `counts_df.head()` inside an if-block is never displayed; print it.
    print(counts_df.head())
else:
    print("Skipping count loading (set LOAD_COUNTS = True after Salmon quant is complete)")

### 3.2 DESeq2 Analysis

In [None]:
# === DESeq2 Analysis - Group 1 (Nutrients) ===
#
# Compare each nutrient condition vs Control (U4, U5, U6)
#
# Conditions:
# - Glycerol_0.75pct
# - Low_Nitrogen, High_Nitrogen
# - Low_Phosphate, High_Phosphate
# - Ammonia, Urea

RUN_DESEQ2_GROUP1 = False  # Set to True to run

# Group 1 workflow: write the group's metadata to CSV, then run an R script
# through rpy2 that builds a DESeq2 dataset from counts_matrix.csv, fits the
# model and writes one results CSV per non-control condition.
# NOTE(review): `ro` is only bound when the rpy2 import in the setup cell
# succeeded; this cell does not check RPY2_AVAILABLE itself.
if RUN_DESEQ2_GROUP1:
    print("Running DESeq2 for Group 1 (Nutrients)...\n")
    
    # Filter metadata for Group 1
    group1_meta = metadata[metadata['group'] == 1].copy()
    # Not referenced again within this cell (the R code subsets the counts by
    # the sample_id column of the metadata CSV instead).
    group1_samples = group1_meta['sample_id'].tolist()
    
    # Save metadata for R
    group1_meta_file = DIRS['deseq2'] / 'group1_metadata.csv'
    group1_meta.to_csv(group1_meta_file, index=False)
    
    # Run DESeq2 in R
    # The script below is an f-string: single braces interpolate Python values
    # (file paths), doubled braces are literal braces of the R for-loop.
    # Steps performed by the R code:
    #   1. read the metadata and make "Control_MAD" the reference level
    #   2. read counts_matrix.csv, keep this group's samples, round to integers
    #   3. keep genes with a count >= 10 in at least 3 samples
    #   4. run DESeq() with design ~ condition and save normalized counts
    #   5. for every non-control condition, write group1_<cond>_vs_Control.csv
    #      and print the number of genes with padj < 0.05 and |log2FC| > 1
    # NOTE(review): relevel() requires a "Control_MAD" level in the condition
    # column - confirm the metadata uses exactly this label for the controls.
    ro.r(f'''
    library(DESeq2)
    library(tximport)
    
    # Load metadata
    coldata <- read.csv("{group1_meta_file}")
    rownames(coldata) <- coldata$sample_id
    coldata$condition <- factor(coldata$condition)
    coldata$condition <- relevel(coldata$condition, ref="Control_MAD")
    
    # Load counts (subset to Group 1 samples)
    counts <- read.csv("{DIRS['deseq2']}/counts_matrix.csv", row.names=1)
    counts <- counts[, coldata$sample_id]
    counts <- round(counts)  # DESeq2 requires integers
    
    # Create DESeq2 object
    dds <- DESeqDataSetFromMatrix(
        countData = counts,
        colData = coldata,
        design = ~ condition
    )
    
    # Filter low counts
    keep <- rowSums(counts(dds) >= 10) >= 3
    dds <- dds[keep,]
    
    # Run DESeq2
    dds <- DESeq(dds)
    
    # Save normalized counts
    norm_counts <- as.data.frame(counts(dds, normalized=TRUE))
    write.csv(norm_counts, "{DIRS['deseq2']}/group1_normalized_counts.csv")
    
    # Extract results for each comparison
    conditions <- levels(coldata$condition)
    conditions <- conditions[conditions != "Control_MAD"]
    
    for (cond in conditions) {{
        res <- results(dds, contrast=c("condition", cond, "Control_MAD"))
        res <- as.data.frame(res)
        res$gene_id <- rownames(res)
        write.csv(res, paste0("{DIRS['deseq2']}/group1_", cond, "_vs_Control.csv"), row.names=FALSE)
        
        # Summary
        sig <- sum(res$padj < 0.05 & abs(res$log2FoldChange) > 1, na.rm=TRUE)
        cat(paste0(cond, " vs Control: ", sig, " DE genes\n"))
    }}
    ''')
    
    print("\nGroup 1 analysis complete!")
    print(f"Results saved to: {DIRS['deseq2']}/")
else:
    print("Skipping DESeq2 Group 1 (set RUN_DESEQ2_GROUP1 = True to run)")

In [None]:
# === DESeq2 Analysis - Group 2 (Environmental) ===
#
# Compare each environmental condition vs Control (U1, U2, U3)
#
# Conditions:
# - High_NaCl_9pct, H2O2_0.005pct
# - Atmospheric_CO2, High_CO2_8pct
# - High_Temp_38C
# - Low_Light_15uE, High_Light

RUN_DESEQ2_GROUP2 = False  # Set to True to run

# Group 2 workflow: write the group's metadata to CSV, then run an R script
# through rpy2 that builds a DESeq2 dataset from counts_matrix.csv, fits the
# model and writes one results CSV per non-control condition.
# NOTE(review): `ro` is only bound when the rpy2 import in the setup cell
# succeeded; this cell does not check RPY2_AVAILABLE itself.
if RUN_DESEQ2_GROUP2:
    print("Running DESeq2 for Group 2 (Environmental)...\n")
    
    # Filter metadata for Group 2
    group2_meta = metadata[metadata['group'] == 2].copy()
    
    # Save metadata for R
    group2_meta_file = DIRS['deseq2'] / 'group2_metadata.csv'
    group2_meta.to_csv(group2_meta_file, index=False)
    
    # Run DESeq2 in R
    # The script below is an f-string: single braces interpolate Python values
    # (file paths), doubled braces are literal braces of the R for-loop.
    # Steps performed by the R code:
    #   1. read the metadata and make "Control_MAD" the reference level
    #   2. read counts_matrix.csv, keep this group's samples, round to integers
    #   3. keep genes with a count >= 10 in at least 3 samples
    #   4. run DESeq() with design ~ condition and save normalized counts
    #   5. for every non-control condition, write group2_<cond>_vs_Control.csv
    #      and print the number of genes with padj < 0.05 and |log2FC| > 1
    # NOTE(review): relevel() requires a "Control_MAD" level in the condition
    # column - confirm the metadata uses exactly this label for the controls.
    ro.r(f'''
    library(DESeq2)
    
    # Load metadata
    coldata <- read.csv("{group2_meta_file}")
    rownames(coldata) <- coldata$sample_id
    coldata$condition <- factor(coldata$condition)
    coldata$condition <- relevel(coldata$condition, ref="Control_MAD")
    
    # Load counts (subset to Group 2 samples)
    counts <- read.csv("{DIRS['deseq2']}/counts_matrix.csv", row.names=1)
    counts <- counts[, coldata$sample_id]
    counts <- round(counts)
    
    # Create DESeq2 object
    dds <- DESeqDataSetFromMatrix(
        countData = counts,
        colData = coldata,
        design = ~ condition
    )
    
    # Filter low counts
    keep <- rowSums(counts(dds) >= 10) >= 3
    dds <- dds[keep,]
    
    # Run DESeq2
    dds <- DESeq(dds)
    
    # Save normalized counts
    norm_counts <- as.data.frame(counts(dds, normalized=TRUE))
    write.csv(norm_counts, "{DIRS['deseq2']}/group2_normalized_counts.csv")
    
    # Extract results for each comparison
    conditions <- levels(coldata$condition)
    conditions <- conditions[conditions != "Control_MAD"]
    
    for (cond in conditions) {{
        res <- results(dds, contrast=c("condition", cond, "Control_MAD"))
        res <- as.data.frame(res)
        res$gene_id <- rownames(res)
        write.csv(res, paste0("{DIRS['deseq2']}/group2_", cond, "_vs_Control.csv"), row.names=FALSE)
        
        sig <- sum(res$padj < 0.05 & abs(res$log2FoldChange) > 1, na.rm=TRUE)
        cat(paste0(cond, " vs Control: ", sig, " DE genes\n"))
    }}
    ''')
    
    print("\nGroup 2 analysis complete!")
else:
    print("Skipping DESeq2 Group 2 (set RUN_DESEQ2_GROUP2 = True to run)")

In [None]:
# === DESeq2 Analysis - Group 3 (Circadian) ===
#
# Time-series analysis using DESeq2 LRT (Likelihood Ratio Test)
# Tests for genes with significant changes across timepoints.
#
# Timepoints:
# - T1 (Light), T2 (Dark), T3 (Light), T4 (Dark)

RUN_DESEQ2_GROUP3 = False  # Set to True to run

# Group 3 workflow: write the group's metadata to CSV, then run an R script
# through rpy2 that fits a likelihood-ratio test across the four timepoints
# and additionally extracts three consecutive pairwise Wald contrasts.
# NOTE(review): `ro` is only bound when the rpy2 import in the setup cell
# succeeded; this cell does not check RPY2_AVAILABLE itself.
if RUN_DESEQ2_GROUP3:
    print("Running DESeq2 for Group 3 (Circadian - LRT)...\n")
    
    # Filter metadata for Group 3
    group3_meta = metadata[metadata['group'] == 3].copy()
    
    # Add timepoint numeric for trend analysis
    # Conditions not listed in the map become NaN. The numeric column is saved
    # with the metadata, but the R code below models `condition` only.
    timepoint_map = {'T1_Light': 1, 'T2_Dark': 2, 'T3_Light': 3, 'T4_Dark': 4}
    group3_meta['timepoint'] = group3_meta['condition'].map(timepoint_map)
    
    # Save metadata for R
    group3_meta_file = DIRS['deseq2'] / 'group3_metadata.csv'
    group3_meta.to_csv(group3_meta_file, index=False)
    
    # Run DESeq2 LRT in R
    # The script below is an f-string; braces interpolate Python file paths.
    # Steps performed by the R code:
    #   1. read the metadata; order the condition levels T1 -> T4
    #   2. read counts_matrix.csv, keep this group's samples, round to integers
    #   3. keep genes with a count >= 10 in at least 3 samples
    #   4. run DESeq() as an LRT of ~ condition against the reduced model ~ 1,
    #      save normalized counts and the LRT table (with a gene_id column)
    #   5. re-run DESeq() with the Wald test and write T2-vs-T1, T3-vs-T2 and
    #      T4-vs-T3 tables; these three files keep the gene IDs as R row names
    #      (first, unnamed CSV column) rather than a gene_id column
    ro.r(f'''
    library(DESeq2)
    
    # Load metadata
    coldata <- read.csv("{group3_meta_file}")
    rownames(coldata) <- coldata$sample_id
    coldata$condition <- factor(coldata$condition, 
                                 levels=c("T1_Light", "T2_Dark", "T3_Light", "T4_Dark"))
    
    # Load counts
    counts <- read.csv("{DIRS['deseq2']}/counts_matrix.csv", row.names=1)
    counts <- counts[, coldata$sample_id]
    counts <- round(counts)
    
    # Create DESeq2 object
    dds <- DESeqDataSetFromMatrix(
        countData = counts,
        colData = coldata,
        design = ~ condition
    )
    
    # Filter low counts
    keep <- rowSums(counts(dds) >= 10) >= 3
    dds <- dds[keep,]
    
    # Run DESeq2 with LRT (tests for ANY difference across timepoints)
    dds <- DESeq(dds, test="LRT", reduced=~1)
    
    # Save normalized counts
    norm_counts <- as.data.frame(counts(dds, normalized=TRUE))
    write.csv(norm_counts, "{DIRS['deseq2']}/group3_normalized_counts.csv")
    
    # LRT results (genes changing over time)
    res_lrt <- results(dds)
    res_lrt <- as.data.frame(res_lrt)
    res_lrt$gene_id <- rownames(res_lrt)
    write.csv(res_lrt, "{DIRS['deseq2']}/group3_circadian_LRT.csv", row.names=FALSE)
    
    sig <- sum(res_lrt$padj < 0.05, na.rm=TRUE)
    cat(paste0("Genes with significant circadian variation: ", sig, "\n"))
    
    # Also do pairwise comparisons
    # T2 (Dark) vs T1 (Light)
    dds_wald <- DESeq(dds, test="Wald")
    
    res_t2t1 <- results(dds_wald, contrast=c("condition", "T2_Dark", "T1_Light"))
    write.csv(as.data.frame(res_t2t1), "{DIRS['deseq2']}/group3_T2_Dark_vs_T1_Light.csv")
    
    res_t3t2 <- results(dds_wald, contrast=c("condition", "T3_Light", "T2_Dark"))
    write.csv(as.data.frame(res_t3t2), "{DIRS['deseq2']}/group3_T3_Light_vs_T2_Dark.csv")
    
    res_t4t3 <- results(dds_wald, contrast=c("condition", "T4_Dark", "T3_Light"))
    write.csv(as.data.frame(res_t4t3), "{DIRS['deseq2']}/group3_T4_Dark_vs_T3_Light.csv")
    ''')
    
    print("\nGroup 3 analysis complete!")
else:
    print("Skipping DESeq2 Group 3 (set RUN_DESEQ2_GROUP3 = True to run)")

---
## Phase 4: Visualization

### 4.1 QC Plots (PCA, Sample Correlation)

In [None]:
# === PCA Plot (Publication Quality) ===

PLOT_PCA = False  # Set to True after counts are loaded

if PLOT_PCA:
    # NOTE(review): counts_matrix.csv is written from `txi$counts` by the
    # tximport cell, so despite the variable name these values are not
    # library-size normalized - confirm this is the intended PCA input.
    norm_counts = pd.read_csv(DIRS['deseq2'] / 'counts_matrix.csv', index_col=0)
    log_counts = np.log2(norm_counts + 1)
    
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    
    # Samples become rows (genes become features) before scaling and projection.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(log_counts.T)
    
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(scaled_data)
    
    pca_df = pd.DataFrame({
        'PC1': pca_result[:, 0],
        'PC2': pca_result[:, 1],
        'sample_id': norm_counts.columns
    })
    # Inner merge: samples without a metadata row are dropped from the plot.
    pca_df = pca_df.merge(metadata, on='sample_id')
    
    fig = px.scatter(
        pca_df, x='PC1', y='PC2',
        color='condition', symbol='group',
        hover_data=['sample_id'],
        color_discrete_sequence=COLORS_CATEGORICAL,
        labels={
            'PC1': f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)',
            'PC2': f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)'
        }
    )
    
    fig.update_layout(**PUBLICATION_LAYOUT)
    fig.update_layout(title=dict(text='<b>Principal Component Analysis</b>'), height=550, width=650)
    fig.update_traces(marker=dict(size=10, line=dict(width=1, color='white')))
    fig.show()
    
    output_path = DIRS['figures'] / 'qc_plots' / 'pca_all_samples'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(f"{output_path}.html")
    # Static export is best-effort (it needs an image export backend), but the
    # reason for a failure is reported rather than silently discarded.
    try:
        fig.write_image(f"{output_path}.pdf", format='pdf')
        fig.write_image(f"{output_path}.png", format='png', scale=3)
    except Exception as e:
        print(f"[WARNING] Static image export skipped: {e}")
else:
    print("Skipping PCA (set PLOT_PCA = True)")

In [None]:
# === Sample Correlation Heatmap (Publication Quality) ===

PLOT_CORRELATION = False  # Set to True after counts are loaded

if PLOT_CORRELATION:
    # Load the gene-level count matrix written by the tximport cell.
    # NOTE(review): this file holds `txi$counts`, i.e. counts that are not
    # library-size normalized, despite the variable name.
    norm_counts = pd.read_csv(DIRS['deseq2'] / 'counts_matrix.csv', index_col=0)
    
    # Sample-by-sample Pearson correlation of log2(count + 1)
    log_counts = np.log2(norm_counts + 1)
    corr_matrix = log_counts.corr(method='pearson')
    
    # Order samples by group, then condition, then sample ID
    sample_order = metadata.sort_values(['group', 'condition', 'sample_id'])['sample_id'].tolist()
    corr_matrix = corr_matrix.loc[sample_order, sample_order]
    
    # Create heatmap
    fig = px.imshow(
        corr_matrix,
        labels=dict(color="Pearson r"),
        x=corr_matrix.columns,
        y=corr_matrix.index,
        color_continuous_scale='RdBu_r',
        zmin=0.85, zmax=1.0,
        aspect='equal'
    )
    
    # Apply publication styling
    fig.update_layout(
        font=dict(family="Arial, Helvetica, sans-serif", size=10, color="black"),
        title=dict(
            text='<b>Sample Correlation Matrix</b>',
            font=dict(size=14),
            x=0.5, xanchor='center'
        ),
        height=700, 
        width=800,
        paper_bgcolor='white',
        plot_bgcolor='white',
        xaxis=dict(tickfont=dict(size=8), tickangle=45),
        yaxis=dict(tickfont=dict(size=8)),
        coloraxis_colorbar=dict(
            title=dict(text='Pearson r', font=dict(size=11)),
            tickfont=dict(size=10),
            len=0.6
        ),
        margin=dict(l=100, r=20, t=60, b=100)
    )
    
    fig.show()
    
    # Save outputs
    output_path = DIRS['figures'] / 'qc_plots' / 'correlation_heatmap'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(f"{output_path}.html")
    # Static export is best-effort; on failure only the HTML exists, and the
    # cause is reported instead of being swallowed by a bare except.
    try:
        fig.write_image(f"{output_path}.pdf", format='pdf')
        fig.write_image(f"{output_path}.png", format='png', scale=3)
        print(f"Saved: {output_path}.html, .pdf, .png")
    except Exception as e:
        print(f"Saved: {output_path}.html")
        print(f"[WARNING] Static image export skipped: {e}")
else:
    print("Skipping correlation heatmap (set PLOT_CORRELATION = True after loading counts)")


### 4.2 Volcano Plots

In [None]:
# === Volcano Plot Function (Publication Quality) ===

def create_volcano_plot(results_file, title, output_path=None,
                        log2fc_threshold=1, padj_threshold=0.05,
                        save_static=True):
    """
    Create a publication-quality volcano plot from DESeq2 results.

    Parameters
    ----------
    results_file : str or Path
        CSV with at least ``log2FoldChange``, ``padj`` and ``baseMean``
        columns. Gene IDs may be in a ``gene_id`` column or in an unnamed
        first column (R row names).
    title : str
        Plot title; the Up/Down gene counts are appended as a subtitle.
    output_path : str or Path, optional
        Destination of the interactive HTML file. When omitted nothing is
        written to disk.
    log2fc_threshold : float
        Absolute log2 fold-change a gene must exceed to be called Up/Down.
    padj_threshold : float
        Adjusted p-value a gene must be below to be called Up/Down.
    save_static : bool
        When True (and ``output_path`` is given) also try to write PDF and PNG
        versions next to the HTML file. Static export is best-effort: a
        failure is reported as a warning and does not raise.

    Returns
    -------
    plotly.graph_objects.Figure
        The volcano plot.
    """
    # Load results - handle both formats (gene_id as column OR as row index)
    df = pd.read_csv(results_file)

    # If first column is unnamed (row index from R), use it as gene_id
    if 'Unnamed: 0' in df.columns:
        df = df.rename(columns={'Unnamed: 0': 'gene_id'})
    elif 'gene_id' not in df.columns:
        df = pd.read_csv(results_file, index_col=0).reset_index()
        df = df.rename(columns={'index': 'gene_id'})

    # Work on an independent copy so adding columns below is unambiguous.
    df = df.dropna(subset=['log2FoldChange', 'padj']).copy()
    # Clip so that padj == 0 does not become an infinite y value.
    df['neg_log10_padj'] = -np.log10(df['padj'].clip(lower=1e-300))

    significant = df['padj'] < padj_threshold
    conditions = [
        significant & (df['log2FoldChange'] > log2fc_threshold),
        significant & (df['log2FoldChange'] < -log2fc_threshold),
    ]
    df['regulation'] = np.select(conditions, ['Up', 'Down'], default='NS')

    n_up = (df['regulation'] == 'Up').sum()
    n_down = (df['regulation'] == 'Down').sum()

    fig = px.scatter(
        df, x='log2FoldChange', y='neg_log10_padj',
        color='regulation',
        color_discrete_map=COLORS_UPDOWN,
        hover_data=['gene_id', 'baseMean', 'padj'] if 'gene_id' in df.columns else ['baseMean', 'padj'],
        labels={'log2FoldChange': 'Log₂ Fold Change', 'neg_log10_padj': '-Log₁₀(adjusted P-value)'}
    )

    fig.update_layout(**PUBLICATION_LAYOUT)
    fig.update_layout(
        title=dict(text=f"<b>{title}</b><br><sup>Up: {n_up} | Down: {n_down}</sup>"),
        height=500, width=550,
        legend=dict(
            title='',
            orientation='h',
            yanchor='top',
            y=0.99,  # Inside the plot area
            xanchor='right',
            x=0.99,
            bgcolor='rgba(255,255,255,0.8)',
            bordercolor='black',
            borderwidth=1
        )
    )

    # Dotted guides at the significance and fold-change thresholds.
    fig.add_hline(y=-np.log10(padj_threshold), line_dash="dot", line_color="#666666", line_width=1)
    fig.add_vline(x=log2fc_threshold, line_dash="dot", line_color="#666666", line_width=1)
    fig.add_vline(x=-log2fc_threshold, line_dash="dot", line_color="#666666", line_width=1)
    fig.update_traces(marker=dict(size=4, opacity=0.7, line=dict(width=0)))

    if output_path:
        from pathlib import Path

        html_path = str(output_path)
        Path(html_path).parent.mkdir(parents=True, exist_ok=True)
        fig.write_html(html_path)

        if save_static:
            # Derive the sibling file names from the trailing ".html" only.
            # Comparison names contain dots (e.g. "0.75pct"), so neither a
            # global str.replace nor Path.with_suffix is safe here; and when
            # there is no ".html" suffix the HTML file must not be overwritten.
            base = html_path[:-len('.html')] if html_path.endswith('.html') else html_path
            try:
                fig.write_image(f"{base}.pdf", format='pdf')
                fig.write_image(f"{base}.png", format='png', scale=3)
            except Exception as e:
                print(f"[WARNING] Static image export skipped for {base}: {e}")

    return fig

print("Volcano plot function defined.")

In [None]:
# === Generate Volcano Plots for All Comparisons ===

PLOT_VOLCANOS = False  # Set to True after DESeq2 is complete

if not PLOT_VOLCANOS:
    print("Skipping volcano plots (set PLOT_VOLCANOS = True after DESeq2 is complete)")
else:
    # Pairwise comparison tables first, then the circadian LRT table(s).
    results_files = [
        *DIRS['deseq2'].glob('group*_*_vs_*.csv'),
        *DIRS['deseq2'].glob('group*_circadian_LRT.csv'),
    ]

    print(f"Found {len(results_files)} results files\n")

    for results_file in results_files:
        # The file stem doubles as output name and (prettified) plot title.
        name = results_file.stem
        output_path = DIRS['figures'] / 'volcano_plots' / f"{name}.html"
        title = name.replace('_', ' ').replace('vs', 'vs.')

        fig = create_volcano_plot(results_file, title=title, output_path=output_path)

        # Only the first plot is rendered inline, as an example.
        if results_file == results_files[0]:
            fig.show()

    print(f"\nAll volcano plots saved to: {DIRS['figures']}/volcano_plots/")

In [None]:
# === Combined Volcano Plot Report ===
#
# Creates a single HTML file with all volcano plots in a 3-column grid

GENERATE_COMBINED_REPORT = True

if not GENERATE_COMBINED_REPORT:
    print("Skipping combined report (set GENERATE_COMBINED_REPORT = True)")
else:
    from pathlib import Path
    import plotly.graph_objects as go

    # Pairwise tables first, LRT tables last; each set in sorted order.
    results_files = sorted(DIRS['deseq2'].glob('*_vs_*.csv')) + sorted(DIRS['deseq2'].glob('*_LRT.csv'))
    print(f"Found {len(results_files)} results files")

    if not results_files:
        print("No results files found! Run DESeq2 analysis first.")
    else:
        # Bucket the tables by the experimental-group prefix in the file name.
        group1_files = [f for f in results_files if 'group1_' in f.name]
        group2_files = [f for f in results_files if 'group2_' in f.name]
        group3_files = [f for f in results_files if 'group3_' in f.name]

        # Each plot is written to its own HTML file that the report embeds.
        temp_dir = DIRS['figures'] / 'volcano_plots_temp'
        temp_dir.mkdir(parents=True, exist_ok=True)

        plot_info = []
        grouped_files = [
            ("Group 1: Nutrient Conditions", group1_files),
            ("Group 2: Environmental Conditions", group2_files),
            ("Group 3: Circadian Rhythm", group3_files),
        ]

        for group_name, files in grouped_files:
            for f in files:
                # Human-readable comparison label derived from the file stem.
                comparison = f.stem.replace('group1_', '').replace('group2_', '').replace('group3_', '')
                comparison = comparison.replace('_', ' ').replace(' vs ', ' vs. ')

                fig = create_volcano_plot(f, comparison)
                fig.update_layout(width=400, height=380, margin=dict(l=50, r=20, t=60, b=40))

                # Files are numbered by their running position across all groups.
                plot_filename = f"volcano_{len(plot_info)}.html"
                plot_path = temp_dir / plot_filename
                fig.write_html(plot_path, include_plotlyjs='cdn')

                plot_info.append({
                    'group': group_name,
                    'comparison': comparison,
                    'filename': plot_filename
                })

        # Page header; the per-group sections are appended after it.
        html_parts = ["""<!DOCTYPE html>
<html>
<head>
    <title>PCC 11901 Differential Expression - Volcano Plots</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }
        h1 { color: #333; text-align: center; margin-bottom: 10px; }
        h2 { color: #0072B2; border-bottom: 2px solid #0072B2; padding-bottom: 5px; margin-top: 30px; }
        .summary { background: #e8f4f8; padding: 15px; border-radius: 5px; margin-bottom: 20px; text-align: center; }
        .grid-container { display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; }
        .plot-container { background: white; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); overflow: hidden; }
        .plot-container iframe { width: 100%; height: 400px; border: none; }
        .group-section { margin-bottom: 30px; }
        @media (max-width: 1400px) { .grid-container { grid-template-columns: repeat(2, 1fr); } }
        @media (max-width: 900px) { .grid-container { grid-template-columns: 1fr; } }
    </style>
</head>
<body>
    <h1>PCC 11901 RNA-seq: Differential Expression Analysis</h1>
    <div class="summary">
        <p><strong>Organism:</strong> <em>Picosynechococcus sp.</em> PCC 11901 | 
        <strong>Analysis:</strong> DESeq2 | 
        <strong>Thresholds:</strong> |log2FC| > 1, adj. p < 0.05</p>
    </div>
"""]

        # plot_info is already ordered group by group, so collecting entries
        # per group keeps both the group order and the plot order.
        sections = {}
        for info in plot_info:
            sections.setdefault(info['group'], []).append(info)

        for section_title, entries in sections.items():
            html_parts.append(f"<div class=\"group-section\"><h2>{section_title}</h2>")
            html_parts.append('<div class="grid-container">')
            for entry in entries:
                html_parts.append(f"<div class=\"plot-container\"><iframe src=\"volcano_plots_temp/{entry['filename']}\"></iframe></div>")
            html_parts.append('</div></div>')

        html_parts.append("</body></html>")

        # The report sits in the figures directory, so the relative iframe
        # paths resolve to the temp directory created above.
        report_path = DIRS['figures'] / 'combined_volcano_report.html'
        with open(report_path, 'w') as f:
            f.write('\n'.join(html_parts))

        print(f"\nCombined report saved: {report_path}")
        print(f"Individual plots saved in: {temp_dir}")
        print(f"Total plots: {len(plot_info)}")
        print(f"Layout: 3-column responsive grid with iframes")


## Phase 5: CyanoCyc Pathway Enrichment Analysis

Pathway and GO term enrichment using the CyanoCyc database for *Picosynechococcus sp.* PCC 11901.

- **241 pathways** from CyanoCyc
- **559 genes** with pathway annotations (16.5% coverage)
- Analysis using `gseapy` with Fisher's exact test

In [36]:
# === Load CyanoCyc Pathway Database ===

# Both CyanoCyc exports are tab-separated text files in the CyanoCycDB folder.
pathways_file = BASE_DIR / 'CyanoCycDB' / 'All-pathways-of-Synechococcus-sp.-PCC-11901.txt'
genes_file = BASE_DIR / 'CyanoCycDB' / 'All-genes-of-Synechococcus-sp.-PCC-11901.txt'

pathways_df = pd.read_csv(pathways_file, sep='\t')
genes_df = pd.read_csv(genes_file, sep='\t')  # used for GO terms

print(f"Loaded {len(pathways_df)} pathways")
print(f"Loaded {len(genes_df)} genes")

# === Create TERM2GENE mapping (pathway_id -> gene_id) ===
# One record per (pathway, gene) pair; a pathway's genes are stored in a
# single cell, separated by ' // '.
term2gene_rows = []
for pathway_id, genes_str in zip(pathways_df['Pathways'], pathways_df['Genes of pathway']):
    if pd.isna(genes_str):
        continue
    members = (piece.strip() for piece in str(genes_str).split(' // '))
    term2gene_rows.extend({'term': pathway_id, 'gene': member} for member in members if member)

PATHWAY_TERM2GENE = pd.DataFrame(term2gene_rows)
print(f"TERM2GENE: {len(PATHWAY_TERM2GENE)} pathway-gene mappings")

# === Create TERM2NAME mapping (pathway_id -> pathway_name) ===
def clean_html(text):
    """Return a plain-text pathway name from a raw ``Names`` cell.

    Processing steps:
      1. Missing values (anything ``pd.isna`` accepts) become ``""``.
      2. Only the first name is kept when several are joined by ``' // '``.
      3. HTML tags such as ``<i>...</i>`` are removed.
      4. HTML character entities (``&beta;``, ``&amp;`` ...) are decoded, so
         later fixed-width truncation of labels cannot cut an entity in half.

    Parameters
    ----------
    text : raw cell value (string or missing value)

    Returns
    -------
    str : cleaned name with surrounding whitespace stripped
    """
    import html  # function-scope import: only this helper needs it

    if pd.isna(text):
        return ""
    # Take first name if multiple (separated by //)
    first_name = str(text).split(' // ')[0]
    # Strip tags before decoding entities, so an escaped "&lt;x&gt;" is kept
    # as literal text instead of being mistaken for a tag.
    without_tags = re.sub(r'<[^>]+>', '', first_name)
    return html.unescape(without_tags).strip()

PATHWAY_TERM2NAME = pathways_df[['Pathways', 'Names']].copy()
PATHWAY_TERM2NAME.columns = ['term', 'name']
PATHWAY_TERM2NAME['name'] = PATHWAY_TERM2NAME['name'].apply(clean_html)
# Drop pathways whose cleaned name is empty
PATHWAY_TERM2NAME = PATHWAY_TERM2NAME[PATHWAY_TERM2NAME['name'] != '']
print(f"TERM2NAME: {len(PATHWAY_TERM2NAME)} pathway names")

# === Create pathway category mapping ===
# Keep only the first ontology entry; pathways without one fall under 'Other'.
PATHWAY_CATEGORIES = pathways_df[['Pathways', 'Ontology - pathway type']].copy()
PATHWAY_CATEGORIES.columns = ['term', 'category']
PATHWAY_CATEGORIES['category'] = PATHWAY_CATEGORIES['category'].apply(
    lambda x: str(x).split(' // ')[0] if pd.notna(x) else 'Other'
)

# === Create GO TERM2GENE mappings ===
def _collect_go_rows(genes_table, go_column):
    """Return one {'term', 'gene'} dict per GO term listed in ``go_column``.

    Cells hold GO terms joined by ' // '; missing cells and blank entries are
    skipped. The gene identifier is taken from the 'Gene Name' column.
    """
    rows = []
    for gene, go_cell in zip(genes_table['Gene Name'], genes_table[go_column]):
        if pd.isna(go_cell):
            continue
        for go_term in str(go_cell).split(' // '):
            go_term = go_term.strip()
            if go_term:
                rows.append({'term': go_term, 'gene': gene})
    return rows


go_bp_rows = _collect_go_rows(genes_df, 'GO terms (biological process)')
go_mf_rows = _collect_go_rows(genes_df, 'GO terms (molecular function)')
go_cc_rows = _collect_go_rows(genes_df, 'GO terms (cellular component)')

# Explicit columns keep the term/gene schema even if an ontology has no rows
GO_BP_TERM2GENE = pd.DataFrame(go_bp_rows, columns=['term', 'gene'])
GO_MF_TERM2GENE = pd.DataFrame(go_mf_rows, columns=['term', 'gene'])
GO_CC_TERM2GENE = pd.DataFrame(go_cc_rows, columns=['term', 'gene'])

print(f"GO BP: {len(GO_BP_TERM2GENE)} mappings")
print(f"GO MF: {len(GO_MF_TERM2GENE)} mappings")
print(f"GO CC: {len(GO_CC_TERM2GENE)} mappings")

# === Background gene set (all genes in DESeq2 results) ===
# The gene IDs of one DESeq2 result table are used as the enrichment universe.
sample_deseq = pd.read_csv(DIRS['deseq2'] / 'group1_Ammonia_vs_Control.csv')
BACKGROUND_GENES = set(sample_deseq['gene_id'].dropna())
print(f"Background gene set: {len(BACKGROUND_GENES)} genes")

Loaded 241 pathways
Loaded 3379 genes
TERM2GENE: 1206 pathway-gene mappings
TERM2NAME: 241 pathway names
GO BP: 765 mappings
GO MF: 1036 mappings
GO CC: 266 mappings
Background gene set: 2802 genes


In [37]:
# === Pathway Enrichment Functions using gseapy ===

def run_pathway_enrichment(
    gene_list: list,
    term2gene: pd.DataFrame,
    term2name: pd.DataFrame = None,
    background: set = None,
    pvalue_cutoff: float = 0.05,
    min_genes: int = 3
) -> pd.DataFrame:
    """
    Over-representation analysis of a gene list against custom gene sets,
    delegated to ``gp.enrich``.

    Parameters:
    -----------
    gene_list : list of gene IDs to test (e.g. DEGs); fewer than 3 genes
        short-circuits to an empty result
    term2gene : DataFrame with 'term' and 'gene' columns defining the gene sets
    term2name : optional DataFrame with 'term' and 'name' columns used to
        label the results
    background : set of background genes (default: every gene in term2gene)
    pvalue_cutoff : keep rows whose 'Adjusted P-value' is <= this value
    min_genes : gene sets with fewer members than this are not tested

    Returns:
    --------
    DataFrame of retained terms sorted by adjusted p-value, with an added
    'Pathway_Name' column (falls back to the term ID when no name is known).
    An empty DataFrame is returned when the input is too small, the
    enrichment call raises, or it yields no rows.
    """
    n_query = len(gene_list)
    if n_query < 3:
        print(f"Warning: Only {n_query} genes provided, skipping enrichment")
        return pd.DataFrame()

    # term -> list of member genes, restricted to sufficiently large sets
    members_by_term = term2gene.groupby('term')['gene'].apply(list).to_dict()
    gene_sets = {
        term: members
        for term, members in members_by_term.items()
        if len(members) >= min_genes
    }

    universe = set(term2gene['gene'].unique()) if background is None else background

    try:
        enr = gp.enrich(
            gene_list=gene_list,
            gene_sets=gene_sets,
            background=list(universe),
            outdir=None,
            verbose=False
        )
        raw = enr.results
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on
        print(f"Enrichment failed: {e}")
        return pd.DataFrame()

    if raw.empty:
        return pd.DataFrame()

    # Keep only terms passing the adjusted p-value threshold
    kept = raw[raw['Adjusted P-value'] <= pvalue_cutoff].copy()

    # Attach readable names; the term ID doubles as the name when none is known
    if term2name is None or kept.empty:
        kept['Pathway_Name'] = kept['Term']
    else:
        lookup = dict(zip(term2name['term'], term2name['name']))
        kept['Pathway_Name'] = kept['Term'].map(lookup)
        kept['Pathway_Name'] = kept['Pathway_Name'].fillna(kept['Term'])

    return kept.sort_values('Adjusted P-value')


def get_degs(deseq_file: Path, padj_cutoff: float = 0.05, lfc_cutoff: float = 1.0) -> dict:
    """
    Extract differentially expressed genes (DEGs) from a DESeq2 results CSV.

    The gene identifier is resolved from, in order:
      1. a 'gene_id' column;
      2. an 'Unnamed: 0' column (CSV written with an unnamed first column);
      3. the index pandas builds when the header has one field fewer than
         the data rows (the first data column becomes the index).

    A gene is a DEG when ``padj < padj_cutoff`` and
    ``|log2FoldChange| > lfc_cutoff``; rows with missing values in either
    column never pass.

    Parameters:
    -----------
    deseq_file : path (or file-like object) readable by ``pd.read_csv``
    padj_cutoff : adjusted p-value threshold (exclusive)
    lfc_cutoff : absolute log2 fold-change threshold (exclusive)

    Returns:
    --------
    dict with gene lists 'up', 'down', 'all' (file order) and their sizes
    'n_up', 'n_down', 'n_total'.

    Raises:
    -------
    KeyError : if no gene identifier can be resolved, or if the 'padj' or
        'log2FoldChange' column is missing.
    """
    df = pd.read_csv(deseq_file)

    # Handle different CSV formats - gene_id might be a column or the index
    if 'gene_id' not in df.columns:
        if 'Unnamed: 0' in df.columns:
            df = df.rename(columns={'Unnamed: 0': 'gene_id'})
        elif not isinstance(df.index, pd.RangeIndex):
            # pandas promoted the first data column to the index. Test the
            # index type rather than dtype == 'object': string-typed indexes
            # do not compare equal to 'object' on every pandas version.
            df = df.rename_axis('gene_id').reset_index()
        else:
            raise KeyError(
                f"No gene identifier found in {deseq_file}: expected a 'gene_id' "
                f"column or an unnamed first column, got {list(df.columns)}"
            )

    missing = [col for col in ('padj', 'log2FoldChange') if col not in df.columns]
    if missing:
        raise KeyError(f"{deseq_file} is missing required DESeq2 column(s): {missing}")

    # Filter significant genes (NaN padj / log2FoldChange compare as False)
    is_significant = (df['padj'] < padj_cutoff) & (df['log2FoldChange'].abs() > lfc_cutoff)
    sig = df[is_significant]

    up_genes = sig.loc[sig['log2FoldChange'] > 0, 'gene_id'].tolist()
    down_genes = sig.loc[sig['log2FoldChange'] < 0, 'gene_id'].tolist()
    all_genes = sig['gene_id'].tolist()

    return {
        'up': up_genes,
        'down': down_genes,
        'all': all_genes,
        'n_up': len(up_genes),
        'n_down': len(down_genes),
        'n_total': len(all_genes)
    }


# Confirm the cell ran and list the helpers it defined
print(
    "Enrichment functions defined.",
    "  - run_pathway_enrichment()",
    "  - get_degs()",
    sep="\n",
)

Enrichment functions defined.
  - run_pathway_enrichment()
  - get_degs()


In [38]:
# === Enrichment Visualization Functions (Plotly) ===

def plot_enrichment_dotplot(
    results: pd.DataFrame,
    title: str = "Pathway Enrichment",
    top_n: int = 20,
) -> go.Figure:
    """
    Draw an enrichment dotplot: gene ratio on x, pathway name on y.

    Dot size encodes the overlap count and dot colour encodes
    -log10(adjusted p-value). Only the first ``top_n`` rows of ``results``
    are plotted. An empty ``results`` yields a figure carrying a
    "No significant pathways" annotation instead.
    """
    if results.empty:
        placeholder = go.Figure()
        placeholder.add_annotation(text="No significant pathways", xref="paper", yref="paper",
                                   x=0.5, y=0.5, showarrow=False, font=dict(size=14))
        placeholder.update_layout(title=title)
        return placeholder

    def _overlap_count(overlap):
        # "5/100" -> 5; missing value -> 0
        if pd.isna(overlap):
            return 0
        return int(str(overlap).split('/')[0])

    def _gene_ratio(overlap):
        # "5/100" -> 0.05; missing or slash-less value -> 0
        if pd.isna(overlap) or '/' not in str(overlap):
            return 0
        parts = str(overlap).split('/')
        return int(parts[0]) / int(parts[1])

    def _shorten(name):
        # Cap long pathway names at 50 characters plus an ellipsis
        if len(str(name)) > 50:
            return name[:50] + '...'
        return name

    top = results.head(top_n).copy()
    # Floor the p-values so -log10 stays finite
    top['-log10(padj)'] = -np.log10(top['Adjusted P-value'].clip(lower=1e-50))

    if 'Overlap' in top.columns:
        top['Overlap_Count'] = top['Overlap'].apply(_overlap_count)
        top['Gene_Ratio'] = top['Overlap'].apply(_gene_ratio)
    else:
        # No overlap information: use constant placeholders
        top['Overlap_Count'] = 1
        top['Gene_Ratio'] = 0.1

    top['Pathway_Short'] = top['Pathway_Name'].apply(_shorten)

    hover_spec = {
        'Pathway_Name': True,
        'Adjusted P-value': ':.2e',
        'Pathway_Short': False,
        '-log10(padj)': False
    }
    fig = px.scatter(
        top,
        x='Gene_Ratio',
        y='Pathway_Short',
        size='Overlap_Count',
        color='-log10(padj)',
        color_continuous_scale='Reds',
        hover_data=hover_spec
    )

    # Shared publication styling first, minus the keys set explicitly below
    overridden = {'title', 'xaxis', 'yaxis', 'height', 'width'}
    fig.update_layout(**{
        key: value for key, value in PUBLICATION_LAYOUT.items() if key not in overridden
    })

    # Axis frame settings common to both axes of this plot
    axis_frame = dict(
        showline=True, linewidth=1.5, linecolor='black',
        showgrid=False, zeroline=False,
        ticks='outside', tickwidth=1.5
    )
    fig.update_layout(
        title=dict(text=title, font=dict(size=14), x=0.5, xanchor='center'),
        xaxis=dict(title="Gene Ratio", ticklen=5, **axis_frame),
        yaxis=dict(title="", categoryorder='total ascending', **axis_frame),
        coloraxis_colorbar_title="-log10(padj)",
        height=max(400, top_n * 25)
    )

    return fig


def plot_enrichment_barplot(
    results: pd.DataFrame,
    title: str = "Pathway Enrichment",
    top_n: int = 15
) -> go.Figure:
    """
    Draw a horizontal bar chart of -log10(adjusted p-value) per pathway.

    Only the first ``top_n`` rows of ``results`` are plotted. An empty
    ``results`` yields a figure carrying a "No significant pathways"
    annotation instead.
    """
    if results.empty:
        placeholder = go.Figure()
        placeholder.add_annotation(text="No significant pathways", xref="paper", yref="paper",
                                   x=0.5, y=0.5, showarrow=False, font=dict(size=14))
        placeholder.update_layout(title=title)
        return placeholder

    def _shorten(name):
        # Cap long pathway names at 45 characters plus an ellipsis
        if len(str(name)) > 45:
            return name[:45] + '...'
        return name

    bars = results.head(top_n).copy()
    # Floor the p-values so -log10 stays finite
    bars['-log10(padj)'] = -np.log10(bars['Adjusted P-value'].clip(lower=1e-50))
    bars['Pathway_Short'] = bars['Pathway_Name'].apply(_shorten)

    # Ascending order: the first category is drawn at the bottom of a
    # horizontal bar chart, so the most significant pathway ends up on top
    bars = bars.sort_values('-log10(padj)', ascending=True)

    hover_text = (
        "<b>%{y}</b><br>"
        "-log10(padj): %{x:.2f}<br>"
        "<extra></extra>"
    )
    bar_trace = go.Bar(
        x=bars['-log10(padj)'],
        y=bars['Pathway_Short'],
        orientation='h',
        marker_color=COLORS_CATEGORICAL[0],
        hovertemplate=hover_text
    )
    fig = go.Figure(bar_trace)

    # Shared publication styling first, minus the keys set explicitly below
    overridden = {'title', 'xaxis', 'yaxis', 'height', 'width'}
    fig.update_layout(**{
        key: value for key, value in PUBLICATION_LAYOUT.items() if key not in overridden
    })

    # Axis frame settings common to both axes of this plot
    axis_frame = dict(
        showline=True, linewidth=1.5, linecolor='black',
        showgrid=False, zeroline=False,
        ticks='outside', tickwidth=1.5
    )
    fig.update_layout(
        title=dict(text=title, font=dict(size=14), x=0.5, xanchor='center'),
        xaxis=dict(title="-log10(adjusted p-value)", ticklen=5, **axis_frame),
        yaxis=dict(title="", **axis_frame),
        height=max(400, top_n * 28)
    )

    return fig


def plot_enrichment_heatmap(
    enrichment_dict: dict,
    top_n_per_comparison: int = 5,
    title: str = "Pathway Enrichment Across Conditions"
) -> go.Figure:
    """
    Create heatmap comparing enrichment across multiple conditions.

    Rows are the union of the first ``top_n_per_comparison`` terms of every
    non-empty result table; columns are all comparisons, including those
    with no results. Each cell holds -log10(adjusted p-value) of the term
    in that comparison, or 0 where the term is absent from that table.

    Parameters:
    -----------
    enrichment_dict : dict mapping comparison_name -> enrichment_results
        DataFrame (needs 'Term' and 'Adjusted P-value'; 'Pathway_Name' is
        used for row labels when present)
    top_n_per_comparison : number of leading rows taken from each table
    title : figure title

    Returns:
    --------
    Plotly figure; a figure carrying a "No significant pathways" annotation
    when every table is empty.
    """
    # Same floor the dotplot/barplot apply, so a p-value of 0 maps to a
    # finite colour value instead of inf
    padj_floor = 1e-50

    # Collect top pathways from each comparison
    all_pathways = set()
    for df in enrichment_dict.values():
        if not df.empty:
            all_pathways.update(df.head(top_n_per_comparison)['Term'].tolist())

    if not all_pathways:
        fig = go.Figure()
        fig.add_annotation(text="No significant pathways", xref="paper", yref="paper",
                          x=0.5, y=0.5, showarrow=False)
        fig.update_layout(title=title)
        return fig

    # Build matrix
    comparisons = list(enrichment_dict.keys())
    pathways = sorted(all_pathways)
    row_of = {pw: i for i, pw in enumerate(pathways)}

    # Get pathway names (first table mentioning a term wins)
    pathway_names = {}
    for df in enrichment_dict.values():
        if df.empty:
            continue
        names = df['Pathway_Name'] if 'Pathway_Name' in df.columns else df['Term']
        for term, name in zip(df['Term'], names):
            pathway_names.setdefault(term, name)

    matrix = np.zeros((len(pathways), len(comparisons)))
    for j, comp in enumerate(comparisons):
        df = enrichment_dict[comp]
        if df.empty:
            continue
        pval_map = dict(zip(df['Term'], df['Adjusted P-value']))
        for pw, i in row_of.items():
            if pw in pval_map:
                matrix[i, j] = -np.log10(max(pval_map[pw], padj_floor))

    # Short names for y-axis
    y_labels = [pathway_names.get(p, p)[:40] for p in pathways]
    x_labels = [c.replace('_vs_Control', '').replace('_', ' ') for c in comparisons]

    fig = go.Figure(data=go.Heatmap(
        z=matrix,
        x=x_labels,
        y=y_labels,
        colorscale='Reds',
        colorbar_title="-log10(padj)",
        hoverongaps=False
    ))

    # Apply base layout (excluding keys we'll override)
    exclude_keys = {'title', 'xaxis', 'yaxis', 'height', 'width'}
    base_layout = {k: v for k, v in PUBLICATION_LAYOUT.items() if k not in exclude_keys}
    fig.update_layout(**base_layout)

    # Apply specific layout for this plot
    fig.update_layout(
        title=dict(text=title, font=dict(size=14), x=0.5, xanchor='center'),
        xaxis=dict(
            title="",
            tickangle=45,
            showline=True, linewidth=1.5, linecolor='black',
            showgrid=False, zeroline=False,
            ticks='outside', tickwidth=1.5
        ),
        yaxis=dict(
            title="",
            showline=True, linewidth=1.5, linecolor='black',
            showgrid=False, zeroline=False,
            ticks='outside', tickwidth=1.5
        ),
        height=max(500, len(pathways) * 22)
    )

    return fig


# Confirm the cell ran and list the plotting helpers it defined
print(
    "Visualization functions defined.",
    "  - plot_enrichment_dotplot()",
    "  - plot_enrichment_barplot()",
    "  - plot_enrichment_heatmap()",
    sep="\n",
)

Visualization functions defined.
  - plot_enrichment_dotplot()
  - plot_enrichment_barplot()
  - plot_enrichment_heatmap()


In [25]:
# === Run Pathway Enrichment for All Comparisons ===

# DESeq2 result files: group title -> [(display label, CSV file name), ...]
deseq_files = {
    'Group 1 - Nutrients': [
        ('Ammonia', 'group1_Ammonia_vs_Control.csv'),
        ('Glycerol', 'group1_Glycerol_0.75pct_vs_Control.csv'),
        ('High Nitrogen', 'group1_High_Nitrogen_vs_Control.csv'),
        ('Low Nitrogen', 'group1_Low_Nitrogen_vs_Control.csv'),
        ('High Phosphate', 'group1_High_Phosphate_vs_Control.csv'),
        ('Low Phosphate', 'group1_Low_Phosphate_vs_Control.csv'),
        ('Urea', 'group1_Urea_vs_Control.csv'),
    ],
    'Group 2 - Environmental': [
        ('Atmospheric CO2', 'group2_Atmospheric_CO2_vs_Control.csv'),
        ('High CO2', 'group2_High_CO2_8pct_vs_Control.csv'),
        ('H2O2', 'group2_H2O2_0.005pct_vs_Control.csv'),
        ('High NaCl', 'group2_High_NaCl_9pct_vs_Control.csv'),
        ('High Temp', 'group2_High_Temp_38C_vs_Control.csv'),
        ('High Light', 'group2_High_Light_vs_Control.csv'),
        ('Low Light', 'group2_Low_Light_15uE_vs_Control.csv'),
    ],
    'Group 3 - Circadian': [
        ('T2 Dark vs T1', 'group3_T2_Dark_vs_T1_Light.csv'),
        ('T3 Light vs T2', 'group3_T3_Light_vs_T2_Dark.csv'),
        ('T4 Dark vs T3', 'group3_T4_Dark_vs_T3_Light.csv'),
    ],
}

# Accumulators: label -> enrichment table, and one DEG-count record per comparison
ALL_ENRICHMENT_RESULTS = {}
DEG_SUMMARY = []

for group_name, comparisons in deseq_files.items():
    print(f"\n{'='*60}")
    print(f"{group_name}")
    print('='*60)

    for label, filename in comparisons:
        filepath = DIRS['deseq2'] / filename
        if not filepath.exists():
            print(f"  {label}: File not found")
            continue

        # DEG lists and counts for this comparison
        degs = get_degs(filepath, padj_cutoff=0.05, lfc_cutoff=1.0)
        DEG_SUMMARY.append(dict(
            Group=group_name,
            Comparison=label,
            Up=degs['n_up'],
            Down=degs['n_down'],
            Total=degs['n_total'],
        ))

        print(f"  {label}: {degs['n_total']} DEGs ({degs['n_up']} up, {degs['n_down']} down)")

        # Comparisons with fewer than 5 DEGs get an empty placeholder table
        if degs['n_total'] < 5:
            ALL_ENRICHMENT_RESULTS[label] = pd.DataFrame()
            print(f"    -> Too few DEGs for enrichment")
            continue

        # Enrichment on the combined (up + down) DEG list
        results = run_pathway_enrichment(
            gene_list=degs['all'],
            term2gene=PATHWAY_TERM2GENE,
            term2name=PATHWAY_TERM2NAME,
            background=BACKGROUND_GENES,
            pvalue_cutoff=0.05
        )
        ALL_ENRICHMENT_RESULTS[label] = results

        n_sig = len(results)
        print(f"    -> {n_sig} significant pathways")

# Summary table of DEG counts across every processed comparison
DEG_SUMMARY_DF = pd.DataFrame(DEG_SUMMARY)
print("\n" + "="*60)
print("DEG Summary")
print("="*60)
print(DEG_SUMMARY_DF.to_string(index=False))


Group 1 - Nutrients
  Ammonia: 80 DEGs (32 up, 48 down)
    -> 3 significant pathways
  Glycerol: 429 DEGs (168 up, 261 down)
    -> 3 significant pathways
  High Nitrogen: 334 DEGs (236 up, 98 down)
    -> 0 significant pathways
  Low Nitrogen: 22 DEGs (7 up, 15 down)
    -> 1 significant pathways
  High Phosphate: 85 DEGs (46 up, 39 down)
    -> 2 significant pathways
  Low Phosphate: 28 DEGs (2 up, 26 down)
    -> 0 significant pathways
  Urea: 127 DEGs (37 up, 90 down)
    -> 4 significant pathways

Group 2 - Environmental
  Atmospheric CO2: 207 DEGs (76 up, 131 down)
    -> 0 significant pathways
  High CO2: 334 DEGs (154 up, 180 down)
    -> 1 significant pathways
  H2O2: 34 DEGs (29 up, 5 down)
    -> 0 significant pathways
  High NaCl: 1372 DEGs (714 up, 658 down)
    -> 5 significant pathways
  High Temp: 347 DEGs (128 up, 219 down)
    -> 1 significant pathways
  High Light: 306 DEGs (120 up, 186 down)
    -> 1 significant pathways
  Low Light: 763 DEGs (334 up, 429 down)
  

In [39]:
# === Visualize Enrichment Results ===

# Select a comparison to visualize
COMPARISON = 'High NaCl'  # <-- Change this to view different comparisons

if COMPARISON in ALL_ENRICHMENT_RESULTS:
    results = ALL_ENRICHMENT_RESULTS[COMPARISON]

    if not results.empty:
        # Labels that already name their reference (e.g. 'T2 Dark vs T1')
        # must not get a second " vs Control" appended to the title.
        dot_title = f"Pathway Enrichment: {COMPARISON}"
        if ' vs ' not in COMPARISON:
            dot_title += " vs Control"

        # Dotplot
        fig_dot = plot_enrichment_dotplot(
            results,
            title=dot_title,
            top_n=20
        )
        fig_dot.show()

        # Barplot
        fig_bar = plot_enrichment_barplot(
            results,
            title=f"Top Enriched Pathways: {COMPARISON}",
            top_n=15
        )
        fig_bar.show()

        # Show results table (select available columns)
        display_cols = ['Term', 'Pathway_Name', 'Overlap', 'Adjusted P-value']
        if 'Genes' in results.columns:
            display_cols.append('Genes')
        display(results[display_cols].head(20))
    else:
        print(f"No significant pathways for {COMPARISON}")
else:
    # Unknown label: list the valid choices
    print(f"Comparison '{COMPARISON}' not found")
    print(f"Available: {list(ALL_ENRICHMENT_RESULTS.keys())}")

Unnamed: 0,Term,Pathway_Name,Overlap,Adjusted P-value,Genes
30,PHOTOALL-PWY,oxygenic photosynthesis,48/59,1.5e-05,FEK30_RS08295;FEK30_RS02765;FEK30_RS05405;FEK3...
32,PWY-101,photosynthesis light reactions,38/45,2.5e-05,FEK30_RS08295;FEK30_RS02765;FEK30_RS05405;FEK3...
80,PWY-7731,superpathway of photosynthetic hydrogen produc...,38/48,0.000376,FEK30_RS08295;FEK30_RS02765;FEK30_RS05405;FEK3...
84,PWY-7980,ATP biosynthesis,15/17,0.017829,FEK30_RS15990;FEK30_RS15995;FEK30_RS07780;FEK3...
86,PWY-8270,cyclic electron flow,17/20,0.017829,FEK30_RS08295;FEK30_RS01565;FEK30_RS09330;FEK3...


In [40]:
# === Cross-Condition Pathway Enrichment Heatmap ===

# Group 1 heatmap (Nutrients): keep only the nutrient comparisons,
# in the order they appear in ALL_ENRICHMENT_RESULTS
group1_results = {
    label: table
    for label, table in ALL_ENRICHMENT_RESULTS.items()
    if label in (
        'Ammonia', 'Glycerol', 'High Nitrogen', 'Low Nitrogen',
        'High Phosphate', 'Low Phosphate', 'Urea',
    )
}

fig_g1 = plot_enrichment_heatmap(
    group1_results,
    title="Pathway Enrichment: Nutrient Conditions",
    top_n_per_comparison=5,
)
fig_g1.show()

# Group 2 heatmap (Environmental): same selection for the environmental set
group2_results = {
    label: table
    for label, table in ALL_ENRICHMENT_RESULTS.items()
    if label in (
        'Atmospheric CO2', 'High CO2', 'H2O2', 'High NaCl',
        'High Temp', 'High Light', 'Low Light',
    )
}

fig_g2 = plot_enrichment_heatmap(
    group2_results,
    title="Pathway Enrichment: Environmental Conditions",
    top_n_per_comparison=5,
)
fig_g2.show()

In [28]:
# === Save Enrichment Results ===

# Output directory (created on demand)
output_dir = DIRS['functional'] / 'cyanocyc_enrichment'
output_dir.mkdir(parents=True, exist_ok=True)

# One CSV per comparison that produced at least one enriched term
for name, results in ALL_ENRICHMENT_RESULTS.items():
    if results.empty:
        continue
    # Make the label file-system friendly
    safe_name = name.replace(' ', '_').replace('/', '_')
    results.to_csv(output_dir / f'{safe_name}_enrichment.csv', index=False)

# DEG counts per comparison
DEG_SUMMARY_DF.to_csv(output_dir / 'deg_summary.csv', index=False)

# Combined table: every non-empty result tagged with its comparison label
all_results = [
    table.assign(Comparison=label)
    for label, table in ALL_ENRICHMENT_RESULTS.items()
    if not table.empty
]

if all_results:
    combined = pd.concat(all_results, ignore_index=True)
    combined.to_csv(output_dir / 'all_enrichment_results.csv', index=False)
    print(f"Saved {len(combined)} total enrichment results")

# Mapping files, saved alongside the results for reproducibility
PATHWAY_TERM2GENE.to_csv(output_dir / 'cyanocyc_term2gene.csv', index=False)
PATHWAY_TERM2NAME.to_csv(output_dir / 'cyanocyc_term2name.csv', index=False)

print(f"\nResults saved to: {output_dir}")
print("Files:")
# Lists every CSV present in the directory, not only those written just now
for csv_path in sorted(output_dir.glob('*.csv')):
    print(f"  - {csv_path.name}")

Saved 40 total enrichment results

Results saved to: /Users/felix/Library/CloudStorage/OneDrive-SharedLibraries-MacquarieUniversity/Australian Genome Foundry - AWS cloud infrastructure/11_Esther_Cyano_transcriptomics/08_functional/cyanocyc_enrichment
Files:
  - Ammonia_enrichment.csv
  - Glycerol_enrichment.csv
  - High_CO2_enrichment.csv
  - High_Light_enrichment.csv
  - High_NaCl_enrichment.csv
  - High_Phosphate_enrichment.csv
  - High_Temp_enrichment.csv
  - Low_Light_enrichment.csv
  - Low_Nitrogen_enrichment.csv
  - T2_Dark_vs_T1_enrichment.csv
  - T3_Light_vs_T2_enrichment.csv
  - T4_Dark_vs_T3_enrichment.csv
  - Urea_enrichment.csv
  - all_enrichment_results.csv
  - cyanocyc_term2gene.csv
  - cyanocyc_term2name.csv
  - deg_summary.csv


In [41]:
# === GO Term Enrichment (Optional) ===

RUN_GO_ENRICHMENT = True  # Set to True to run GO enrichment

if not RUN_GO_ENRICHMENT:
    print("GO enrichment skipped (set RUN_GO_ENRICHMENT = True to enable)")
else:
    # Example: GO biological-process enrichment for the High NaCl comparison
    COMPARISON = 'High NaCl'

    filepath = DIRS['deseq2'] / 'group2_High_NaCl_9pct_vs_Control.csv'
    degs = get_degs(filepath, padj_cutoff=0.05, lfc_cutoff=1.0)

    # No term2name table is passed, so terms are labelled by their GO IDs
    go_bp_results = run_pathway_enrichment(
        gene_list=degs['all'],
        term2gene=GO_BP_TERM2GENE,
        background=BACKGROUND_GENES,
        pvalue_cutoff=0.05
    )

    if go_bp_results.empty:
        print("No significant GO terms")
    else:
        fig = plot_enrichment_barplot(
            go_bp_results,
            title=f"GO Biological Process: {COMPARISON}",
            top_n=15
        )
        fig.show()
        display(go_bp_results.head(15))

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Odds Ratio,Combined Score,Genes,Pathway_Name
19,gs_ind_0,GO:0006412,51/55,1.238784e-12,9.166999e-11,12.719469,348.728297,FEK30_RS06525;FEK30_RS09095;FEK30_RS09050;FEK3...,GO:0006412


In [42]:
# === Generate Comprehensive HTML Reports per Group ===

import plotly.io as pio
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def create_pca_figure(counts_file: Path, metadata_df: pd.DataFrame, group_filter: str = None):
    """Build a PC1-vs-PC2 scatter of samples from a normalized count table.

    The CSV at ``counts_file`` is read with its first column as the index
    and each remaining column treated as one sample. Counts are
    log2(x + 1) transformed and every gene is standardized across samples
    before a 2-component PCA. When ``group_filter`` is given, only samples
    whose metadata 'group' equals it are kept. Points are coloured by the
    metadata 'condition' column and axis labels carry the explained variance.
    """
    log_counts = np.log2(pd.read_csv(counts_file, index_col=0) + 1)

    # Restrict to the requested group, preserving the table's column order
    if group_filter:
        in_group = metadata_df['group'] == group_filter
        group_samples = metadata_df.loc[in_group, 'sample_id'].tolist()
        kept_columns = [col for col in log_counts.columns if col in group_samples]
        log_counts = log_counts[kept_columns]

    # Samples become rows; each gene is scaled to zero mean / unit variance
    standardized = StandardScaler().fit_transform(log_counts.T)

    pca = PCA(n_components=2)
    coords = pca.fit_transform(standardized)
    pc1_share, pc2_share = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]

    pca_df = pd.DataFrame({
        'PC1': coords[:, 0],
        'PC2': coords[:, 1],
        'sample_id': log_counts.columns
    })
    # Attach condition and other metadata (samples missing from metadata drop out)
    pca_df = pca_df.merge(metadata_df, on='sample_id')

    axis_titles = {
        'PC1': f'PC1 ({pc1_share*100:.1f}%)',
        'PC2': f'PC2 ({pc2_share*100:.1f}%)'
    }
    fig = px.scatter(
        pca_df, x='PC1', y='PC2',
        color='condition',
        hover_data=['sample_id'],
        color_discrete_sequence=COLORS_CATEGORICAL,
        labels=axis_titles
    )

    fig.update_traces(marker=dict(size=12, line=dict(width=1.5, color='white')))

    # Identical frame styling for both axes
    axis_style = dict(showline=True, linewidth=1.5, linecolor='black', showgrid=False, zeroline=False)
    fig.update_layout(
        font=dict(family="Arial", size=12),
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis=dict(**axis_style),
        yaxis=dict(**axis_style),
        legend=dict(font=dict(size=10)),
        margin=dict(l=60, r=20, t=40, b=60),
        height=400
    )
    return fig


def create_correlation_figure(counts_file: Path, metadata_df: pd.DataFrame, group_filter: str = None):
    """Build a sample-by-sample Pearson correlation heatmap.

    Correlations are computed between the sample columns of the
    log2(x + 1) transformed count table read from ``counts_file``. When
    ``group_filter`` is given, only samples whose metadata 'group' equals
    it are used. Rows and columns are ordered by metadata 'condition',
    then 'sample_id'. The colour scale is fixed to the range 0.85-1.0.
    """
    counts = pd.read_csv(counts_file, index_col=0)

    # Restrict to the requested group, preserving the table's column order
    if group_filter:
        in_group = metadata_df['group'] == group_filter
        group_samples = metadata_df.loc[in_group, 'sample_id'].tolist()
        counts = counts[[col for col in counts.columns if col in group_samples]]

    corr_matrix = np.log2(counts + 1).corr(method='pearson')

    # Group replicates together: sort samples by condition, then by ID
    present = metadata_df[metadata_df['sample_id'].isin(corr_matrix.columns)]
    sample_order = present.sort_values(['condition', 'sample_id'])['sample_id'].tolist()
    corr_matrix = corr_matrix.loc[sample_order, sample_order]

    fig = px.imshow(
        corr_matrix,
        labels=dict(color="Pearson r"),
        color_continuous_scale='RdBu_r',
        zmin=0.85, zmax=1.0,
        aspect='equal'
    )

    small_ticks = dict(size=8)
    fig.update_layout(
        font=dict(family="Arial", size=10),
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis=dict(tickfont=small_ticks, tickangle=45),
        yaxis=dict(tickfont=small_ticks),
        coloraxis_colorbar=dict(title='r', len=0.6),
        margin=dict(l=80, r=20, t=40, b=80),
        height=450
    )
    return fig


def create_volcano_figure(
    results_file: Path,
    title: str,
    padj_cutoff: float = 0.05,
    lfc_cutoff: float = 1.0,
):
    """Create volcano plot from DESeq2 results.

    Parameters
    ----------
    results_file : CSV with 'log2FoldChange' and 'padj' columns; the gene
        identifier is taken from 'Unnamed: 0' or 'gene_id', otherwise the
        row index is used as a stand-in
    title : text shown above the plot together with the up/down counts
    padj_cutoff : adjusted p-value threshold (exclusive); also the position
        of the horizontal guide line
    lfc_cutoff : absolute log2 fold-change threshold (exclusive); also the
        position of the two vertical guide lines

    The defaults (0.05 and 1.0) reproduce the thresholds that were previously
    hard-coded and match the defaults of ``get_degs``.

    Returns
    -------
    Plotly figure with genes coloured 'Up', 'Down' or 'NS'.
    """
    df = pd.read_csv(results_file)

    if 'Unnamed: 0' in df.columns:
        df = df.rename(columns={'Unnamed: 0': 'gene_id'})
    elif 'gene_id' not in df.columns:
        df = df.reset_index().rename(columns={'index': 'gene_id'})

    # Genes without a fold change or adjusted p-value cannot be placed on the plot
    df = df.dropna(subset=['log2FoldChange', 'padj'])
    # Floor padj so a p-value of 0 still yields a finite y coordinate
    df['neg_log10_padj'] = -np.log10(df['padj'].clip(lower=1e-300))

    is_significant = df['padj'] < padj_cutoff
    conditions = [
        is_significant & (df['log2FoldChange'] > lfc_cutoff),
        is_significant & (df['log2FoldChange'] < -lfc_cutoff),
    ]
    df['regulation'] = np.select(conditions, ['Up', 'Down'], default='NS')

    n_up = (df['regulation'] == 'Up').sum()
    n_down = (df['regulation'] == 'Down').sum()

    fig = px.scatter(
        df, x='log2FoldChange', y='neg_log10_padj',
        color='regulation',
        color_discrete_map=COLORS_UPDOWN,
        hover_data=['gene_id', 'padj'],
        labels={'log2FoldChange': 'Log₂FC', 'neg_log10_padj': '-Log₁₀(padj)'}
    )

    # Guide lines drawn at the same thresholds used for the classification
    fig.add_hline(y=-np.log10(padj_cutoff), line_dash="dash", line_color="gray", line_width=1)
    fig.add_vline(x=-lfc_cutoff, line_dash="dash", line_color="gray", line_width=1)
    fig.add_vline(x=lfc_cutoff, line_dash="dash", line_color="gray", line_width=1)

    fig.update_traces(marker=dict(size=5, opacity=0.7))
    fig.update_layout(
        font=dict(family="Arial", size=11),
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis=dict(showline=True, linewidth=1.5, linecolor='black', showgrid=False, zeroline=False),
        yaxis=dict(showline=True, linewidth=1.5, linecolor='black', showgrid=False, zeroline=False),
        legend=dict(title='', orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
        margin=dict(l=50, r=20, t=50, b=50),
        height=380,
        annotations=[dict(
            x=0.5, y=1.12, xref='paper', yref='paper',
            text=f'<b>{title}</b><br><span style="font-size:10px">Up: {n_up} | Down: {n_down}</span>',
            showarrow=False, font=dict(size=12)
        )]
    )
    return fig


def _esc(text) -> str:
    """HTML-escape *text* so it can be placed safely inside element content."""
    # Function-scope import keeps this block self-contained: nothing has to be
    # added to the notebook's import cell for it to run.
    from html import escape
    return escape(str(text), quote=False)


def _figure_div(fig, config=None) -> str:
    """Render a Plotly figure as an embeddable <div>; plotly.js itself is loaded once in <head>."""
    return pio.to_html(
        fig,
        full_html=False,
        include_plotlyjs=False,
        config=config if config is not None else {'responsive': True},
    )


def _plot_card(title: str, desc: str, plot_html: str, extra_class: str = '') -> str:
    """Return a report card wrapping an already-rendered plot <div>."""
    css_class = f'card {extra_class}'.strip()
    return f"""
        <div class="{css_class}">
            <div class="card-title">{_esc(title)}</div>
            <div class="card-desc">{_esc(desc)}</div>
            <div class="plot-container">{plot_html}</div>
        </div>
"""


def _message_card(title: str, message: str, extra_class: str = '') -> str:
    """Return a report card showing a placeholder or error message instead of a plot."""
    css_class = f'card {extra_class}'.strip()
    return f"""
        <div class="{css_class}">
            <div class="card-title">{_esc(title)}</div>
            <div class="no-data">{_esc(message)}</div>
        </div>
"""


def generate_group_report(
    group_name: str,
    group_id: str,  # e.g., 'group1', 'group2', 'group3'
    comparisons: list,
    output_path: Path,
    enrichment_results: dict = None,
    metadata_df: pd.DataFrame = None
):
    """Generate a self-contained HTML report for one experimental group.

    The report has four sections: QC overview (PCA + sample correlation), a
    DEG summary table, per-comparison cards (volcano, pathway dotplot, pathway
    barplot) and a cross-condition pathway heatmap. Every plot is generated
    best-effort: if a figure cannot be built, an error/placeholder card is
    rendered in its place and the rest of the report is still written.

    Args:
        group_name: Human-readable title, used for <title> and the <h1>.
        group_id: Short identifier such as 'group1'; used to locate
            ``<group_id>_normalized_counts.csv`` under ``DIRS['deseq2']`` and,
            rewritten as 'Group 1', as the ``group_filter`` passed to the
            PCA/correlation helpers.
        comparisons: Sequence of ``(label, filename)`` pairs; ``filename`` is
            resolved against ``DIRS['deseq2']``. Missing files are skipped.
        output_path: Destination of the HTML file; parent directories are
            created if needed.
        enrichment_results: Optional mapping of comparison label to an
            enrichment DataFrame. When falsy, pathway cards show a placeholder.
        metadata_df: Optional sample metadata forwarded to the PCA and
            correlation figure helpers.

    Returns:
        ``output_path``, after the file has been written (UTF-8 encoded).
    """
    html_parts = []
    safe_group_name = _esc(group_name)
    group_filter = group_id.replace('group', 'Group ')

    # Improved HTML header with better CSS
    html_parts.append(f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>{safe_group_name} - Transcriptomics Report</title>
    <script src="https://cdn.plot.ly/plotly-2.27.0.min.js"></script>
    <style>
        * {{ box-sizing: border-box; }}
        body {{
            font-family: 'Segoe UI', Arial, sans-serif;
            margin: 0;
            padding: 30px;
            background: linear-gradient(135deg, #f5f7fa 0%, #e4e8ec 100%);
            min-height: 100vh;
        }}
        .container {{
            max-width: 1800px;
            margin: 0 auto;
        }}
        h1 {{
            color: #1a1a2e;
            font-size: 28px;
            font-weight: 600;
            margin-bottom: 5px;
            padding-bottom: 15px;
            border-bottom: 4px solid #0072B2;
        }}
        .subtitle {{
            color: #666;
            font-size: 14px;
            margin-bottom: 30px;
        }}
        h2 {{
            color: #0072B2;
            font-size: 20px;
            font-weight: 600;
            margin-top: 50px;
            margin-bottom: 20px;
            padding-bottom: 8px;
            border-bottom: 2px solid #e0e0e0;
        }}
        h3 {{
            color: #333;
            font-size: 16px;
            font-weight: 600;
            margin-top: 35px;
            margin-bottom: 15px;
            padding: 10px 15px;
            background: #f8f9fa;
            border-left: 4px solid #0072B2;
            border-radius: 0 4px 4px 0;
        }}
        
        /* Grid layouts */
        .grid-2 {{
            display: grid;
            grid-template-columns: repeat(2, 1fr);
            gap: 25px;
            margin: 25px 0;
        }}
        .grid-3 {{
            display: grid;
            grid-template-columns: repeat(3, 1fr);
            gap: 20px;
            margin: 20px 0;
        }}
        .card {{
            background: white;
            border-radius: 12px;
            padding: 20px;
            box-shadow: 0 4px 15px rgba(0,0,0,0.08);
            border: 1px solid #e8e8e8;
            transition: transform 0.2s, box-shadow 0.2s;
        }}
        .card:hover {{
            transform: translateY(-2px);
            box-shadow: 0 6px 20px rgba(0,0,0,0.12);
        }}
        .card-title {{
            font-size: 13px;
            font-weight: 600;
            color: #444;
            margin-bottom: 8px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }}
        .card-desc {{
            font-size: 11px;
            color: #888;
            margin-bottom: 15px;
        }}
        .full-width {{
            grid-column: 1 / -1;
        }}
        
        /* Stats table */
        .stats-table {{
            width: 100%;
            border-collapse: separate;
            border-spacing: 0;
            margin: 20px 0;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 2px 10px rgba(0,0,0,0.08);
        }}
        .stats-table th {{
            background: linear-gradient(135deg, #0072B2 0%, #005a8c 100%);
            color: white;
            padding: 14px 16px;
            text-align: left;
            font-weight: 600;
            font-size: 13px;
        }}
        .stats-table td {{
            padding: 12px 16px;
            border-bottom: 1px solid #eee;
            background: white;
            font-size: 13px;
        }}
        .stats-table tr:last-child td {{
            border-bottom: none;
        }}
        .stats-table tr:hover td {{
            background: #f8fbff;
        }}
        .up {{ color: #D55E00; font-weight: 700; }}
        .down {{ color: #0072B2; font-weight: 700; }}
        .badge {{
            display: inline-block;
            padding: 4px 10px;
            border-radius: 12px;
            font-size: 12px;
            font-weight: 600;
        }}
        .badge-up {{ background: #fff0e6; color: #D55E00; }}
        .badge-down {{ background: #e6f3ff; color: #0072B2; }}
        .badge-neutral {{ background: #f0f0f0; color: #666; }}
        
        /* Section dividers */
        .section-divider {{
            height: 1px;
            background: linear-gradient(to right, transparent, #ddd, transparent);
            margin: 40px 0;
        }}
        
        /* Plot container */
        .plot-container {{
            min-height: 350px;
        }}
        .no-data {{
            display: flex;
            align-items: center;
            justify-content: center;
            height: 300px;
            color: #999;
            font-style: italic;
            background: #fafafa;
            border-radius: 8px;
        }}
        
        /* Footer */
        .footer {{
            margin-top: 60px;
            padding-top: 20px;
            border-top: 1px solid #ddd;
            color: #888;
            font-size: 11px;
            text-align: center;
        }}
    </style>
</head>
<body>
<div class="container">
    <h1>{safe_group_name}</h1>
    <p class="subtitle">
        Transcriptomics Analysis Report for <em>Picosynechococcus sp.</em> PCA 11901<br>
        DESeq2 differential expression | CyanoCyc pathway enrichment
    </p>
""".replace('PCA 11901', 'PCC 11901'))

    # === SECTION 1: Overview (PCA + Correlation) ===
    html_parts.append("""
    <h2>1. Quality Control &amp; Sample Overview</h2>
    <div class="grid-2">
""")

    # PCA Plot. Broad except is deliberate: any failure becomes an error card
    # so one broken figure never aborts the whole report.
    counts_file = None
    try:
        counts_file = DIRS['deseq2'] / f'{group_id}_normalized_counts.csv'
        if not counts_file.exists():
            counts_file = DIRS['deseq2'] / 'counts_matrix.csv'

        pca_fig = create_pca_figure(counts_file, metadata_df, group_filter=group_filter)
        pca_html = _figure_div(pca_fig, config={'displayModeBar': True, 'responsive': True})
        html_parts.append(_plot_card(
            'Principal Component Analysis',
            'Sample clustering based on overall gene expression',
            pca_html,
        ))
    except Exception as e:
        html_parts.append(_message_card(
            'Principal Component Analysis',
            f'Could not generate PCA: {str(e)[:50]}',
        ))

    # Correlation Heatmap
    try:
        if counts_file is None:
            # The counts-file lookup above failed before binding a path.
            raise FileNotFoundError('counts matrix unavailable')
        corr_fig = create_correlation_figure(counts_file, metadata_df, group_filter=group_filter)
        html_parts.append(_plot_card(
            'Sample Correlation Matrix',
            'Pearson correlation of log2-transformed counts',
            _figure_div(corr_fig),
        ))
    except Exception:
        html_parts.append(_message_card(
            'Sample Correlation Matrix',
            'Could not generate correlation matrix',
        ))

    html_parts.append("    </div>")  # Close grid-2

    # === SECTION 2: DEG Summary Table ===
    html_parts.append("""
    <h2>2. Differential Expression Summary</h2>
    <table class="stats-table">
        <tr>
            <th>Condition</th>
            <th>Total DEGs</th>
            <th>Upregulated</th>
            <th>Downregulated</th>
            <th>Enriched Pathways</th>
        </tr>
""")

    # (label, filepath, degs, enrichment DataFrame or None) for every
    # comparison whose results file exists; reused by sections 3 and 4.
    comparison_data = []
    for label, filename in comparisons:
        filepath = DIRS['deseq2'] / filename
        if not filepath.exists():
            continue

        degs = get_degs(filepath, padj_cutoff=0.05, lfc_cutoff=1.0)
        enrich_df = enrichment_results.get(label) if enrichment_results else None
        n_pathways = len(enrich_df) if enrich_df is not None else 0

        comparison_data.append((label, filepath, degs, enrich_df))

        html_parts.append(f"""
        <tr>
            <td><strong>{_esc(label)}</strong></td>
            <td>{degs['n_total']}</td>
            <td><span class="badge badge-up">{degs['n_up']} ↑</span></td>
            <td><span class="badge badge-down">{degs['n_down']} ↓</span></td>
            <td><span class="badge badge-neutral">{n_pathways}</span></td>
        </tr>
""")

    html_parts.append("    </table>")

    # === SECTION 3: Individual Comparisons ===
    html_parts.append("""
    <h2>3. Individual Condition Analysis</h2>
""")

    for label, filepath, degs, enrich_df in comparison_data:
        # Labels that already name both sides (e.g. 'T2 Dark vs T1 Light')
        # must not get a second ' vs Control' appended.
        heading = label if ' vs ' in label else f'{label} vs Control'
        has_pathways = enrich_df is not None and not enrich_df.empty

        html_parts.append(f"""
    <h3>{_esc(heading)}</h3>
    <div class="grid-3">
""")

        # Volcano plot
        try:
            volcano_fig = create_volcano_figure(filepath, label)
            html_parts.append(_plot_card(
                'Volcano Plot',
                f"Log₂FC vs significance | {degs['n_total']} DEGs",
                _figure_div(volcano_fig),
            ))
        except Exception as e:
            html_parts.append(_message_card('Volcano Plot', f'Error: {str(e)[:40]}'))

        # Pathway dotplot
        if has_pathways:
            try:
                dot_fig = plot_enrichment_dotplot(enrich_df, title="Enriched Pathways", top_n=12)
                dot_fig.update_layout(height=380, margin=dict(l=150, r=20, t=40, b=50))
                html_parts.append(_plot_card(
                    'Pathway Enrichment (Dotplot)',
                    'Gene ratio & significance',
                    _figure_div(dot_fig),
                ))
            except Exception:
                html_parts.append(_message_card('Pathway Enrichment', 'Error generating plot'))
        else:
            html_parts.append(_message_card(
                'Pathway Enrichment (Dotplot)',
                'No significant pathways (padj < 0.05)',
            ))

        # Pathway barplot
        if has_pathways:
            try:
                bar_fig = plot_enrichment_barplot(enrich_df, title="Top Pathways", top_n=12)
                bar_fig.update_layout(height=380, margin=dict(l=180, r=20, t=40, b=50))
                html_parts.append(_plot_card(
                    'Pathway Enrichment (Barplot)',
                    '-log₁₀(adjusted p-value)',
                    _figure_div(bar_fig),
                ))
            except Exception:
                # Was a bare `except:`, which also swallowed KeyboardInterrupt.
                html_parts.append(_message_card(
                    'Pathway Enrichment (Barplot)',
                    'Error generating plot',
                ))
        else:
            html_parts.append(_message_card(
                'Pathway Enrichment (Barplot)',
                'No significant pathways (padj < 0.05)',
            ))

        html_parts.append("    </div>")  # Close grid-3

    # === SECTION 4: Cross-condition heatmap ===
    html_parts.append("""
    <div class="section-divider"></div>
    <h2>4. Cross-Condition Pathway Comparison</h2>
    <div class="grid-2">
""")

    if enrichment_results:
        group_results = {
            label: enrichment_results.get(label, pd.DataFrame())
            for label, _, _, _ in comparison_data
        }
        try:
            heatmap_fig = plot_enrichment_heatmap(group_results, top_n_per_comparison=5, title="Pathway Enrichment")
            heatmap_fig.update_layout(height=500, margin=dict(l=200, r=20, t=50, b=100))
            html_parts.append(_plot_card(
                'Pathway Enrichment Heatmap',
                'Comparison of enriched pathways across all conditions (top 5 per condition)',
                _figure_div(heatmap_fig),
                extra_class='full-width',
            ))
        except Exception as e:
            html_parts.append(_message_card(
                'Pathway Enrichment Heatmap',
                f'Could not generate heatmap: {str(e)[:50]}',
                extra_class='full-width',
            ))
    else:
        # Without this the section would be a heading over an empty grid.
        html_parts.append(_message_card(
            'Pathway Enrichment Heatmap',
            'No enrichment results available',
            extra_class='full-width',
        ))

    html_parts.append("    </div>")  # Close grid-2

    # Footer
    html_parts.append("""
    <div class="footer">
        <p>
            Generated by RNAseq Analysis Pipeline | 
            <em>Picosynechococcus sp.</em> PCC 11901 | 
            CyanoCyc Database | DESeq2
        </p>
    </div>
</div>
</body>
</html>
""")

    # Write HTML file. The encoding must be explicit: the document declares
    # charset=UTF-8 and contains non-ASCII characters (arrows, subscripts), so
    # relying on the platform default would corrupt or fail on some systems.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(''.join(html_parts), encoding='utf-8')

    file_size = output_path.stat().st_size / 1024
    print(f"  ✓ {output_path.name} ({file_size:.0f} KB)")
    return output_path


# === Generate All Group Reports ===

# Report registry. Each entry maps the report title to:
#   id          - short group identifier handed to generate_group_report
#   output      - HTML destination, relative to BASE_DIR
#   comparisons - (display label, DESeq2 results filename) pairs
GROUPS = {
    'Group 1 - Nutrient Conditions': {
        'id': 'group1',
        'output': '09_figures/reports/group1_nutrients_report.html',
        'comparisons': [
            ('Ammonia', 'group1_Ammonia_vs_Control.csv'),
            ('Glycerol', 'group1_Glycerol_0.75pct_vs_Control.csv'),
            ('High Nitrogen', 'group1_High_Nitrogen_vs_Control.csv'),
            ('Low Nitrogen', 'group1_Low_Nitrogen_vs_Control.csv'),
            ('High Phosphate', 'group1_High_Phosphate_vs_Control.csv'),
            ('Low Phosphate', 'group1_Low_Phosphate_vs_Control.csv'),
            ('Urea', 'group1_Urea_vs_Control.csv'),
        ],
    },
    'Group 2 - Environmental Conditions': {
        'id': 'group2',
        'output': '09_figures/reports/group2_environmental_report.html',
        'comparisons': [
            ('Atmospheric CO2', 'group2_Atmospheric_CO2_vs_Control.csv'),
            ('High CO2', 'group2_High_CO2_8pct_vs_Control.csv'),
            ('H2O2', 'group2_H2O2_0.005pct_vs_Control.csv'),
            ('High NaCl', 'group2_High_NaCl_9pct_vs_Control.csv'),
            ('High Temp', 'group2_High_Temp_38C_vs_Control.csv'),
            ('High Light', 'group2_High_Light_vs_Control.csv'),
            ('Low Light', 'group2_Low_Light_15uE_vs_Control.csv'),
        ],
    },
    'Group 3 - Circadian Rhythm': {
        'id': 'group3',
        'output': '09_figures/reports/group3_circadian_report.html',
        'comparisons': [
            ('T2 Dark vs T1 Light', 'group3_T2_Dark_vs_T1_Light.csv'),
            ('T3 Light vs T2 Dark', 'group3_T3_Light_vs_T2_Dark.csv'),
            ('T4 Dark vs T3 Light', 'group3_T4_Dark_vs_T3_Light.csv'),
        ],
    },
}

print("=" * 60, "Generating Comprehensive HTML Reports", "=" * 60, sep="\n")

for group_name, config in GROUPS.items():
    print(f"\n{group_name}")

    output_path = BASE_DIR / config['output']

    # The enrichment results and sample metadata come from earlier cells that
    # may not have been run; look them up by name and pass None when absent.
    generate_group_report(
        group_name=group_name,
        group_id=config['id'],
        comparisons=config['comparisons'],
        output_path=output_path,
        enrichment_results=globals().get('ALL_ENRICHMENT_RESULTS'),
        metadata_df=globals().get('metadata'),
    )

print(f"\n{'=' * 60}")
print("Reports saved to: 09_figures/reports/")
print("=" * 60)

Generating comprehensive HTML reports...

Group 1 - Nutrient Conditions
----------------------------------------
Generated report: /Users/felix/Library/CloudStorage/OneDrive-SharedLibraries-MacquarieUniversity/Australian Genome Foundry - AWS cloud infrastructure/11_Esther_Cyano_transcriptomics/09_figures/reports/group1_nutrients_report.html

Group 2 - Environmental Conditions
----------------------------------------
Generated report: /Users/felix/Library/CloudStorage/OneDrive-SharedLibraries-MacquarieUniversity/Australian Genome Foundry - AWS cloud infrastructure/11_Esther_Cyano_transcriptomics/09_figures/reports/group2_environmental_report.html

Group 3 - Circadian Rhythm
----------------------------------------
Generated report: /Users/felix/Library/CloudStorage/OneDrive-SharedLibraries-MacquarieUniversity/Australian Genome Foundry - AWS cloud infrastructure/11_Esther_Cyano_transcriptomics/09_figures/reports/group3_circadian_report.html

Reports generated in 09_figures/reports/
  - g