In [None]:
# Ch15-3-snakemake

In [1]:
# Bioinformatics Pipeline with Snakemake and Interactive Dashboard
# This notebook demonstrates a simple bioinformatics pipeline with simulated data

In [2]:
# 1. Import Libraries
import subprocess
import sys
import os
from pathlib import Path
import random
import gzip
import json
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import threading
import time

In [3]:
# 2. Install required packages 
def install_packages():
    """Install every required package that cannot currently be imported.

    Each name is imported first; only when that raises ImportError is
    ``pip install <name>`` run through the interpreter executing this
    notebook, so the package lands in the kernel's own environment.
    """
    required = ('snakemake', 'pandas', 'matplotlib', 'seaborn', 'ipywidgets')
    pip_install = [sys.executable, "-m", "pip", "install"]

    for name in required:
        try:
            __import__(name)
            continue  # already importable, nothing to do
        except ImportError:
            pass

        print(f"Installing {name}...")
        subprocess.check_call(pip_install + [name])

# Uncomment the line below if you need to install packages
# install_packages()

In [4]:
# 3. Enable widgets in Jupyter
# Switch matplotlib to the inline backend when running under IPython/Jupyter.
try:
    # IPython is imported here so a plain-Python run falls into the except branch
    from IPython import get_ipython
    _ipython_shell = get_ipython()
    if _ipython_shell is not None:
        _ipython_shell.run_line_magic('matplotlib', 'inline')
        print("✅ Jupyter widgets enabled")
        print("📝 Note: If widgets don't display properly, run: jupyter nbextension enable --py widgetsnbextension")
except Exception as exc:
    # Catch Exception (not a bare except) so KeyboardInterrupt / SystemExit
    # still propagate, and surface the reason instead of hiding it.
    print("⚠️  Widget extensions may need to be enabled manually")
    print(f"   Reason: {type(exc).__name__}: {exc}")

print("🧬 Bioinformatics Pipeline Setup Complete!")
print("📊 Ready to create interactive dashboard...")

✅ Jupyter widgets enabled
📝 Note: If widgets don't display properly, run: jupyter nbextension enable --py widgetsnbextension
🧬 Bioinformatics Pipeline Setup Complete!
📊 Ready to create interactive dashboard...


In [5]:
# 4. Create directory structure and simulated FASTQ data
class FastqSimulator:
    """Generate simulated FASTQ data for testing.

    Sequences are uniformly random over A/T/G/C and quality strings are
    Phred+33 encoded with a mean that decays towards the end of the read.
    All randomness comes from the global ``random`` module, so seeding it
    makes the generated files reproducible.
    """

    def __init__(self):
        self.nucleotides = ['A', 'T', 'G', 'C']
        # Printable ASCII '!'..'~' (the Phred+33 alphabet); kept as a public attribute.
        self.quality_chars = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'

    def generate_sequence(self, length=100):
        """Return a random DNA sequence of ``length`` bases."""
        return ''.join(random.choices(self.nucleotides, k=length))

    def generate_quality(self, length=100, avg_quality=30):
        """Return a Phred+33 quality string of ``length`` characters.

        Args:
            length: number of quality characters to produce.
            avg_quality: mean Phred score at the first base; the mean is
                scaled down linearly by up to 30% along the read.

        Each score is drawn from a normal distribution (sigma 5), truncated
        to an int and clamped to the range 2..40 before encoding.
        """
        qualities = []
        for i in range(length):
            # Simulate decreasing quality towards 3' end
            pos_factor = 1 - (i / length) * 0.3
            quality = int(random.normalvariate(avg_quality * pos_factor, 5))
            quality = max(2, min(40, quality))  # Clamp between 2 and 40
            qualities.append(chr(quality + 33))
        return ''.join(qualities)

    def generate_fastq_file(self, filename, num_reads=10000, read_length=100, avg_quality=30):
        """Write ``num_reads`` simulated reads to ``filename`` in FASTQ format.

        Args:
            filename: output path (str or path-like). A name ending in
                ``.gz`` is written gzip-compressed, anything else as text.
            num_reads: number of four-line FASTQ records to write.
            read_length: length of every sequence/quality string.
            avg_quality: mean Phred score forwarded to ``generate_quality``.

        Missing parent directories are created. A bare file name (no
        directory component) is written to the current directory.
        """
        filename = os.fspath(filename)

        # os.path.dirname() is '' for a bare file name and os.makedirs('')
        # raises FileNotFoundError, so only create a parent when there is one.
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

        opener = gzip.open(filename, 'wt') if filename.endswith('.gz') else open(filename, 'w')
        with opener as f:
            for i in range(num_reads):
                read_id = f"@SIM_{i+1}_sample_read"
                sequence = self.generate_sequence(read_length)
                plus_line = "+"
                quality = self.generate_quality(read_length, avg_quality)

                f.write(f"{read_id}\n{sequence}\n{plus_line}\n{quality}\n")

        print(f"Generated {filename} with {num_reads} reads")

In [6]:
# 5. Setup project structure and generate data
def setup_project():
    """Build the project folder layout and write simulated paired-end FASTQ data.

    Returns:
        list[str]: the names of the generated samples, in definition order.
    """
    # Folder layout used by the notebook
    for folder in (
        'data/raw',
        'data/processed',
        'results/fastqc',
        'results/alignment',
        'results/variants',
        'logs',
        'config',
    ):
        Path(folder).mkdir(parents=True, exist_ok=True)

    # Per-sample settings. Only 'reads' is consumed below; 'quality' is
    # carried in the table but is not passed on to the generator.
    samples = {
        'sample1': {'reads': 15000, 'quality': 35},
        'sample2': {'reads': 12000, 'quality': 30},
        'sample3': {'reads': 18000, 'quality': 32}
    }

    simulator = FastqSimulator()
    for sample_name, params in samples.items():
        # One gzip-compressed file per mate of the pair, 100 bp reads
        for mate in ('R1', 'R2'):
            fastq_path = f"data/raw/{sample_name}_{mate}.fastq.gz"
            simulator.generate_fastq_file(fastq_path, params['reads'], 100)

    print("Project structure created and sample data generated!")
    return list(samples)

# Set up the project: create the directories, write the simulated FASTQ files,
# and keep the list of sample names that setup_project() returns.
sample_names = setup_project()


Generated data/raw/sample1_R1.fastq.gz with 15000 reads
Generated data/raw/sample1_R2.fastq.gz with 15000 reads
Generated data/raw/sample2_R1.fastq.gz with 12000 reads
Generated data/raw/sample2_R2.fastq.gz with 12000 reads
Generated data/raw/sample3_R1.fastq.gz with 18000 reads
Generated data/raw/sample3_R2.fastq.gz with 18000 reads
Project structure created and sample data generated!


In [7]:
# 6. Create Snakemake workflow
# Text of the Snakefile, written verbatim to ./Snakefile below.
#
# Inside this text the rules use Python f-strings: single braces such as
# {RESULTS_DIR} are filled in when Snakemake parses the file, while doubled
# braces such as {{sample}} survive as literal {sample} wildcards.
#
# Rules defined:
#   all              - requests FastQC HTML (R1/R2), BAM and VCF for every
#                      sample plus the MultiQC report and the summary JSON
#   fastqc           - runs `fastqc` when it is on PATH, otherwise falls back
#                      to scripts/mock_fastqc.py
#   align_reads      - scripts/mock_alignment.py (BAM + BAI)
#   call_variants    - scripts/mock_variants.py (VCF)
#   multiqc          - scripts/mock_multiqc.py (HTML report)
#   pipeline_summary - scripts/generate_summary.py (JSON)
snakefile_content = '''
import os
from pathlib import Path

# Configuration
SAMPLES = ["sample1", "sample2", "sample3"]
DATA_DIR = "data"
RESULTS_DIR = "results"

# Target rule - what we want to produce
rule all:
    input:
        # FastQC reports (real)
        expand(f"{RESULTS_DIR}/fastqc/{{sample}}_R1_fastqc.html", sample=SAMPLES),
        expand(f"{RESULTS_DIR}/fastqc/{{sample}}_R2_fastqc.html", sample=SAMPLES),
        # Mock outputs
        expand(f"{RESULTS_DIR}/alignment/{{sample}}.bam", sample=SAMPLES),
        expand(f"{RESULTS_DIR}/variants/{{sample}}.vcf", sample=SAMPLES),
        f"{RESULTS_DIR}/multiqc_report.html",
        f"{RESULTS_DIR}/pipeline_summary.json"

# Real FastQC rule
rule fastqc:
    input:
        fastq=f"{DATA_DIR}/raw/{{sample}}_{{read}}.fastq.gz"
    output:
        html=f"{RESULTS_DIR}/fastqc/{{sample}}_{{read}}_fastqc.html",
        zip=f"{RESULTS_DIR}/fastqc/{{sample}}_{{read}}_fastqc.zip"
    params:
        outdir=f"{RESULTS_DIR}/fastqc"
    log:
        "logs/fastqc_{sample}_{read}.log"
    shell:
        """
        # Check if fastqc is available, if not use mock
        if command -v fastqc >/dev/null 2>&1; then
            fastqc {input.fastq} -o {params.outdir} --extract 2> {log}
        else
            echo "FastQC not found, creating mock output..." > {log}
            python scripts/mock_fastqc.py {input.fastq} {params.outdir} {wildcards.sample} {wildcards.read}
        fi
        """

# Mock alignment rule
rule align_reads:
    input:
        r1=f"{DATA_DIR}/raw/{{sample}}_R1.fastq.gz",
        r2=f"{DATA_DIR}/raw/{{sample}}_R2.fastq.gz"
    output:
        bam=f"{RESULTS_DIR}/alignment/{{sample}}.bam",
        bai=f"{RESULTS_DIR}/alignment/{{sample}}.bam.bai"
    log:
        "logs/align_{sample}.log"
    shell:
        """
        echo "Mock alignment for {wildcards.sample}" > {log}
        python scripts/mock_alignment.py {input.r1} {input.r2} {output.bam} {output.bai}
        """

# Mock variant calling rule
rule call_variants:
    input:
        bam=f"{RESULTS_DIR}/alignment/{{sample}}.bam",
        bai=f"{RESULTS_DIR}/alignment/{{sample}}.bam.bai"
    output:
        vcf=f"{RESULTS_DIR}/variants/{{sample}}.vcf"
    log:
        "logs/variants_{sample}.log"
    shell:
        """
        echo "Mock variant calling for {wildcards.sample}" > {log}
        python scripts/mock_variants.py {input.bam} {output.vcf}
        """

# Mock MultiQC rule
rule multiqc:
    input:
        fastqc_reports=expand(f"{RESULTS_DIR}/fastqc/{{sample}}_{{read}}_fastqc.zip", 
                             sample=SAMPLES, read=["R1", "R2"]),
        bams=expand(f"{RESULTS_DIR}/alignment/{{sample}}.bam", sample=SAMPLES)
    output:
        report=f"{RESULTS_DIR}/multiqc_report.html"
    log:
        "logs/multiqc.log"
    shell:
        """
        echo "Mock MultiQC report generation" > {log}
        python scripts/mock_multiqc.py {output.report}
        """

# Pipeline summary rule
rule pipeline_summary:
    input:
        vcfs=expand(f"{RESULTS_DIR}/variants/{{sample}}.vcf", sample=SAMPLES),
        multiqc=f"{RESULTS_DIR}/multiqc_report.html"
    output:
        summary=f"{RESULTS_DIR}/pipeline_summary.json"
    log:
        "logs/summary.log"
    shell:
        """
        echo "Generating pipeline summary" > {log}
        python scripts/generate_summary.py {output.summary}
        """
'''

# Write Snakefile into the current working directory (overwrites any existing one)
with open('Snakefile', 'w') as f:
    f.write(snakefile_content)

print("Snakefile created!")

Snakefile created!


In [8]:
# 7. Create mock scripts
def create_mock_scripts():
    """Write the stand-in tool scripts used by the Snakefile into ./scripts.

    Five files are produced: mock_fastqc.py, mock_alignment.py,
    mock_variants.py, mock_multiqc.py and generate_summary.py. Each one is a
    small command-line program that takes its paths from sys.argv.
    """
    scripts_dir = Path('scripts')
    scripts_dir.mkdir(exist_ok=True)

    # --- FastQC stand-in: counts reads and emits an HTML report + placeholder zip
    fastqc_source = '''
import sys
import os
import json
import gzip
from pathlib import Path

def count_reads(fastq_file):
    """Count reads in FASTQ file"""
    count = 0
    open_func = gzip.open if fastq_file.endswith('.gz') else open
    mode = 'rt' if fastq_file.endswith('.gz') else 'r'
    
    with open_func(fastq_file, mode) as f:
        for line in f:
            count += 1
    return count // 4  # 4 lines per read

def mock_fastqc_report(fastq_file, output_dir, sample, read):
    """Generate mock FastQC HTML report"""
    
    read_count = count_reads(fastq_file)
    
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>FastQC Report - {sample}_{read}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .header {{ background-color: #f0f0f0; padding: 10px; }}
            .pass {{ color: green; }}
            .warn {{ color: orange; }}
            .fail {{ color: red; }}
            table {{ border-collapse: collapse; width: 100%; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; }}
        </style>
    </head>
    <body>
        <div class="header">
            <h1>FastQC Report</h1>
            <h2>File: {os.path.basename(fastq_file)}</h2>
        </div>
        
        <h3>Basic Statistics</h3>
        <table>
            <tr><th>Measure</th><th>Value</th></tr>
            <tr><td>Filename</td><td>{os.path.basename(fastq_file)}</td></tr>
            <tr><td>File type</td><td>Conventional base calls</td></tr>
            <tr><td>Encoding</td><td>Sanger / Illumina 1.9</td></tr>
            <tr><td>Total Sequences</td><td>{read_count:,}</td></tr>
            <tr><td>Sequence length</td><td>100</td></tr>
            <tr><td>%GC</td><td>42</td></tr>
        </table>
        
        <h3>Quality Metrics</h3>
        <table>
            <tr><th>Module</th><th>Status</th></tr>
            <tr><td>Basic Statistics</td><td class="pass">PASS</td></tr>
            <tr><td>Per base sequence quality</td><td class="pass">PASS</td></tr>
            <tr><td>Per sequence quality scores</td><td class="pass">PASS</td></tr>
            <tr><td>Per base sequence content</td><td class="warn">WARN</td></tr>
            <tr><td>Per sequence GC content</td><td class="pass">PASS</td></tr>
            <tr><td>Per base N content</td><td class="pass">PASS</td></tr>
            <tr><td>Sequence Length Distribution</td><td class="pass">PASS</td></tr>
            <tr><td>Sequence Duplication Levels</td><td class="pass">PASS</td></tr>
            <tr><td>Overrepresented sequences</td><td class="pass">PASS</td></tr>
            <tr><td>Adapter Content</td><td class="pass">PASS</td></tr>
        </table>
        
        <p><i>This is a mock FastQC report generated for demonstration purposes.</i></p>
    </body>
    </html>
    """
    
    # Write HTML report
    html_file = os.path.join(output_dir, f"{sample}_{read}_fastqc.html")
    with open(html_file, 'w') as f:
        f.write(html_content)
    
    # Create mock ZIP file (just touch it)
    zip_file = os.path.join(output_dir, f"{sample}_{read}_fastqc.zip")
    with open(zip_file, 'w') as f:
        f.write("Mock FastQC zip file")

if __name__ == "__main__":
    fastq_file = sys.argv[1]
    output_dir = sys.argv[2]
    sample = sys.argv[3]
    read = sys.argv[4]
    
    mock_fastqc_report(fastq_file, output_dir, sample, read)
'''

    # --- Aligner stand-in: writes placeholder BAM and BAI text files
    alignment_source = '''
import sys
import time

def mock_alignment(r1_file, r2_file, bam_file, bai_file):
    """Generate mock BAM and BAI files"""
    
    # Simulate processing time
    time.sleep(1)
    
    # Create mock BAM file
    with open(bam_file, 'w') as f:
        f.write(f"Mock BAM file for {r1_file} and {r2_file}\\n")
        f.write("This would contain aligned reads in binary format\\n")
    
    # Create mock BAI file
    with open(bai_file, 'w') as f:
        f.write("Mock BAM index file\\n")

if __name__ == "__main__":
    mock_alignment(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
'''

    # --- Variant caller stand-in: writes a fixed four-record VCF
    variants_source = '''
import sys
import random

def mock_variant_calling(bam_file, vcf_file):
    """Generate mock VCF file"""
    
    vcf_content = """##fileformat=VCFv4.2
##source=MockVariantCaller
##reference=mock_reference.fa
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE
chr1	100	.	A	T	30	PASS	DP=20	GT:DP	0/1:20
chr1	200	.	G	C	35	PASS	DP=25	GT:DP	1/1:25
chr2	150	.	C	A	28	PASS	DP=18	GT:DP	0/1:18
chr3	300	.	T	G	40	PASS	DP=30	GT:DP	0/1:30
"""
    
    with open(vcf_file, 'w') as f:
        f.write(vcf_content)

if __name__ == "__main__":
    mock_variant_calling(sys.argv[1], sys.argv[2])
'''

    # --- MultiQC stand-in: writes a static HTML report
    multiqc_source = '''
import sys
import json

def mock_multiqc(output_file):
    """Generate mock MultiQC HTML report"""
    
    html_content = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>MultiQC Report - Mock Analysis</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            .header { background-color: #4CAF50; color: white; padding: 10px; }
            .summary { background-color: #f9f9f9; padding: 15px; margin: 10px 0; }
            table { border-collapse: collapse; width: 100%; }
            th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
            th { background-color: #f2f2f2; }
        </style>
    </head>
    <body>
        <div class="header">
            <h1>MultiQC Report</h1>
            <p>Aggregate results from bioinformatics analyses</p>
        </div>
        
        <div class="summary">
            <h2>General Statistics</h2>
            <table>
                <tr><th>Sample</th><th>Total Reads</th><th>%GC</th><th>Quality Score</th></tr>
                <tr><td>sample1</td><td>15,000</td><td>42%</td><td>35</td></tr>
                <tr><td>sample2</td><td>12,000</td><td>41%</td><td>30</td></tr>
                <tr><td>sample3</td><td>18,000</td><td>43%</td><td>32</td></tr>
            </table>
        </div>
        
        <div class="summary">
            <h2>FastQC Results</h2>
            <p>All samples passed basic quality checks. Mock analysis shows good quality scores across all samples.</p>
        </div>
        
        <div class="summary">
            <h2>Alignment Summary</h2>
            <p>Mock alignment statistics show >95% alignment rate for all samples.</p>
        </div>
        
        <p><i>This is a mock MultiQC report for demonstration purposes.</i></p>
    </body>
    </html>
    """
    
    with open(output_file, 'w') as f:
        f.write(html_content)

if __name__ == "__main__":
    mock_multiqc(sys.argv[1])
'''

    # --- Summary generator: writes a fixed JSON summary stamped with the run time
    summary_source = '''
import sys
import json
from datetime import datetime

def generate_summary(output_file):
    """Generate pipeline summary JSON"""
    
    summary = {
        "pipeline_name": "Mock Bioinformatics Pipeline",
        "run_date": datetime.now().isoformat(),
        "samples_processed": 3,
        "total_reads": 45000,
        "variants_called": 12,
        "success_rate": "100%",
        "processing_time": "5 minutes (mock)",
        "modules": {
            "fastqc": {"status": "completed", "samples": 3},
            "alignment": {"status": "completed", "samples": 3},
            "variant_calling": {"status": "completed", "samples": 3},
            "multiqc": {"status": "completed", "reports": 1}
        }
    }
    
    with open(output_file, 'w') as f:
        json.dump(summary, f, indent=2)

if __name__ == "__main__":
    generate_summary(sys.argv[1])
'''

    # Write every script, in the same order as they are defined above
    script_sources = (
        ('scripts/mock_fastqc.py', fastqc_source),
        ('scripts/mock_alignment.py', alignment_source),
        ('scripts/mock_variants.py', variants_source),
        ('scripts/mock_multiqc.py', multiqc_source),
        ('scripts/generate_summary.py', summary_source),
    )
    for script_path, source_text in script_sources:
        with open(script_path, 'w') as handle:
            handle.write(source_text)

    print("Mock scripts created!")

# Write the mock tool scripts into ./scripts so the Snakefile rules can call them
create_mock_scripts()


Mock scripts created!


In [9]:
# 8. Pipeline Controller Class
class BioinformaticsPipelineController:
    """Interactive ipywidgets controller for the Snakemake pipeline.

    The controller builds a dashboard (buttons, step selector, progress bar,
    log tab and results tab) and drives Snakemake through ``subprocess``.
    The full pipeline runs on a background thread; single steps run
    synchronously in the button callback.

    Output produced on the background thread is written with
    ``Output.append_stdout`` / ``append_display_data`` instead of the
    ``with output:`` context manager, because the context manager does not
    capture output from background threads.
    """

    def __init__(self):
        self.pipeline_running = False
        self.current_step = 0
        self.total_steps = 0
        self.progress_bar = None
        self.output_widget = None
        self.log_widget = None
        self.status_widget = None
        self.results_widget = None

        # Create the dashboard
        self.create_dashboard()

    def create_dashboard(self):
        """Create all widgets, wire the button callbacks and assemble the layout."""

        # Header
        header = widgets.HTML(
            value="<h2>🧬 Bioinformatics Pipeline Controller</h2>",
            layout=widgets.Layout(margin='0 0 20px 0')
        )

        # Control buttons
        self.run_button = widgets.Button(
            description='Run Full Pipeline',
            button_style='success',
            icon='play',
            layout=widgets.Layout(width='200px', margin='5px')
        )

        self.step_button = widgets.Button(
            description='Run Single Step',
            button_style='info',
            icon='step-forward',
            layout=widgets.Layout(width='200px', margin='5px')
        )

        self.stop_button = widgets.Button(
            description='Stop Pipeline',
            button_style='danger',
            icon='stop',
            layout=widgets.Layout(width='200px', margin='5px'),
            disabled=True
        )

        self.clear_button = widgets.Button(
            description='Clear Results',
            button_style='warning',
            icon='trash',
            layout=widgets.Layout(width='200px', margin='5px')
        )

        # Step selector
        self.step_selector = widgets.Dropdown(
            options=[
                ('FastQC Quality Control', 'fastqc'),
                ('Read Alignment', 'align_reads'),
                ('Variant Calling', 'call_variants'),
                ('MultiQC Report', 'multiqc'),
                ('Pipeline Summary', 'pipeline_summary')
            ],
            value='fastqc',
            description='Select Step:',
            layout=widgets.Layout(width='300px', margin='5px')
        )

        # Progress and status
        self.progress_bar = widgets.IntProgress(
            value=0,
            min=0,
            max=100,
            description='Progress:',
            bar_style='',
            layout=widgets.Layout(width='400px', margin='10px 0')
        )

        self.status_widget = widgets.HTML(
            value="<b>Status:</b> Ready to run pipeline",
            layout=widgets.Layout(margin='10px 0')
        )

        # Output area
        self.output_widget = widgets.Output(
            layout=widgets.Layout(
                border='1px solid gray',
                height='300px',
                overflow_y='scroll',
                padding='10px',
                margin='10px 0'
            )
        )

        # Results area
        self.results_widget = widgets.Output(
            layout=widgets.Layout(
                border='1px solid blue',
                height='400px',
                overflow_y='scroll',
                padding='10px',
                margin='10px 0'
            )
        )

        # Bind button events
        self.run_button.on_click(self.run_full_pipeline)
        self.step_button.on_click(self.run_single_step)
        self.stop_button.on_click(self.stop_pipeline)
        self.clear_button.on_click(self.clear_results)

        # Layout
        controls = widgets.HBox([
            self.run_button,
            self.step_button,
            self.stop_button,
            self.clear_button
        ])

        step_control = widgets.HBox([self.step_selector])

        progress_area = widgets.VBox([
            self.progress_bar,
            self.status_widget
        ])

        tabs = widgets.Tab(children=[self.output_widget, self.results_widget])
        tabs.set_title(0, 'Pipeline Output')
        tabs.set_title(1, 'Results & Analysis')

        self.dashboard = widgets.VBox([
            header,
            controls,
            step_control,
            progress_area,
            tabs
        ])

    def update_status(self, message, progress=None):
        """Set the status line and, when ``progress`` is given, the progress bar value."""
        self.status_widget.value = f"<b>Status:</b> {message}"
        if progress is not None:
            self.progress_bar.value = progress

    def log_output(self, message):
        """Append a timestamped line to the 'Pipeline Output' tab.

        Uses ``append_stdout`` so the line reaches the widget even when this
        is called from the pipeline's background thread.
        """
        stamp = datetime.now().strftime('%H:%M:%S')
        self.output_widget.append_stdout(f"[{stamp}] {message}\n")

    def run_snakemake_command(self, targets=None, dry_run=False, timeout=120):
        """Run ``snakemake -j 1`` for the given targets in the current directory.

        Args:
            targets: list of Snakemake targets; the ``all`` rule when empty/None.
            dry_run: add ``--dry-run`` when True.
            timeout: seconds to wait for Snakemake before giving up.

        Returns:
            tuple ``(success, stdout, stderr)``. ``success`` is True only for
            exit code 0; failures to launch or a timeout are reported through
            ``stderr`` rather than raised.
        """
        # Check if snakemake is available
        try:
            result = subprocess.run(['snakemake', '--version'], capture_output=True, check=True)
            version_output = result.stdout.decode().strip()
            self.log_output(f"Using Snakemake version: {version_output}")
        except (subprocess.CalledProcessError, FileNotFoundError):
            return False, "", "Snakemake not found. Please install with: pip install snakemake"

        # Build command - use simpler, more compatible syntax
        # (-j instead of --cores for compatibility)
        cmd = ['snakemake', '-j', '1']

        if dry_run:
            cmd.append('--dry-run')

        # Fall back to the default target when none are given
        cmd.extend(targets if targets else ['all'])

        try:
            self.log_output(f"Executing: {' '.join(cmd)}")

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
                cwd='.'
            )

            # Log the result details
            if result.returncode == 0:
                self.log_output("✅ Command completed successfully")
            else:
                self.log_output(f"❌ Command failed with return code {result.returncode}")

            return result.returncode == 0, result.stdout, result.stderr

        except subprocess.TimeoutExpired:
            return False, "", f"Pipeline execution timed out ({timeout}s limit)"
        except FileNotFoundError:
            return False, "", "Snakemake command not found in PATH"
        except Exception as e:
            return False, "", f"Unexpected error: {str(e)}"

    def run_full_pipeline(self, button):
        """Button callback: start the full pipeline on a background thread."""
        if self.pipeline_running:
            return

        # Check if setup is complete
        if not self._check_setup():
            self.log_output("❌ Setup incomplete. Please run all setup cells first.")
            return

        self.pipeline_running = True
        self.run_button.disabled = True
        self.stop_button.disabled = False

        self.update_status("Starting full pipeline...", 0)
        self.log_output("🚀 Starting full bioinformatics pipeline")

        # Run in a separate thread to avoid blocking
        thread = threading.Thread(target=self._run_full_pipeline_thread)
        thread.start()

    def _check_setup(self):
        """Return True when the Snakefile, sample1 FASTQ pair and mock FastQC script exist."""
        required_files = [
            'Snakefile',
            'data/raw/sample1_R1.fastq.gz',
            'data/raw/sample1_R2.fastq.gz',
            'scripts/mock_fastqc.py'
        ]

        missing_files = [path for path in required_files if not os.path.exists(path)]

        if missing_files:
            self.log_output(f"❌ Missing required files: {', '.join(missing_files)}")
            self.log_output("📝 Please run the setup cells first (cells 2-5)")
            return False

        return True

    def _run_full_pipeline_thread(self):
        """Thread body: run each pipeline stage in order and report the outcome.

        Every stage is attempted even if an earlier one failed, but failures
        are recorded so the final status is reported accurately. The stop flag
        (``pipeline_running``) is only checked between stages; a Snakemake
        process that is already running is not interrupted. Button state is
        always restored, even when an unexpected exception occurs.
        """
        steps = [
            ("Quality Control (FastQC)", ["results/fastqc/sample1_R1_fastqc.html", "results/fastqc/sample1_R2_fastqc.html"], 20),
            ("Read Alignment", ["results/alignment/sample1.bam"], 40),
            ("Variant Calling", ["results/variants/sample1.vcf"], 60),
            ("MultiQC Report", ["results/multiqc_report.html"], 80),
            ("Pipeline Summary", ["results/pipeline_summary.json"], 100)
        ]
        failed_steps = []

        try:
            for step_name, targets, progress in steps:
                if not self.pipeline_running:
                    break

                self.update_status(f"Running: {step_name}", progress)
                self.log_output(f"📊 {step_name}")

                # Run actual snakemake command for specific targets
                success, stdout, stderr = self.run_snakemake_command(targets)

                if success:
                    self.log_output(f"✅ {step_name} completed successfully")
                    if stdout and len(stdout.strip()) > 0:
                        # Only show first few lines of output
                        for line in stdout.strip().split('\n')[:3]:
                            if line.strip():
                                self.log_output(f"   {line.strip()}")
                else:
                    failed_steps.append(step_name)
                    self.log_output(f"❌ {step_name} failed")
                    if stderr:
                        self.log_output(f"   Error: {stderr[:100]}...")
                    if stdout:
                        self.log_output(f"   Output: {stdout[:100]}...")

                # Small delay between steps
                time.sleep(1)

            if self.pipeline_running:
                if failed_steps:
                    # Do not claim success when one or more stages failed
                    summary = ', '.join(failed_steps)
                    self.update_status(
                        f"Pipeline finished with {len(failed_steps)} failed step(s): {summary}", 100
                    )
                    self.log_output(f"❌ Pipeline finished with failed steps: {summary}")
                else:
                    self.update_status("Pipeline completed successfully!", 100)
                    self.log_output("🎉 Full pipeline completed!")
                self._show_results()
        except Exception as exc:
            # An uncaught exception in a thread would otherwise only appear as a
            # traceback on stderr and leave the dashboard stuck in "running".
            self.update_status(f"Pipeline failed: {exc}")
            self.log_output(f"❌ Pipeline aborted by unexpected error: {exc}")
        finally:
            self.pipeline_running = False
            self.run_button.disabled = False
            self.stop_button.disabled = True

    def run_single_step(self, button):
        """Button callback: run the step chosen in the dropdown (blocks until done).

        Refuses to start while the full pipeline is running, so that two
        Snakemake processes are not launched in the same working directory.
        """
        if self.pipeline_running:
            self.log_output("⚠️ Full pipeline is running; stop it or wait before running a single step")
            return

        step = self.step_selector.value

        # Find the human-readable label for the selected value
        step_name = step
        for option_name, option_value in self.step_selector.options:
            if option_value == step:
                step_name = option_name
                break

        self.update_status(f"Running single step: {step_name}", 0)
        self.log_output(f"🔧 Running single step: {step_name}")

        # Map step values to actual Snakemake targets
        step_targets = {
            'fastqc': ['results/fastqc/sample1_R1_fastqc.html', 'results/fastqc/sample1_R2_fastqc.html'],
            'align_reads': ['results/alignment/sample1.bam'],
            'call_variants': ['results/variants/sample1.vcf'],
            'multiqc': ['results/multiqc_report.html'],
            'pipeline_summary': ['results/pipeline_summary.json']
        }

        targets = step_targets.get(step, [step])

        # Run the specific step
        success, stdout, stderr = self.run_snakemake_command(targets)

        if success:
            self.update_status(f"Step completed: {step_name}", 100)
            self.log_output(f"✅ {step_name} completed successfully")
            if stdout:
                self.log_output(f"Output: {stdout[:200]}...")
        else:
            self.update_status(f"Step failed: {step_name}", 0)
            self.log_output(f"❌ {step_name} failed: {stderr}")
            if stdout:
                self.log_output(f"Stdout: {stdout[:200]}...")

    def stop_pipeline(self, button):
        """Button callback: ask the pipeline thread to stop before its next stage."""
        self.pipeline_running = False
        self.update_status("Pipeline stopped by user", self.progress_bar.value)
        self.log_output("⏹️ Pipeline stopped by user")
        self.run_button.disabled = False
        self.stop_button.disabled = True

    def clear_results(self, button):
        """Button callback: clear both tabs and recreate an empty results/ tree.

        Does nothing while the pipeline is running, since deleting results/
        underneath a running Snakemake job would corrupt that run.
        """
        if self.pipeline_running:
            self.log_output("⚠️ Cannot clear results while the pipeline is running")
            return

        self.output_widget.clear_output()
        self.results_widget.clear_output()
        self.update_status("Results cleared - ready to run", 0)

        # Clean up result files
        import shutil
        if os.path.exists('results'):
            shutil.rmtree('results')
        for subdir in ['fastqc', 'alignment', 'variants']:
            Path(f'results/{subdir}').mkdir(parents=True, exist_ok=True)

    def _build_results_report(self):
        """Return the results report as a list of text lines (reads results/ from disk)."""
        lines = ["📈 PIPELINE RESULTS SUMMARY", "=" * 50]

        # Check for generated files
        result_files = {
            'FastQC Reports': list(Path('results/fastqc').glob('*.html')),
            'BAM Files': list(Path('results/alignment').glob('*.bam')),
            'VCF Files': list(Path('results/variants').glob('*.vcf')),
            'MultiQC Report': list(Path('results').glob('multiqc_report.html')),
            'Summary': list(Path('results').glob('pipeline_summary.json'))
        }

        for category, files in result_files.items():
            lines.append(f"\n{category}:")
            if files:
                lines.extend(f"  ✅ {file}" for file in files)
            else:
                lines.append("  ❌ No files found")

        # Show summary if available
        summary_file = Path('results/pipeline_summary.json')
        if summary_file.exists():
            lines.append("\n📊 PIPELINE SUMMARY:")
            lines.append("-" * 30)
            try:
                with open(summary_file) as f:
                    summary = json.load(f)
                for key, value in summary.items():
                    if isinstance(value, dict):
                        lines.append(f"{key}:")
                        lines.extend(f"  {k}: {v}" for k, v in value.items())
                    else:
                        lines.append(f"{key}: {value}")
            except Exception as e:
                lines.append(f"Error reading summary: {e}")

        return lines

    def _show_results(self):
        """Replace the 'Results & Analysis' tab content with the report and charts.

        Runs on the pipeline thread, so the widget is updated through its
        ``outputs`` trait and ``append_*`` methods rather than ``with widget:``.
        """
        self.results_widget.outputs = ()  # clear previous content
        self.results_widget.append_stdout("\n".join(self._build_results_report()) + "\n")

        # Create simple visualization
        self._create_results_visualization()

    def _create_results_visualization(self):
        """Append two bar charts (read counts, quality scores) to the results tab.

        The plotted numbers are the fixed sample values used by the data
        generator, not values parsed from the pipeline output.
        """
        try:
            # Sample data for visualization
            samples = ['sample1', 'sample2', 'sample3']
            read_counts = [15000, 12000, 18000]
            quality_scores = [35, 30, 32]

            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

            # Read counts
            ax1.bar(samples, read_counts, color=['#1f77b4', '#ff7f0e', '#2ca02c'])
            ax1.set_title('Read Counts per Sample')
            ax1.set_ylabel('Number of Reads')
            ax1.tick_params(axis='x', rotation=45)

            # Quality scores
            ax2.bar(samples, quality_scores, color=['#d62728', '#9467bd', '#8c564b'])
            ax2.set_title('Average Quality Scores')
            ax2.set_ylabel('Phred Quality Score')
            ax2.tick_params(axis='x', rotation=45)

            fig.tight_layout()
            # Hand the figure to the widget directly, then close it so the
            # inline backend does not render it a second time elsewhere.
            self.results_widget.append_display_data(fig)
            plt.close(fig)

        except Exception as e:
            self.results_widget.append_stdout(f"Could not generate visualization: {e}\n")

    def display(self):
        """Return the dashboard widget so the caller can display it."""
        return self.dashboard

In [10]:
# 9. Run the Interactive Dashboard
print("🧬 Bioinformatics Pipeline Dashboard")
print("=" * 50)
print("This interactive dashboard allows you to:")
print("• Run the complete pipeline or individual steps")
print("• Monitor progress in real-time")
print("• View results and analysis")
print("• Manage pipeline execution")
print("\nThe pipeline includes:")
print("• FastQC quality control (real tool)")
print("• Read alignment (mock)")
print("• Variant calling (mock)")
print("• MultiQC reporting (mock)")
print("• Results summarization")

# Create and display the controller
controller = BioinformaticsPipelineController()

# Display the interactive dashboard
display(controller.display())

# Also display some quick action buttons for convenience
print("\n" + "="*60)
print("📊 QUICK ACTIONS:")

# Create some additional standalone widgets for immediate use
quick_run_button = widgets.Button(
    description='🚀 Quick Start Full Pipeline',
    button_style='success',
    layout=widgets.Layout(width='250px', height='40px')
)

quick_fastqc_button = widgets.Button(
    description='🔬 Run FastQC Only',
    button_style='info',
    layout=widgets.Layout(width='250px', height='40px')
)

view_files_button = widgets.Button(
    description='📁 View Project Files',
    button_style='warning',
    layout=widgets.Layout(width='250px', height='40px')
)

# Create output area for quick actions
quick_output = widgets.Output()

def quick_run_pipeline(b):
    """Click handler for the 'Quick Start Full Pipeline' button.

    Clears the quick-actions output area, prints a start message into it and
    forwards the click to ``controller.run_full_pipeline``.

    Args:
        b: The argument supplied by the button's ``on_click`` callback;
            forwarded unchanged to the controller.
    """
    # Everything printed inside this context is routed to quick_output.
    with quick_output:
        clear_output()
        print("🚀 Starting quick pipeline run...")
        controller.run_full_pipeline(b)

def quick_run_fastqc(b):
    """Click handler for the 'Run FastQC Only' button.

    Clears the quick-actions output area, prints a start message, selects the
    ``'fastqc'`` step on the controller and forwards the click to
    ``controller.run_single_step``.

    Args:
        b: The argument supplied by the button's ``on_click`` callback;
            forwarded unchanged to the controller.
    """
    with quick_output:
        clear_output()
        print("🔬 Running FastQC analysis...")
        # Side effect: overwrites the controller's step selector value
        # (presumably the dashboard's step dropdown - confirm against the
        # controller class) so run_single_step picks up 'fastqc'.
        controller.step_selector.value = 'fastqc'
        controller.run_single_step(b)

def quick_view_files(b):
    """Click handler for the 'View Project Files' button.

    Clears the quick-actions output area and prints the project file tree
    into it via ``show_project_structure()``.

    ``show_project_structure`` is defined in a later cell (section 10,
    "Additional utility functions"). If the button is clicked before that cell
    has been executed, a short hint is printed instead of a NameError
    traceback.

    Args:
        b: The argument supplied by the button's ``on_click`` callback (unused).
    """
    with quick_output:
        clear_output()
        # Resolve the helper at click time so a not-yet-run cell is reported
        # clearly rather than surfacing as an unexplained NameError.
        tree_printer = globals().get('show_project_structure')
        if tree_printer is None:
            print("⚠️ show_project_structure() is not defined yet - "
                  "run the utility functions cell (section 10) first.")
            return
        tree_printer()

# Wire each quick-action button to its click handler.
quick_run_button.on_click(quick_run_pipeline)
quick_fastqc_button.on_click(quick_run_fastqc)
view_files_button.on_click(quick_view_files)

# Horizontal row holding the three buttons.
quick_buttons = widgets.HBox(
    children=[quick_run_button, quick_fastqc_button, view_files_button],
    layout=widgets.Layout(margin='10px 0'),
)

# Heading, button row and shared output area stacked vertically.
display(
    widgets.VBox(
        children=[
            widgets.HTML("<h3>Quick Actions</h3>"),
            quick_buttons,
            quick_output,
        ]
    )
)

🧬 Bioinformatics Pipeline Dashboard
This interactive dashboard allows you to:
• Run the complete pipeline or individual steps
• Monitor progress in real-time
• View results and analysis
• Manage pipeline execution

The pipeline includes:
• FastQC quality control (real tool)
• Read alignment (mock)
• Variant calling (mock)
• MultiQC reporting (mock)
• Results summarization


VBox(children=(HTML(value='<h2>🧬 Bioinformatics Pipeline Controller</h2>', layout=Layout(margin='0 0 20px 0'))…


📊 QUICK ACTIONS:


VBox(children=(HTML(value='<h3>Quick Actions</h3>'), HBox(children=(Button(button_style='success', description…

In [11]:
# 10. Additional utility functions
def show_project_structure(root=".", max_depth=3):
    """Print an indented tree of the files and directories under ``root``.

    Args:
        root: Directory whose contents are listed. Defaults to the current
            working directory, which matches the original zero-argument call.
        max_depth: Number of directory levels to print (1 prints only the
            direct children of ``root``). Defaults to 3.

    Returns:
        None. The tree is written to stdout.
    """
    print("📁 PROJECT STRUCTURE:")
    print("=" * 30)

    if not Path(root).is_dir():
        print(f"❌ Not a directory: {root}")
        return

    def print_tree(directory, prefix="", current_depth=0):
        """Print one directory level and recurse into its subdirectories."""
        # Single depth guard: recursing one level too deep simply returns.
        if current_depth >= max_depth:
            return

        try:
            items = sorted(Path(directory).iterdir())
            for i, item in enumerate(items):
                is_last = i == len(items) - 1
                connector = "└── " if is_last else "├── "
                print(f"{prefix}{connector}{item.name}")

                if item.is_dir():
                    next_prefix = prefix + ("    " if is_last else "│   ")
                    print_tree(item, next_prefix, current_depth + 1)
        except PermissionError:
            # Best effort: keep walking the rest of the tree, but make the
            # unreadable directory visible instead of silently skipping it.
            print(f"{prefix}└── [permission denied]")

    print_tree(root)

def check_dependencies():
    """Report which command-line tools used by the pipeline are on PATH.

    Prints one table row per tool: name, availability, and a short
    description of what the tool is used for.

    Returns:
        None. The report is written to stdout.
    """
    # Local import: shutil is not among the notebook's top-level imports.
    import shutil

    print("🔍 DEPENDENCY CHECK:")
    print("=" * 25)

    dependencies = {
        'snakemake': 'Workflow management',
        'fastqc': 'Quality control (optional - will use mock if unavailable)',
        'python': 'Required for pipeline execution'
    }

    for tool, description in dependencies.items():
        # shutil.which searches PATH in-process, so the check does not depend
        # on an external `which` executable being installed (e.g. on Windows).
        status = "✅ Available" if shutil.which(tool) else "❌ Not found"
        print(f"{tool:15} | {status:15} | {description}")

def view_sample_data(samples=('sample1', 'sample2', 'sample3'), data_dir="data/raw"):
    """Print the on-disk size of each sample's paired FASTQ files.

    For every sample the files ``<data_dir>/<sample>_R1.fastq.gz`` and
    ``<data_dir>/<sample>_R2.fastq.gz`` are looked up individually, so a
    sample with only one mate present is reported instead of raising
    ``FileNotFoundError``.

    Args:
        samples: Iterable of sample name prefixes. Defaults to the three
            sample names used by this notebook.
        data_dir: Directory containing the FASTQ files. Defaults to
            ``"data/raw"``.

    Returns:
        None. The report is written to stdout.
    """
    print("🧪 SAMPLE DATA INFORMATION:")
    print("=" * 35)

    for sample in samples:
        mates = [(mate, f"{data_dir}/{sample}_{mate}.fastq.gz") for mate in ("R1", "R2")]

        if not any(os.path.exists(path) for _, path in mates):
            print(f"{sample}: Files not found")
            continue

        print(f"{sample}:")
        for mate, path in mates:
            # Each mate is checked on its own; the presence of R1 says
            # nothing about whether R2 exists.
            if os.path.exists(path):
                print(f"  {mate}: {path} ({os.path.getsize(path):,} bytes)")
            else:
                print(f"  {mate}: {path} (missing)")

In [12]:
# 11. Make utility functions available
# One print call; the lines are joined with newlines.
print(
    "\n📚 UTILITY FUNCTIONS AVAILABLE:",
    "• show_project_structure() - Display project file tree",
    "• check_dependencies() - Check for required tools",
    "• view_sample_data() - Show sample data information",
    sep="\n",
)


📚 UTILITY FUNCTIONS AVAILABLE:
• show_project_structure() - Display project file tree
• check_dependencies() - Check for required tools
• view_sample_data() - Show sample data information


In [13]:
# 12. Alternative Controls (if widgets don't display)
# Framed heading followed by a two-line hint, emitted by a single print call.
print(
    "\n" + "=" * 60,
    "🔧 ALTERNATIVE PIPELINE CONTROLS",
    "=" * 60,
    "If the interactive widgets above don't display properly,",
    "you can use these direct function calls:",
    sep="\n",
)


🔧 ALTERNATIVE PIPELINE CONTROLS
If the interactive widgets above don't display properly,
you can use these direct function calls:


In [14]:
## 13.  Utility function to Run Pipeline directly ##
def run_pipeline_directly():
    """Run every pipeline stage through Snakemake without using the widgets.

    The function first verifies that the Snakefile, the first sample's R1
    FASTQ file and the mock FastQC script exist, then checks once that the
    ``snakemake`` executable can be started. Each stage is then requested
    with ``snakemake -j 1 <targets>`` and a 60 second timeout. Stages that
    fail, time out or raise are reported and counted; the remaining stages
    are still attempted. When Snakemake itself cannot be started the function
    returns immediately after printing an install hint.

    Returns:
        None. Progress and results are written to stdout.
    """
    print("🚀 Running full pipeline directly...")

    # Check setup first
    required_files = ['Snakefile', 'data/raw/sample1_R1.fastq.gz', 'scripts/mock_fastqc.py']
    missing = [f for f in required_files if not os.path.exists(f)]
    if missing:
        print(f"❌ Missing files: {', '.join(missing)}")
        print("📝 Please run the setup cells first (cells 2-5)")
        return

    # Probe Snakemake once, before any stage runs. A missing executable raises
    # FileNotFoundError (an OSError), not CalledProcessError, so both must be
    # caught for the install hint to appear.
    try:
        version_result = subprocess.run(
            ['snakemake', '--version'], capture_output=True, check=True, text=True
        )
    except (OSError, subprocess.CalledProcessError):
        print("❌ Snakemake not available. Install with: pip install snakemake")
        return
    print(f"   Using Snakemake: {version_result.stdout.strip()}")

    steps = [
        ("FastQC Quality Control", ["results/fastqc/sample1_R1_fastqc.html", "results/fastqc/sample1_R2_fastqc.html"]),
        ("Read Alignment", ["results/alignment/sample1.bam"]),
        ("Variant Calling", ["results/variants/sample1.vcf"]),
        ("MultiQC Report", ["results/multiqc_report.html"]),
        ("Pipeline Summary", ["results/pipeline_summary.json"])
    ]

    failed_steps = []
    for step_name, targets in steps:
        print(f"\n📊 Running: {step_name}")

        # Use compatible command syntax
        cmd = ['snakemake', '-j', '1'] + targets
        print(f"   Command: {' '.join(cmd)}")

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
        except subprocess.TimeoutExpired:
            print(f"⏰ {step_name} timed out")
            failed_steps.append(step_name)
            continue
        except Exception as e:
            print(f"❌ Error running {step_name}: {e}")
            failed_steps.append(step_name)
            continue

        if result.returncode == 0:
            print(f"✅ {step_name} completed successfully")
            # Show the first two non-empty lines of output, if any
            if result.stdout and result.stdout.strip():
                for line in result.stdout.strip().split('\n')[:2]:
                    if line.strip():
                        print(f"   {line.strip()}")
        else:
            print(f"❌ {step_name} failed (return code: {result.returncode})")
            if result.stderr:
                print(f"   Error: {result.stderr[:300]}...")
            if result.stdout:
                print(f"   Output: {result.stdout[:200]}...")
            failed_steps.append(step_name)

    # Only claim success when every stage actually succeeded.
    if failed_steps:
        print(f"\n⚠️ Pipeline finished with {len(failed_steps)} failed step(s): {', '.join(failed_steps)}")
    else:
        print("\n🎉 Pipeline execution complete!")
    show_results_summary()

def run_fastqc_only():
    """Run only the FastQC targets for sample1 through Snakemake.

    Verifies that the first sample's R1 FASTQ file exists, checks that the
    ``snakemake`` executable can be started, then requests the two sample1
    FastQC HTML reports with a 60 second timeout. On success the HTML files
    found in ``results/fastqc`` are listed.

    Returns:
        None. Progress and results are written to stdout.
    """
    print("🔬 Running FastQC analysis only...")

    # Check setup
    if not os.path.exists('data/raw/sample1_R1.fastq.gz'):
        print("❌ Sample data not found. Run setup cells first.")
        return

    try:
        # Check Snakemake availability
        version_result = subprocess.run(['snakemake', '--version'], capture_output=True, check=True, text=True)
        print(f"Using Snakemake: {version_result.stdout.strip()}")

        # Use compatible command syntax
        cmd = ['snakemake', '-j', '1',
               'results/fastqc/sample1_R1_fastqc.html',
               'results/fastqc/sample1_R2_fastqc.html']

        print(f"Command: {' '.join(cmd)}")

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

        if result.returncode == 0:
            print("✅ FastQC completed successfully")
            print("📁 Check results/fastqc/ for HTML reports")
            # Show generated files in a stable order
            for file in sorted(Path('results/fastqc').glob('*.html')):
                print(f"   Generated: {file}")
        else:
            print(f"❌ FastQC failed (return code: {result.returncode})")
            if result.stderr:
                print(f"   Error: {result.stderr[:300]}...")
            if result.stdout:
                print(f"   Output: {result.stdout[:200]}...")

    except (FileNotFoundError, subprocess.CalledProcessError):
        # A missing executable raises FileNotFoundError rather than
        # CalledProcessError, so both lead to the install hint.
        print("❌ Snakemake not available. Install with: pip install snakemake")
    except subprocess.TimeoutExpired:
        print("⏰ FastQC timed out")
    except Exception as e:
        print(f"❌ Error running FastQC: {e}")

def test_snakemake_setup():
    """Run four sequential checks of the Snakemake environment.

    The checks are: the ``snakemake`` executable reports a version, a
    ``Snakefile`` exists in the working directory, both sample1 FASTQ files
    exist, and ``snakemake --dry-run -j 1`` exits with status 0. The first
    failing check prints a diagnostic and ends the function.

    Returns:
        bool: True when all four checks pass, otherwise False.
    """
    print("🧪 Testing Snakemake Setup")
    print("=" * 30)

    import subprocess

    # Check 1: is Snakemake installed?
    try:
        version_proc = subprocess.run(['snakemake', '--version'], capture_output=True, check=True, text=True)
        print(f"✅ Snakemake version: {version_proc.stdout.strip()}")
    except Exception as err:
        print(f"❌ Snakemake not found: {err}")
        print("📝 Install with: pip install snakemake")
        return False

    # Check 2: is there a Snakefile?
    if not os.path.exists('Snakefile'):
        print("❌ Snakefile missing - run setup cells first")
        return False
    print("✅ Snakefile found")

    # Check 3: are both sample1 read files present?
    expected_reads = ['data/raw/sample1_R1.fastq.gz', 'data/raw/sample1_R2.fastq.gz']
    absent_reads = [path for path in expected_reads if not os.path.exists(path)]
    if absent_reads:
        print(f"❌ Missing sample files: {absent_reads}")
        return False
    print("✅ Sample data found")

    # Check 4: does a dry run succeed?
    try:
        dry_run = subprocess.run(['snakemake', '--dry-run', '-j', '1'],
                                 capture_output=True, text=True, timeout=30)
        if dry_run.returncode != 0:
            print(f"❌ Snakemake dry run failed:")
            print(f"   {dry_run.stderr[:200]}...")
            return False
        print("✅ Snakemake dry run successful")
        return True
    except Exception as err:
        print(f"❌ Error testing Snakemake: {err}")
        return False

def show_results_summary():
    """Print which pipeline outputs exist and how large the sample inputs are.

    The first section lists, per output category, the matching files found
    under ``results/`` (sorted for a stable order) or a "No files found"
    line. The second section prints the byte size of each sample's R1/R2
    FASTQ file under ``data/raw``; each mate is checked individually so a
    sample with only one file present is reported instead of raising
    ``FileNotFoundError``. Samples with neither file are skipped.

    Returns:
        None. The summary is written to stdout.
    """
    print("\n📈 PIPELINE RESULTS SUMMARY")
    print("=" * 50)

    # Check for generated files
    result_files = {
        'FastQC Reports': sorted(Path('results/fastqc').glob('*.html')),
        'BAM Files': sorted(Path('results/alignment').glob('*.bam')),
        'VCF Files': sorted(Path('results/variants').glob('*.vcf')),
        'MultiQC Report': sorted(Path('results').glob('multiqc_report.html')),
        'Summary': sorted(Path('results').glob('pipeline_summary.json'))
    }

    for category, files in result_files.items():
        print(f"\n{category}:")
        if files:
            for file in files:
                print(f"  ✅ {file}")
        else:
            print("  ❌ No files found")

    # Show file sizes
    print("\n📊 SAMPLE DATA SIZES:")
    for sample in ['sample1', 'sample2', 'sample3']:
        found_any = False
        size_parts = []
        for mate in ("R1", "R2"):
            path = f"data/raw/{sample}_{mate}.fastq.gz"
            if os.path.exists(path):
                found_any = True
                size_parts.append(f"{mate}={os.path.getsize(path):,} bytes")
            else:
                size_parts.append(f"{mate}=missing")

        if found_any:
            print(f"  {sample}: {', '.join(size_parts)}")

# Catalogue of the helper functions followed by a suggested calling order,
# emitted by one print call with newline separators.
print(
    "\n📋 AVAILABLE FUNCTIONS:",
    "• test_snakemake_setup() - Test if everything is configured correctly",
    "• run_pipeline_directly() - Run the complete pipeline",
    "• run_fastqc_only() - Run just quality control",
    "• show_results_summary() - Display results overview",
    "• show_project_structure() - Show file tree",
    "• check_dependencies() - Check installed tools",
    "• view_sample_data() - Show sample information",
    "\n💡 EXAMPLE USAGE:",
    "test_snakemake_setup()   # Check setup first",
    "run_fastqc_only()        # Quick test",
    "run_pipeline_directly()  # Full analysis",
    "show_results_summary()   # View outputs",
    sep="\n",
)

# Test Widget Display and Final Instructions
print("🧪 TESTING WIDGET DISPLAY", "=" * 30, sep="\n")

# Minimal smoke test: an output area plus one button that writes into it.
test_output = widgets.Output()
test_button = widgets.Button(
    layout=widgets.Layout(width='200px'),
    button_style='info',
    description='🧪 Test Widget',
)

def test_widget_click(b):
    """Click handler: replace the test output area's content with a success note."""
    with test_output:
        clear_output()
        print("✅ Widgets are working correctly!")
        print("You should see the interactive dashboard above.")

test_button.on_click(test_widget_click)

print("If you see a button below, widgets are enabled:")
display(widgets.VBox(children=[test_button, test_output]))

# Closing instructions: what the reader should see, how to troubleshoot
# missing widgets, and what to do next.
print("\n🎯 WHAT YOU SHOULD SEE ABOVE:")
print("1. 🧬 Interactive Dashboard with buttons:")
print("   • Run Full Pipeline (green)")
print("   • Run Single Step (blue)")
print("   • Stop Pipeline (red)")
print("   • Clear Results (orange)")
print("2. 📊 Step selector dropdown")
print("3. 📈 Progress bar")
print("4. 📋 Tabbed output areas")
print("5. 🚀 Quick action buttons")

print("\n🔧 TROUBLESHOOTING:")
print("If widgets don't display:")
print("1. Run: pip install ipywidgets")
print("2. Run: jupyter nbextension enable --py widgetsnbextension")
print("3. Restart your Jupyter kernel")
# The direct-call fallbacks are defined at the top of this cell; the earlier
# "Cell 9" wording pointed at the dashboard cell instead.
print("4. Use the direct functions defined in this cell (e.g. run_pipeline_directly())")

print("\n🎉 READY TO USE!")
print("Your bioinformatics pipeline is set up and ready.")
print("Use the interactive dashboard above or the direct functions below.")

# Final summary
# NOTE(review): sample_names is not defined in this cell; it is expected to
# exist from an earlier setup cell - confirm it survives Restart & Run All.
print("\n📁 Project created with:")
print(f"• {len(sample_names)} sample datasets")
print("• Complete Snakemake workflow")
print("• Interactive Jupyter controls")
print("• Mock bioinformatics tools")
print("• Real FastQC integration")

print("\n🏁 Next steps:")
print("1. Test setup: test_snakemake_setup()")
print("2. Click 'Run Full Pipeline' in the dashboard above")
print("3. Or call run_pipeline_directly() below")
print("4. Monitor progress and view results")
print("5. Explore the generated files in results/")


📋 AVAILABLE FUNCTIONS:
• test_snakemake_setup() - Test if everything is configured correctly
• run_pipeline_directly() - Run the complete pipeline
• run_fastqc_only() - Run just quality control
• show_results_summary() - Display results overview
• show_project_structure() - Show file tree
• check_dependencies() - Check installed tools
• view_sample_data() - Show sample information

💡 EXAMPLE USAGE:
test_snakemake_setup()   # Check setup first
run_fastqc_only()        # Quick test
run_pipeline_directly()  # Full analysis
show_results_summary()   # View outputs
🧪 TESTING WIDGET DISPLAY
If you see a button below, widgets are enabled:


VBox(children=(Button(button_style='info', description='🧪 Test Widget', layout=Layout(width='200px'), style=Bu…


🎯 WHAT YOU SHOULD SEE ABOVE:
1. 🧬 Interactive Dashboard with buttons:
   • Run Full Pipeline (green)
   • Run Single Step (blue)
   • Stop Pipeline (red)
   • Clear Results (orange)
2. 📊 Step selector dropdown
3. 📈 Progress bar
4. 📋 Tabbed output areas
5. 🚀 Quick action buttons

🔧 TROUBLESHOOTING:
If widgets don't display:
1. Run: pip install ipywidgets
2. Run: jupyter nbextension enable --py widgetsnbextension
3. Restart your Jupyter kernel
4. Use the alternative functions in Cell 9

🎉 READY TO USE!
Your bioinformatics pipeline is set up and ready.
Use the interactive dashboard above or the direct functions below.

📁 Project created with:
• 3 sample datasets
• Complete Snakemake workflow
• Interactive Jupyter controls
• Mock bioinformatics tools
• Real FastQC integration

🏁 Next steps:
1. Test setup: test_snakemake_setup()
2. Click 'Run Full Pipeline' in the dashboard above
3. Or call run_pipeline_directly() below
4. Monitor progress and view results
5. Explore the generated files in results/

In [None]:
## End of Notebook ##