In [1]:
import os

import pandas as pd

# Load the SRA run table; the first line of the file supplies the column names.
df = pd.read_csv(
    'metadata/OSA_SraRunTable.csv',
    header=0,
)

In [2]:
def generate_sample_names(samples_df, naming_scheme='sequential'):
    """
    Generate sample names based on the chosen naming scheme

    Args:
        samples_df: DataFrame with one row per sample. Every scheme reads the
            'Sample Name' column (the GSM ID). 'status' additionally reads
            'status', 'sex' and 'AGE'; 'sequential' additionally reads
            'status' and 'ahi'.
        naming_scheme: 'gsm', 'status', or 'sequential'
            - 'gsm': the GSM ID is used unchanged
            - 'status': "<status>_<sex>_<age>", with "_1", "_2", ... appended
              when that name is already taken
            - 'sequential': "<status>_NNN", numbered within each status in
              order of decreasing 'ahi'

    Returns:
        Dictionary mapping GSM IDs to sample names

    Raises:
        ValueError: If naming_scheme is not one of the supported schemes
    """
    if naming_scheme == 'gsm':
        return {gsm: gsm for gsm in samples_df['Sample Name']}

    if naming_scheme == 'status':
        sample_mapping = {}
        # Names handed out so far; a set keeps the duplicate check O(1)
        # instead of rescanning every existing mapping value per row.
        used_names = set()

        columns = zip(
            samples_df['Sample Name'],
            samples_df['status'],
            samples_df['sex'],
            samples_df['AGE'],
        )
        for gsm, status, sex, age in columns:
            base_name = f"{status}_{sex}_{int(age)}"

            # Handle duplicates by adding suffix
            sample_name = base_name
            counter = 1
            while sample_name in used_names:
                sample_name = f"{base_name}_{counter}"
                counter += 1

            used_names.add(sample_name)
            sample_mapping[gsm] = sample_name
        return sample_mapping

    if naming_scheme == 'sequential':
        # Sort by status and then by AHI (severity), highest AHI first
        samples_sorted = samples_df.sort_values(['status', 'ahi'], ascending=[True, False])

        # Running count of samples seen per status
        status_counts = {}
        sample_mapping = {}
        for gsm, status in zip(samples_sorted['Sample Name'], samples_sorted['status']):
            status_counts[status] = status_counts.get(status, 0) + 1
            sample_mapping[gsm] = f"{status}_{status_counts[status]:03d}"
        return sample_mapping

    # Fail loudly: an empty mapping here would only surface later as a
    # KeyError in whichever caller looks a sample up.
    raise ValueError(
        f"Unknown naming_scheme {naming_scheme!r}; expected 'gsm', 'status', or 'sequential'"
    )

In [3]:
def generate_samplesheet(metadata_file, fastq_base_dir, output_file, naming_scheme='sequential'):
    """
    Generate samplesheet grouping runs by GSM ID

    Two CSV files are written: ``output_file`` holds only the 'sample_id' and
    'fastq_dir' columns, and a sibling file with ``_full`` inserted before the
    extension (e.g. samples.csv -> samples_full.csv) also carries the run list
    and per-sample metadata.

    Args:
        metadata_file: Path to SRA metadata CSV file (read with the
            ``pd.read_csv`` defaults). Must provide the columns
            'Sample Name', 'Run', 'sex', 'AGE', 'BMI', 'ahi' and 'status'.
        fastq_base_dir: Base directory where FASTQ files are stored
        output_file: Output samplesheet CSV file
        naming_scheme: How to name samples ('gsm', 'status', or 'sequential')

    Returns:
        The full samplesheet DataFrame, one row per biological sample
    """

    # Read metadata
    df = pd.read_csv(metadata_file)

    def safe_agg(x, field_name):
        """Aggregate by taking first value, but validate all are the same"""
        unique_vals = x.unique()
        if len(unique_vals) > 1:
            print(f"WARNING: Inconsistent {field_name} values found: {unique_vals}")
            print(f"         Using first value: {unique_vals[0]}")
        return unique_vals[0]

    # Group by Sample Name (GSM ID) to get unique biological samples.
    # `field=field` binds the column name at definition time so each
    # aggregator reports the right field in its warning.
    agg_spec = {'Run': lambda x: list(x)}  # List of all SRR runs
    for field in ('sex', 'AGE', 'BMI', 'ahi', 'status'):
        agg_spec[field] = lambda x, field=field: safe_agg(x, field)
    samples = df.groupby('Sample Name').agg(agg_spec).reset_index()

    # Generate sample names based on scheme
    sample_mapping = generate_sample_names(samples, naming_scheme)

    # Create samplesheet
    samplesheet_data = []

    for _, row in samples.iterrows():
        gsm_id = row['Sample Name']
        sample_id = sample_mapping[gsm_id]
        runs = row['Run']

        samplesheet_data.append({
            'sample_id': sample_id,
            'gsm_id': gsm_id,
            # Path to directory containing all FASTQ files for this sample
            'fastq_dir': f"{fastq_base_dir}/{sample_id}",
            'n_runs': len(runs),
            'runs': ','.join(runs),
            'sex': row['sex'],
            'age': row['AGE'],
            'bmi': row['BMI'],
            'ahi': row['ahi'],
            'status': row['status']
        })

    # Create DataFrame; naming the columns keeps them present even when
    # there are no rows, so the column selection below still works.
    samplesheet_df = pd.DataFrame(
        samplesheet_data,
        columns=['sample_id', 'gsm_id', 'fastq_dir', 'n_runs', 'runs',
                 'sex', 'age', 'bmi', 'ahi', 'status'],
    )

    # Derive the "full" path from the extension only. A plain
    # str.replace('.csv', ...) would also rewrite '.csv' inside directory
    # names, and would leave the path unchanged (so the minimal file would
    # overwrite the full one) when output_file has no '.csv' in it.
    output_root, output_ext = os.path.splitext(output_file)
    full_output_file = f"{output_root}_full{output_ext}"

    # Save full version with metadata
    samplesheet_df.to_csv(full_output_file, index=False)
    print(f"Full samplesheet saved to: {full_output_file}")

    # Save minimal version for pipeline
    minimal_df = samplesheet_df[['sample_id', 'fastq_dir']]
    minimal_df.to_csv(output_file, index=False)
    print(f"Minimal samplesheet saved to: {output_file}")

    # Print summary
    print("\nSummary:")
    print(f"Total biological samples: {len(samplesheet_df)}")
    print(f"Total sequencing runs: {samplesheet_df['n_runs'].sum()}")
    print("\nRuns per sample:")
    print(samplesheet_df['n_runs'].value_counts().sort_index())

    print("\nSample breakdown by status:")
    print(samplesheet_df['status'].value_counts())

    return samplesheet_df

In [4]:
# Build the samplesheet for the OSA run table using status-based sequential names.
df = generate_samplesheet(
    metadata_file='metadata/OSA_SraRunTable.csv',
    fastq_base_dir='fastq',
    output_file='metadata/samples.csv',
    naming_scheme='sequential',
)

Full samplesheet saved to: metadata/samples_full.csv
Minimal samplesheet saved to: metadata/samples.csv

Summary:
Total biological samples: 22
Total sequencing runs: 88

Runs per sample:
n_runs
4    22
Name: count, dtype: int64

Sample breakdown by status:
status
NoOSA    11
OSA      11
Name: count, dtype: int64


In [5]:
def create_download_script(metadata_file, output_script, naming_scheme='status'):
    """
    Create a bash script to download FASTQ files organized by sample

    Sample names come from ``generate_sample_names`` so the directories this
    script creates (``fastq/<sample_name>``) carry the same names that
    ``generate_samplesheet`` assigns for the same metadata and scheme.
    A CSV with the GSM -> sample name mapping is written next to the script
    as ``<script name without extension>_mapping.csv``.

    Args:
        metadata_file: Path to SRA metadata CSV file. Must provide the columns
            'Sample Name', 'Run', 'sex', 'AGE' and 'status', plus 'ahi' when
            naming_scheme is 'sequential'.
        output_script: Path of the bash script to write
        naming_scheme: How to name samples
            - 'gsm': Use GSM IDs (e.g., GSM6617010)
            - 'status': Use status_sex_age (e.g., OSA_female_4)
            - 'sequential': Use status with numbers (e.g., OSA_001, NoOSA_001)

    Raises:
        ValueError: If naming_scheme is 'sequential' and the metadata has no
            'ahi' column to order the samples by
    """

    df = pd.read_csv(metadata_file)

    def first_value(x):
        """Take the first value recorded for the sample."""
        return x.iloc[0]

    agg_spec = {
        'Run': lambda x: list(x),
        'sex': first_value,
        'AGE': first_value,
        'status': first_value,
    }
    if naming_scheme == 'sequential':
        # Sequential names are numbered by decreasing AHI within each status,
        # so the grouped frame has to carry 'ahi' for generate_sample_names.
        if 'ahi' not in df.columns:
            raise ValueError(
                "naming_scheme='sequential' needs an 'ahi' column in the metadata"
            )
        agg_spec['ahi'] = first_value

    # Group by Sample Name
    sample_groups = df.groupby('Sample Name').agg(agg_spec).reset_index()

    # Generate sample names with the shared helper rather than a private copy
    # of the naming rules, so script and samplesheet cannot drift apart.
    sample_mapping = generate_sample_names(sample_groups, naming_scheme)

    script_lines = [
        "#!/bin/bash\n\n",
        "# Download and organize FASTQ files by sample\n",
        "# Files are renamed to Cell Ranger format\n",
        f"# Naming scheme: {naming_scheme}\n",
        "# Uses SRA Toolkit (prefetch + fasterq-dump)\n\n",
        "set -e\n\n",
        "# Check if required tools are installed\n",
        "command -v prefetch >/dev/null 2>&1 || { echo 'prefetch not found. Install SRA Toolkit.'; exit 1; }\n",
        "command -v fasterq-dump >/dev/null 2>&1 || { echo 'fasterq-dump not found. Install SRA Toolkit.'; exit 1; }\n",
        # The tool actually used below is gzip, so that is what the message names.
        "command -v gzip >/dev/null 2>&1 || { echo 'gzip not found. Install gzip for compression.'; exit 1; }\n\n",
    ]

    # Write mapping table as comment
    script_lines.append("# Sample Mapping:\n")
    for gsm in sample_groups['Sample Name']:
        script_lines.append(f"# {gsm} -> {sample_mapping[gsm]}\n")
    script_lines.append("\n")

    for _, row in sample_groups.iterrows():
        gsm = row['Sample Name']
        sample_name = sample_mapping[gsm]
        runs = row['Run']

        script_lines += [
            f"\n# {'='*60}\n",
            f"# Sample: {sample_name} (Original: {gsm})\n",
            f"# Status: {row['status']}, Sex: {row['sex']}, Age: {row['AGE']}\n",
            f"# {'='*60}\n\n",
            f"echo 'Processing {sample_name}...'\n",
            f"mkdir -p fastq/{sample_name}\n",
            f"cd fastq/{sample_name}\n\n",
        ]

        for lane, run in enumerate(runs, 1):
            script_lines += _run_download_commands(run, lane, len(runs), sample_name)

        script_lines += [
            "cd ../..\n",
            # \u2713 is the check mark; written as an escape so the notebook's
            # own encoding cannot garble it.
            f"echo '\u2713 Completed {sample_name}'\n",
            "echo ''\n\n",
        ]

    script_lines += [
        "\necho 'All samples downloaded and renamed!'\n",
        "echo 'Directory structure:'\n",
        # `tree` is optional: under `set -e` a missing binary would otherwise
        # make a fully successful run exit with an error.
        "if command -v tree >/dev/null 2>&1; then tree fastq/ -L 2; else ls fastq/; fi\n",
    ]

    # Explicit UTF-8 so the check mark is written the same way on every platform
    with open(output_script, 'w', encoding='utf-8') as f:
        f.writelines(script_lines)

    # Save sample mapping to CSV (one row per sample, in sample_groups order)
    gsm_ids = sample_groups['Sample Name']
    mapping_df = pd.DataFrame({
        'gsm_id': gsm_ids,
        'sample_name': gsm_ids.map(sample_mapping),
        'status': sample_groups['status'],
        'sex': sample_groups['sex'],
        'AGE': sample_groups['AGE'],
    })
    # Strip only the real extension: str.replace('.sh', ...) left the path
    # unchanged for scripts not named *.sh, which made this CSV overwrite the
    # script that was just written.
    script_root, _ = os.path.splitext(output_script)
    mapping_file = f"{script_root}_mapping.csv"
    mapping_df.to_csv(mapping_file, index=False)

    print(f"Download script saved to: {output_script}")
    print(f"Sample mapping saved to: {mapping_file}")
    print(f"Make executable with: chmod +x {output_script}")


def _run_download_commands(run, lane, n_lanes, sample_name):
    """Return the bash lines that download, rename and compress one SRR run."""
    # Zero-pad the lane to three digits so lane 10 and above give L010, not L0010
    fastq_prefix = f"{sample_name}_S1_L{lane:03d}"
    r1_fastq = f"{fastq_prefix}_R1_001.fastq"
    r2_fastq = f"{fastq_prefix}_R2_001.fastq"

    return [
        f"# Lane {lane}/{n_lanes}: {run}\n",
        f"echo '  Downloading {run} (Lane {lane})...'\n",
        f"prefetch {run} || {{ echo 'Failed to prefetch {run}'; exit 1; }}\n",
        f"fasterq-dump {run} --split-files --include-technical --threads 8 || {{ echo 'Failed to dump {run}'; exit 1; }}\n\n",
        "# For 10x data: _3 = R1 (28bp barcode+UMI), _4 = R2 (90bp cDNA)\n",
        "# Rename to Cell Ranger format and discard index reads (_1, _2)\n",
        f"if [ -f {run}_3.fastq ] && [ -f {run}_4.fastq ]; then\n",
        "    echo '  Using _3 as R1 and _4 as R2 (10x Genomics format)'\n",
        f"    mv {run}_3.fastq {r1_fastq}\n",
        f"    mv {run}_4.fastq {r2_fastq}\n",
        f"    rm -f {run}_1.fastq {run}_2.fastq  # Remove index reads\n",
        f"elif [ -f {run}_1.fastq ] && [ -f {run}_2.fastq ]; then\n",
        "    echo '  Using _1 as R1 and _2 as R2 (standard paired-end)'\n",
        f"    mv {run}_1.fastq {r1_fastq}\n",
        f"    mv {run}_2.fastq {r2_fastq}\n",
        "else\n",
        f"    echo 'ERROR: Cannot find expected FASTQ files for {run}'\n",
        f"    ls -lh {run}*.fastq\n",
        "    exit 1\n",
        "fi\n\n",
        "# Compress FASTQ files\n",
        "echo '  Compressing...'\n",
        f"gzip {r1_fastq}\n",
        f"gzip {r2_fastq}\n\n",
        "# Clean up SRA file\n",
        f"rm -rf {run}\n\n",
    ]


In [6]:
# Write the download script (and its mapping CSV) using sequential sample names.
create_download_script(
    metadata_file='metadata/OSA_SraRunTable.csv',
    output_script='metadata/download.sh',
    naming_scheme='sequential',
)

Download script saved to: metadata/download.sh
Sample mapping saved to: metadata/download_mapping.csv
Make executable with: chmod +x metadata/download.sh
