# SAMOSA-Tag
#### 08/03/2022
This notebook demonstrates how to analyze SAMOSA-Tag data using the [SAMOSA-ChAAT computational pipeline](https://github.com/RamaniLab/SAMOSA-ChAAT) and perform the set of analyses described in our associated manuscript.

In [None]:
## Imports
import os

In [None]:
## Get path of notebook
NOTEBOOK_PATH = os.getcwd()
## Set working directory paths
# BASE_DIR is the parent directory of the directory the notebook is run from
BASE_DIR = os.path.dirname(NOTEBOOK_PATH)
## Specify the top-level directory
# Defaults to <BASE_DIR>/smrt_tag; set the SMRT_TAG_TOP_DIR environment variable before
# starting the kernel to point at a different location instead of hard-coding a local path
TOP_DIR = os.environ.get('SMRT_TAG_TOP_DIR', os.path.join(BASE_DIR, 'smrt_tag'))
## Export both paths to the environment
# %%bash cells run in a subprocess that inherits os.environ but cannot see Python
# variables, so $BASE_DIR and $TOP_DIR would otherwise expand to empty strings there
os.environ['BASE_DIR'] = BASE_DIR
os.environ['TOP_DIR'] = TOP_DIR

In [None]:
## Set up data directories
# Leaf directories under preprocess/samosa/processed/ that the pipeline steps use
processed_subdirs = (
    'full/',
    'forNN/',
    'forHMM/',
    'binarized/HMMout/',
    'inaccessibleRegions/',
    'density/',
    'annot/',
)

# Full list of directories, each relative to TOP_DIR
directories = [
    'raw/OS152/',
    'analyses/OS152/',
    'preprocess/align/',
    'preprocess/samosa/ccs/',
] + ['preprocess/samosa/processed/' + leaf for leaf in processed_subdirs]

# Create each directory (and any missing parents); directories that already exist are left alone
for directory in directories:
    target_path = "{}/{}".format(TOP_DIR, directory)
    os.makedirs(target_path, exist_ok=True)

###  0) Download raw data from SRA

In [None]:
# %%bash
# #!/usr/bin/env bash
# # download_OS152_raw_data.sh: bash script for downloading OS152 SAMOSA-Tag raw data from the NCBI SRA
# # Usage: ./download_OS152_raw_data.sh
# # NOTE(review): this cell is entirely commented out. The SRRXXX accessions below are
# # placeholders, and the --output-file lines are not attached to any command; both need
# # to be filled in before this cell can be enabled.
# prefetch SRRXXX SRRXXX SRRXXX SRRXXX SRRXXX SRRXXX SRRXXX SRRXXX SRRXXX SRRXXX
# ## plusM
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR27_plusM--OS152_PR27_plusM.bam"
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR28_plusM--OS152_PR28_plusM.bam"
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR29_plusM--OS152_PR29_plusM.bam"
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR30_plusM--OS152_PR30_plusM.bam"
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR31_plusM--OS152_PR31_plusM.bam"
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR32_plusM--OS152_PR32_plusM.bam"
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR33_plusM--OS152_PR33_plusM.bam"
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR34_plusM--OS152_PR34_plusM.bam"
# ## minusM
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR12_minusM--OS152_PR12_minusM.bam"
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR30_minusM--OS152_PR30_minusM.bam"
# --output-file "$TOP_DIR/raw/OS152/SAMOSA-Tag.subread.OS152_PR31_minusM--OS152_PR31_minusM.bam"

### 1a.) Run CCS

In [None]:
%%bash
#!/usr/bin/env bash
# OS152_ccs.sh: bash script to run ccs on OS152 SAMOSA-Tag raw subread BAM files
# Usage: ./OS152_ccs.sh
#
## Inputs:
#     $TOP_DIR/raw/OS152/*.bam: raw subread BAM files (TOP_DIR must be exported to the environment)
#
## Outputs:
#     Output files are written to $TOP_DIR/preprocess/samosa/ccs/
#     One ccs BAM per input file, named by inserting ".ccs" after "subread" and dropping
#     everything from "--" onwards
set -euo pipefail

# Fail early with a clear message instead of silently operating on "/raw/OS152/"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# The glob has to sit outside the quotes to be expanded by the shell; iterating over it
# directly (rather than over the output of ls) also keeps paths with spaces intact
for subreads_file in "${TOP_DIR}"/raw/OS152/*.bam
do
    # When nothing matches, bash leaves the pattern unexpanded; treat that as an error
    if [ ! -e "$subreads_file" ]; then
        echo "No subread BAM files found in ${TOP_DIR}/raw/OS152/" >&2
        exit 1
    fi

    file=$(basename "$subreads_file")

    # e.g. SAMOSA-Tag.subread.X--X.bam -> SAMOSA-Tag.subread.ccs.X--X.bam
    outfile="${file/subread/subread.ccs}"

    # ${outfile/--*/.bam} replaces everything from "--" to the end of the name with ".bam"
    ccs \
        -j 50 \
        --log-level=INFO \
        --hifi-kinetics \
        "$subreads_file" \
        "${TOP_DIR}/preprocess/samosa/ccs/${outfile/--*/.bam}"
done



### 1b.) Merge relevant CCS files

In [None]:
%%bash
#!/usr/bin/env bash
# merge_OS152_ccs_data.sh: bash script to merge OS152 SAMOSA-Tag ccs bam files by condition
# Usage: ./merge_OS152_ccs_data.sh
#
## Inputs:
#     Per-replicate ccs BAM files in $TOP_DIR/preprocess/samosa/ccs/
#
## Outputs:
#     SAMOSA-Tag.subread.ccs.OS152_plusM.bam: merge of the PR27-PR34 plusM ccs files
#     SAMOSA-Tag.subread.ccs.OS152_minusM.bam: merge of the PR12, PR30 and PR31 minusM ccs files
set -euo pipefail

# Fail early with a clear message if the notebook has not exported TOP_DIR
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

ccs_dir="$TOP_DIR/preprocess/samosa/ccs"

# Every continued line needs a trailing backslash, including the last input file;
# otherwise "-o ..." is executed as a separate command and pbmerge gets no output path
pbmerge -j 15 \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR27_plusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR28_plusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR29_plusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR30_plusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR31_plusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR32_plusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR33_plusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR34_plusM.bam" \
    -o "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_plusM.bam"

pbmerge -j 15 \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR12_minusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR30_minusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_PR31_minusM.bam" \
    -o "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_minusM.bam"

### 1c.) Run primrose on aggregated ccs files 

In [None]:
%%bash
#!/usr/bin/env bash
# run_primrose_OS152_ccs_data.sh: bash script to run primrose on merged OS152 SAMOSA-Tag ccs data
# Usage: ./run_primrose_OS152_ccs_data.sh
#
## Inputs:
#     Merged plusM and minusM ccs BAM files in $TOP_DIR/preprocess/samosa/ccs/
#
## Outputs:
#     *.primrose.bam: one primrose output BAM per merged input, written next to the input
set -euo pipefail

# Fail early with a clear message if the notebook has not exported TOP_DIR
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

ccs_dir="$TOP_DIR/preprocess/samosa/ccs"

# Each option line must end in a backslash so the whole invocation stays one command;
# a missing one makes the following lines run as separate (failing) commands
primrose \
    -j 30 \
    --log-level INFO \
    --keep-kinetics \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_plusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_plusM.primrose.bam"

primrose \
    -j 30 \
    --log-level INFO \
    --keep-kinetics \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_minusM.bam" \
    "$ccs_dir/SAMOSA-Tag.subread.ccs.OS152_minusM.primrose.bam"



### 1d.) Align all ccs files

In [None]:
%%bash
#!/usr/bin/env bash
# align_OS152_ccs_data.sh: bash script to align all OS152 SAMOSA-Tag ccs data
# Usage: ./align_OS152_ccs_data.sh
#
## Inputs:
#     $TOP_DIR/preprocess/samosa/ccs/*OS152*.bam: ccs BAM files to align
#     $TOP_DIR/ref/GRCh38/hg38.fa: reference genome
#
## Outputs:
#     Output files are written to $TOP_DIR/preprocess/align/
#     *.align.sorted.bam: sorted alignments (with BAI index), one per input ccs BAM
set -euo pipefail

# Fail early with a clear message if the notebook has not exported TOP_DIR
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# The glob has to sit outside the quotes to be expanded by the shell
for ccs_file in "${TOP_DIR}"/preprocess/samosa/ccs/*OS152*.bam
do
    # When nothing matches, bash leaves the pattern unexpanded; treat that as an error
    if [ ! -e "$ccs_file" ]; then
        echo "No OS152 ccs BAM files found in ${TOP_DIR}/preprocess/samosa/ccs/" >&2
        exit 1
    fi

    file=$(basename "$ccs_file")

    # Positional arguments follow the documented pbmm2 order: reference, input, output
    pbmm2 align \
        --preset CCS \
        --sort -j 30 \
        --bam-index BAI \
        --log-level DEBUG \
        "$TOP_DIR/ref/GRCh38/hg38.fa" \
        "$ccs_file" \
        "$TOP_DIR/preprocess/align/${file/.bam/.align.sorted.bam}"
done

### 2) Running the SAMOSA-ChAAT computational pipeline

In [None]:
## Set up the sample reference file for the SAMOSA-ChAAT pipeline
# Sample names in index order: 0-7 are the plusM replicates, 8-10 the minusM replicates
sample_names = (
    ["PR{}_plusM".format(replicate) for replicate in range(27, 35)]
    + ["PR{}_minusM".format(replicate) for replicate in (12, 30, 31)]
)

# One CSV row per sample. The doubled braces survive this first .format() call as the two
# "{}" slots that are filled with TOP_DIR when the file is written.
row_template = (
    "{index},"
    "{{}}/preprocess/samosa/ccs/SAMOSA-Tag.subread.ccs.OS152_{name}.bam,"
    "{{}}/raw/OS152/SAMOSA-Tag.subread.OS152_{name}--OS152_{name}.bam,"
    "{name},OS152,none"
)
sample_rows = [
    row_template.format(index=index, name=name)
    for index, name in enumerate(sample_names)
]

# Header first, then the sample rows; every line except the last ends with a newline
sampleref_string = ["index,ccsFile,unalignedSubreadsFile,sampleName,cell,reference\n"]
sampleref_string += [row + "\n" for row in sample_rows[:-1]]
sampleref_string.append(sample_rows[-1])

sample_reference_filepath = '{}/preprocess/samosa/OS152.referenceFile.csv'.format(TOP_DIR)
reference_csv = "".join(template.format(TOP_DIR, TOP_DIR) for template in sampleref_string)
with open(sample_reference_filepath, 'w+') as handle:
    handle.write(reference_csv)

print(sample_reference_filepath)

#### 2a.) Extract sample IPDs

In [None]:
%%bash
#!/usr/bin/env bash
# 01_extractIpdlinear.sh: bash script to run 01_extractIpdlinear.py, and extract IPD values per read
# Usage: ./01_extractIpdlinear.sh *sample_ids*
#
## Inputs:
#     --referenceFile: SAMOSA-ChAAT reference file
#     --threads: number of threads (15)
#     --outputlocation: Where to write output files
#
## Outputs:
#     Output files are written to $outputlocation/processed/full/
#     *_full.pickle: dictionary containing IPD measurements per molecule
#     *_full_zmwinfo.pickle: data frame containing molecule attributes (length, ZMWID etc.)
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# Sample ids refer to the index column of the reference file:
# 0-7 = PR27-PR34 plusM, 8-10 = PR12/PR30/PR31 minusM.
# (A comment cannot follow a line-continuation backslash, so it lives up here.)
python3 "$BASE_DIR/samosa-chaat/01_extractIpdlinear.py" \
    0 1 2 3 4 5 6 7 8 9 10 \
    --referenceFile "$TOP_DIR/preprocess/samosa/OS152.referenceFile.csv" \
    --threads 15 \
    --outputlocation "$TOP_DIR/preprocess/samosa/"


#### 2b.) Format data for network inference

In [None]:
%%bash
#!/usr/bin/env bash
# 02_formatNN.sh: bash script to run 02_formatNN.py, and format reads for NN inference
# Usage: ./02_formatNN.sh *sample_ids*
#
## Inputs:
#     --referenceFile: SAMOSA-ChAAT reference file
#     --outputlocation: Where to write output files
#     Input Files are read from $outputlocation/processed/full/*_full.pickle
#
## Outputs:
#     Output files are written to $outputlocation/processed/forNN/
#     *_forNN.npz: Read & IPD data formatted for NN inference
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# Sample ids refer to the index column of the reference file:
# 0-7 = PR27-PR34 plusM, 8-10 = PR12/PR30/PR31 minusM.
# (A comment cannot follow a line-continuation backslash, so it lives up here.)
python3 "$BASE_DIR/samosa-chaat/02_formatNN.py" \
    0 1 2 3 4 5 6 7 8 9 10 \
    --referenceFile "$TOP_DIR/preprocess/samosa/OS152.referenceFile.csv" \
    --outputlocation "$TOP_DIR/preprocess/samosa/"



#### 2c.) Run network inference

In [None]:
%%bash
#!/usr/bin/env bash
# 03_NN_inference.sh: bash script to run 03_NN_inference.py, and carry out NN inference on reads
# Usage: ./03_NN_inference.sh *sample_ids*
#
## Inputs:
#     --threshold: Threshold for determining bases as m6dA methylated (0.42)
#     --modeldir: pathway to directory holding IPD models
#     --referenceFile: SAMOSA-ChAAT reference file
#     --outputlocation: Where to write output files
#     Input Files are read from $outputlocation/processed/full/*_full_zmwinfo.pickle
#     Input Files are read from $outputlocation/processed/forNN/*_forNN.npz
#
## Outputs:
#     Output files are written to $outputlocation/processed/forHMM/
#     *_forHMM_resid-*_piece*.pickle: Data frame containing inferred methylation probability using the provided threshold
##                                    per adenine. Input for HMM.
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# Sample ids refer to the index column of the reference file:
# 0-7 = PR27-PR34 plusM, 8-10 = PR12/PR30/PR31 minusM.
# (A comment cannot follow a line-continuation backslash, so it lives up here.)
python3 "$BASE_DIR/samosa-chaat/03_NN_inference.py" \
    0 1 2 3 4 5 6 7 8 9 10 \
    --threshold 0.42 \
    --modeldir "$BASE_DIR/samosa-chaat/models/" \
    --referenceFile "$TOP_DIR/preprocess/samosa/OS152.referenceFile.csv" \
    --outputlocation "$TOP_DIR/preprocess/samosa/"


#### 2d.) Run HMM to predict accessibility footprints

In [None]:
%%bash
#!/usr/bin/env bash
# 04_HMM.sh: bash script to run 04_HMM.py, and carry out HMM accessibility footprint binarization using NN inferred methylation probabilities
# Usage: ./04_HMM.sh *sample_ids*
#
## Inputs:
#     --threshold: Threshold for determining bases as m6dA methylated (0.42)
#     --threads: number of threads (15)
#     --referenceFile: SAMOSA-ChAAT reference file
#     --outputlocation: Where to write output files
#     Input Files are read from $outputlocation/processed/forHMM/*_forHMM_resid-*_piece*.pickle
#
## Outputs:
#     Output files are written to $outputlocation/processed/binarized/HMMout/
#     *_NNsingle.pickle: Dictionary containing per-molecule binarized accessibility footprints. (ZMWIDs are keys)
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# Sample ids refer to the index column of the reference file:
# 0-7 = PR27-PR34 plusM, 8-10 = PR12/PR30/PR31 minusM.
# (A comment cannot follow a line-continuation backslash, so it lives up here.)
python3 "$BASE_DIR/samosa-chaat/04_HMM.py" \
    0 1 2 3 4 5 6 7 8 9 10 \
    --threshold 0.42 \
    --threads 15 \
    --referenceFile "$TOP_DIR/preprocess/samosa/OS152.referenceFile.csv" \
    --outputlocation "$TOP_DIR/preprocess/samosa/"

#### 2e.) Run post-analysis on accessible footprints to generate nucleosome calls

In [None]:
%%bash
#!/usr/bin/env bash
# 05_postprocessing.sh: bash script to run 05_postprocessing.py, and call accessibility footprint size + nucleosome density
# Usage: ./05_postprocessing.sh *sample_ids*
#
## Inputs:
#     --referenceFile: SAMOSA-ChAAT reference file
#     --outputlocation: Where to write output files
#     Input Files are read from $outputlocation/processed/binarized/HMMout/*_NNsingle.pickle
#     Input Files are read from $outputlocation/processed/full/*_full_zmwinfo.pickle
#
## Outputs:
#     Output files are written to $outputlocation/processed/inaccessibleRegions
#     *_inacRegions.csv: CSV file indicating inaccessible regions and size of regions
#
#     Output files are written to $outputlocation/processed/density/
#     *_density.csv: CSV file indicating nucleosome density
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# Sample ids refer to the index column of the reference file: 0-7 = PR27-PR34 plusM.
# (A comment cannot follow a line-continuation backslash, so it lives up here.)
python3 "$BASE_DIR/samosa-chaat/05_postprocessing.py" \
    0 1 2 3 4 5 6 7 \
    --referenceFile "$TOP_DIR/preprocess/samosa/OS152.referenceFile.csv" \
    --outputlocation "$TOP_DIR/preprocess/samosa/"
    
    

#### 2f.) Annotate BAM file with accessibility strings

In [None]:
%%bash
#!/usr/bin/env bash
# 06_annotate_bams.sh: bash script to run 06_annotate_bams.py, and annotate reads with accessibility footprints using MM and ML tags
# Usage: ./06_annotate_bams.sh *sample_ids*
#
## Inputs:
#     --merged-bam: Path to merged bam file containing all ccs reads
#     --referenceFile: SAMOSA-ChAAT reference file
#     --outputlocation: Where to write output files
#     Input Files are read from $outputlocation/processed/binarized/HMMout/*_NNsingle.pickle
#
## Outputs:
#     Output files are written to $outputlocation/processed/annot/
#     *_samosa.bam: BAM file where ML and MM tags represent accessibility profiles
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# Sample ids refer to the index column of the reference file: 0-7 = PR27-PR34 plusM,
# matching the merged plusM BAM passed via --merged-bam.
# (A comment cannot follow a line-continuation backslash, so it lives up here.)
python3 "$BASE_DIR/samosa-chaat/06_annotate_bams.py" \
    0 1 2 3 4 5 6 7 \
    --merged-bam "$TOP_DIR/preprocess/samosa/ccs/SAMOSA-Tag.subread.ccs.OS152_plusM.bam" \
    --referenceFile "$TOP_DIR/preprocess/samosa/OS152.referenceFile.csv" \
    --outputlocation "$TOP_DIR/preprocess/samosa/"


# Secondary Analyses 

#### 3a.) Compute autocorrelation per read 

In [None]:
%%bash
#!/usr/bin/env bash
#
# 01_compute_autocors_persample.sh: bash script to run 01_compute_autocors_persample.py, and compute autocorrelated accessibility profiles per molecule
# Usage: ./01_compute_autocors_persample.sh *HMM pickle files*
#     --project: Name of project being analyzed
#     --output-dir: Directory to save results
#
## Outputs:
#     output-dir/hmm_pickle.autocors.npy: Numpy array containing autocorrelation information per molecule
#     output-dir/hmm_pickle.zmw_ids: text file listing ZMWIDs that were analyzed via autocorrelation
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# HMM pickles live under processed/binarized/HMMout/ (the output location of 04_HMM.py).
# NOTE(review): the glob is quoted, so the shell passes the pattern through unexpanded;
# this presumes the script expands it itself -- confirm against the script's argument handling.
"$BASE_DIR/samosa_tag/01_compute_autocors_persample.py" \
    "$TOP_DIR/preprocess/samosa/processed/binarized/HMMout/*plusM*NNsingle.pickle" \
    --project OS152_plusM \
    --output-dir "$TOP_DIR/analyses/OS152/"
    

#### 3b.) Cluster autocorrelation patterns to determine fiber types

In [None]:
%%bash
#!/usr/bin/env bash
#
# 02_cluster_autocorrelograms_tn5.sh: bash script to run 02_cluster_autocorrelograms_tn5.py, and cluster per-molecule autocorrelograms to determine fiber types
# Usage: ./02_cluster_autocorrelograms_tn5.sh *HMM pickle files*
#     --autocor-dir: Directory containing results from 01_compute_autocors_persample.py
#     --project: Name of project being analyzed
#     --output-dir: Directory to save results
#
## Outputs:
#     output-dir/project_autocor_clusters.data: text file indicating fiber type cluster membership per read
#     output-dir/project_autocor_averages.data: text file containing average autocorrelation profile per cluster
#     output-dir/project_autocor_signal_averages.data: text file containing average accessibility profile per cluster
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# HMM pickles live under processed/binarized/HMMout/ (the output location of 04_HMM.py).
# NOTE(review): the glob is quoted, so the shell passes the pattern through unexpanded;
# this presumes the script expands it itself -- confirm against the script's argument handling.
"$BASE_DIR/samosa_tag/02_cluster_autocorrelograms_tn5.py" \
    "$TOP_DIR/preprocess/samosa/processed/binarized/HMMout/*plusM*pickle" \
    --autocor-dir "$TOP_DIR/analyses/OS152/" \
    --project OS152_plusM \
    --output-dir "$TOP_DIR/analyses/OS152/"
    
    

#### 3c.) Examine fiber type enrichment at CTCF sites

In [None]:
%%bash
#!/usr/bin/env bash
#
# 03_samosa_feature_signal.sh: bash script to run 03_samosa_feature_signal.py, and determine fiber enrichment at U2OS CTCF sites
# Usage: ./03_samosa_feature_signal.sh *HMM pickle files*
#     --aligned-dir: Directory containing aligned SAMOSA-Tag reads
#     --project: Name of project being analyzed
#     --output-dir: Directory to save results
#
## Outputs:
#     output-dir/project_CTCF_access_ctcf_final_include_hole.data: dataframe containing sites that overlap with reads
#     output-dir/project_Ctcf_mols_750bp.npy: aggregated per-molecule accessibility information across all sites
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# HMM pickles live under processed/binarized/HMMout/ (the output location of 04_HMM.py).
# NOTE(review): the glob is quoted, so the shell passes the pattern through unexpanded;
# this presumes the script expands it itself -- confirm against the script's argument handling.
"$BASE_DIR/samosa_tag/03_samosa_feature_signal.py" \
    "$TOP_DIR/preprocess/samosa/processed/binarized/HMMout/*pickle" \
    --aligned-dir "$TOP_DIR/preprocess/align/" \
    --project OS152_plusM \
    --output-dir "$TOP_DIR/analyses/OS152/"

#### 4a.) Extract 5mC methylation predictions at CpG sites generated by primrose

In [None]:
%%bash
#!/usr/bin/env bash
# 05_cpg2pickle.sh: bash script to run 05_cpg2pickle.py, and extract primrose 5mC predictions per molecule from reads
# Usage: ./05_cpg2pickle.sh *primrose aligned BAM*
#     --project: Name of project being analyzed
#     --output-dir: Directory to save results
#
## Outputs:
#     output-dir/project_cpg_data.pickle: dictionary containing per-molecule 5mC methylation probability from all BAMs
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# plusM: the BAM path must be followed by a backslash so the options stay on the same command
"$BASE_DIR/samosa_tag/05_cpg2pickle.py" \
    "$TOP_DIR/preprocess/align/SAMOSA-Tag.subread.ccs.OS152_plusM.primrose.align.sorted.bam" \
    --project OS152_plusM \
    --output-dir "$TOP_DIR/analyses/OS152/"

# minusM: same script as above (including the .py extension)
"$BASE_DIR/samosa_tag/05_cpg2pickle.py" \
    "$TOP_DIR/preprocess/align/SAMOSA-Tag.subread.ccs.OS152_minusM.primrose.align.sorted.bam" \
    --project OS152_minusM \
    --output-dir "$TOP_DIR/analyses/OS152/"
    

#### 4b.) Examine 5mC enrichment at CTCF sites

In [None]:
%%bash
#!/usr/bin/env bash
#
# 06_primrose_samosa_ctcf_integration_OS.sh: bash script to run 06_primrose_samosa_ctcf_integration_OS.py, and determine 5mC methylation enrichment at U2OS CTCF sites
# Usage: ./06_primrose_samosa_ctcf_integration_OS.sh *HMM pickle files*
#     --cpg-pickle: Pickle file produced by 05_cpg2pickle.py
#     --site-list: List of CTCF sites
#     --project: Name of project being analyzed
#     --output-dir: Directory to save results
#
## Outputs:
#     output-dir/project_CTCF_meth_ctcf_final.data: dataframe containing sites that overlap with reads
#     output-dir/project_Ctcf_methmols_50bp.npy: aggregated per-molecule 5mC information across all sites
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# --site-list is a relative path, so it is resolved against the directory the cell runs in
"$BASE_DIR/samosa_tag/06_primrose_samosa_ctcf_integration_OS.py" \
    --cpg-pickle "$TOP_DIR/analyses/OS152/OS152_plusM_cpg_data.pickle" \
    --site-list U2OS_total_ZMWs.5kb.zmws \
    --project OS152_plusM \
    --output-dir "$TOP_DIR/analyses/OS152/"
    

#### 5a.) Integrate 5mC signal at CpGs, with SAMOSA signal per molecule

In [None]:
%%bash
#!/usr/bin/env bash
#
# 07_compare_cpg_samosa_OS_data.sh: bash script to run 07_compare_cpg_samosa_OS_data.py, and link 5mC / CpG information and SAMOSA accessibility profiles per read
# Usage: ./07_compare_cpg_samosa_OS_data.sh
#     --cpg-pickle: Pickle file produced by 05_cpg2pickle.py
#     --cluster-labels: Fiber type cluster membership produced by 02_cluster_autocorrelograms_tn5
#     --project: Name of project being analyzed
#     --output-dir: Directory to save results
#
## Outputs:
#     output-dir/project_cluster_cpg.dataout.filtered: text file containing per-molecule cluster membership, CpG content, and CpG methylation prediction
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# Every option line but the last ends in a backslash so this stays a single command
"$BASE_DIR/samosa_tag/07_compare_cpg_samosa_OS_data.py" \
    --cpg-pickle "$TOP_DIR/analyses/OS152/OS152_plusM_cpg_data.pickle" \
    --cluster-labels "$TOP_DIR/analyses/OS152/OS152_plusM_autocor_clusters.data" \
    --project OS152_plusM \
    --output-dir "$TOP_DIR/analyses/OS152/"
    

#### 5b.) Test for enrichment of fiber types stratified by 5mC methylation and CpG count, per molecule

In [None]:
%%bash
#!/usr/bin/env bash
#
# 08_fishers_methylation_OS_SMRT_tag.sh: bash script to run 08_fishers_methylation_OS_SMRT_tag.py, and test for fiber type enrichment in differentially methylated regions
# Usage: ./08_fishers_methylation_OS_SMRT_tag.sh
#     --dataout-filtered: text file produced by 07_compare_cpg_samosa_OS_data.py
#     --project: Name of project being analyzed
#     --output-dir: Directory to save results
#
## Outputs:
#     output-dir/project_fishers.data: Fisher's exact test results for enrichment across all data.
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# The script path is quoted so a BASE_DIR containing spaces is not word-split
"$BASE_DIR/samosa_tag/08_fishers_methylation_OS_SMRT_tag.py" \
    --dataout-filtered "$TOP_DIR/analyses/OS152/OS152_plusM_cluster_cpg.dataout.filtered" \
    --project OS152_plusM \
    --output-dir "$TOP_DIR/analyses/OS152/"
    

    
    

#### 5c) Test for enrichment of fiber types stratified by 5mC methylation and CpG count, per molecule, per replicate


In [None]:
%%bash
#!/usr/bin/env bash
#
# 08b_reps_samosa_fishers_OS.sh: bash script to run 08b_reps_samosa_fishers_OS.py, and test for fiber type enrichment in differentially methylated regions, stratified by replicate
# Usage: ./08b_reps_samosa_fishers_OS.sh
#     --dataout-filtered: text file produced by 07_compare_cpg_samosa_OS_data.py
#     --cluster-labels: Fiber type cluster membership produced by 02_cluster_autocorrelograms_tn5
#     --project: Name of project being analyzed
#     --output-dir: Directory to save results
#
## Outputs:
#     output-dir/project_fishers_by_rep.data: Fisher's exact test results for enrichment, stratified by replicate
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# The script path is quoted so a BASE_DIR containing spaces is not word-split
"$BASE_DIR/samosa_tag/08b_reps_samosa_fishers_OS.py" \
    --dataout-filtered "$TOP_DIR/analyses/OS152/OS152_plusM_cluster_cpg.dataout.filtered" \
    --cluster-labels "$TOP_DIR/analyses/OS152/OS152_plusM_autocor_clusters.data" \
    --project OS152_plusM \
    --output-dir "$TOP_DIR/analyses/OS152/"

#### 6) Examine enrichment of tagmented molecule ends in TSSs

In [None]:
%%bash
#!/usr/bin/env bash
#
# 09_tss2endmatrix_pb.sh: bash script to run 09_tss2endmatrix_pb.py, and examine enrichment of Tn5-ends in TSSs
# Usage: ./09_tss2endmatrix_pb.sh *SAMOSA-Tag aligned.sorted.bam files*
#     --genomic-sites: A TSV containing TSS sites to examine
#     --genomic-sites-name: Name to designate sites examined
#     --valid-chroms: comma-separated string of valid chromosomes to examine (ignore chrM)
#     --project: Name of project being analyzed
#     --output-dir: Directory to save results
#
## Outputs:
#     output-dir/genomic-sites-name_mat.ends.BAMfile: counts of read ends 2500bp up/downstream across all genomic sites
set -euo pipefail

# Fail early with a clear message if the notebook has not exported these paths
: "${BASE_DIR:?BASE_DIR must be exported to the environment}"
: "${TOP_DIR:?TOP_DIR must be exported to the environment}"

# Anchor the output directory at TOP_DIR like the other analysis outputs, and make sure
# it exists (it is not part of the directory list created at the top of the notebook)
tss_output_dir="$TOP_DIR/analyses/OS152/tss/"
mkdir -p "$tss_output_dir"

# NOTE(review): the BAM glob is quoted, so the shell passes the pattern through unexpanded;
# this presumes the script expands it itself -- confirm against the script's argument handling.
# --genomic-sites is a relative path, so it is resolved against the directory the cell runs in.
"$BASE_DIR/samosa_tag/09_tss2endmatrix_pb.py" \
    "$TOP_DIR/preprocess/align/SAMOSA-Tag*align.sorted.bam" \
    --genomic-sites hg38_TSS_sites.tsv \
    --genomic-sites-name TSS \
    --valid-chroms chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY \
    --project OS152 \
    --output-dir "$tss_output_dir"
    
