# Lab 4

In [None]:
# Imports (should be alphabetized)

# Project-local imports first
from library.python import OSUtils
from library.python import GenomeUtils
from library.python import RequestUtils

# Globbing (file pattern searching)
import glob

# File I/O
import os

# Plotting
import matplotlib.pyplot as plt
import numpy as np

# # Regular expressions
# import re

# System calls
import subprocess

# Batch IDs
import uuid

In [None]:




# --- !!! IMPORTANT !!! --- #




# Home directory that the working-directory path is built from.
home_directory = "/home/helios/"





In [None]:




# --- !!! IMPORTANT !!! --- #




# Move into the lab's working directory.

# The path has to be absolute; a '~/analyses'-style path cannot be used.
os.chdir(f'{home_directory}lab_4/')





In [None]:
# Generate a random UUID for this run and keep its string form as
# the batch ID (it is concatenated into folder paths later on).
batch_uuid = uuid.uuid4()
batch_id = str(batch_uuid)

In [None]:
# Path of this run's batch folder, located under 'batches/' in the
# current working directory and named after the batch ID.
batch_folder = os.getcwd() + '/batches/' + batch_id

# Create the batch folder and enter it.

# os.makedirs() also creates the parent 'batches/' folder when it does
# not exist yet; os.mkdir() would raise FileNotFoundError in that case.
os.makedirs(batch_folder)
os.chdir(batch_folder)

In [None]:
# Sanity check: show the folder we are currently working in.
current_folder = os.getcwd()
print(current_folder)

# Step 1 - Get the data from NIH

In [None]:
# Create the OS and request helper objects used throughout this step.
OU, RU = OSUtils.OSUtils(), RequestUtils.RequestUtils()

In [None]:
# URL of the GEO download to attempt (accession GSE116334).
dwnld = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE116334&format=file"

In [None]:
# Output folder for the NIH download step.

# We deliberately do NOT os.chdir() into this folder: the pipeline
# stays in the top-level batch folder for the whole run, and every
# following step uses the same convention.
output_folder = f'{os.getcwd()}/nih_pull/'

In [None]:
# Folder the URL is downloaded into: the step's output folder
# passed through OU.pathalize().
url_path = OU.pathalize(pth = output_folder)

In [None]:
# Make the download folder on disk.
os.mkdir(path = url_path)

In [None]:
# Only proceed when the URL reports itself as downloadable;
# in that case download the archive and decompress it.
if RU.is_downloadable(url = dwnld):

    # Fetch the file into the download folder.
    RU.download_url(where = url_path, url = dwnld)

    # Decompress the tar that was just downloaded.
    tar_file = url_path + RU.filename_from_url(url = dwnld)
    OU.decompress_tar(source_file = tar_file, where = output_folder)

    # Decompress each of the .gz files found in the output folder.
    gz_folder = OU.homogenize_path(p = output_folder)

    for f in OU.file_list_by_extension(p = gz_folder, xtnsn = 'gz'):
        OU.decompress_gzip(source_file = f, where = url_path)

# Step 2 - Find peaks by category

In [None]:
# Output folder for the peaks-by-category step.
output_folder = f'{os.getcwd()}/peaks_by_category/'

In [None]:
# Make this step's output folder on disk.
os.mkdir(path = output_folder)

In [None]:
# Use the R script from our library to find the peaks by category.

# subprocess.run() blocks until Rscript exits, so this step is finished
# before any later step looks for files in the output folder (a bare
# Popen() returned immediately, which was the race condition here).
# check = True raises subprocess.CalledProcessError if the script fails
# instead of letting the pipeline continue silently.

# The arguments are passed as a list (no shell), so the regular
# expressions reach Rscript verbatim and need no backslash escaping.

# NOTE(review): the script is assumed to live under home_directory;
# Step 4 calls its R script via '../../library/R/' instead - confirm
# which location is correct.

# This step may take a while.
subprocess.run(
    [
        'Rscript',
        home_directory + 'library/R/peaks_by_category.r',
        '-c', 'ctrl(.*?)narrowPeak$',
        '-t', 'salt(.*?)narrowPeak$',
        '-l', './nih_pull/',
        '-w', output_folder
    ],
    check = True
)

# Step 3 - Use BLAST to assess the Mosi and random sequences

In [None]:
# Output folder for the BLAST step.
output_folder = f'{os.getcwd()}/blast/'

In [None]:
# Make this step's output folder on disk.
os.mkdir(path = output_folder)

In [None]:
# Location of the BLAST database to query.
blastdb = "/home/helios/analyses/genomes/a_thaliana/blastdbs/blastdbs"

In [None]:
# Instantiate the genome helper object; the following cells use it to
# write FASTA files and to call and parse BLAST.
GU = GenomeUtils.GenomeUtils()

In [None]:
# BLAST was a custom build, so the helper has to be told
# explicitly where it is installed.
GU.blast_path = "/home/helios/built/ncbi-blast-2.13.0+/"

In [None]:
# Write the Mosi sequence to a file named 'Mosi_sequence'
# in this step's output folder.
mosi_sequences = ['AGCCTAGCTG']

GU.write_fastas(
    custom_name = 'Mosi_sequence',
    sequences = mosi_sequences,
    where = output_folder
)

In [None]:
# Generate some random sequences...
random_sequences = GU.random_motif(n = 20, t = 5)

# ...and write them to file in this step's output folder.
GU.write_fastas(sequences = random_sequences, where = output_folder)

In [None]:
# Collect every FASTA (*.fa) file in this step's output folder.
fasta_files = glob.glob(f'{output_folder}*.fa')

In [None]:
# Run each FASTA file against the BLAST database.

# The file name (last path component) doubles as the output name.
for sequence in fasta_files:
    fasta_name = sequence.rsplit('/', 1)[-1]

    GU.call_blast(
        db_path = blastdb,
        name = fasta_name,
        sequence = sequence,
        write_to = output_folder
    )

In [None]:
# TODO: Fix later to remove race condition!

# Parse the BLAST output of every FASTA file; the results file
# is the FASTA path with '.BLAST.results' appended.
for sequence in fasta_files:
    results_file = f'{sequence}.BLAST.results'

    GU.parse_blast(where = results_file, write_to = output_folder)

# Step 4 - See what matches between a given sequence and the experimental data

In [None]:
# Output folder for the treatment-match step.
output_folder = f'{os.getcwd()}/treatment_match/'

In [None]:
# Make this step's output folder on disk.
os.mkdir(path = output_folder)

In [None]:
# Glob patterns for the BLAST matches files and the peaks files,
# both relative to the current (batch) folder.
blast_matches = f'{os.getcwd()}/blast/*.matches'
peaks = f'{os.getcwd()}/peaks_by_category/*.peaks'

In [None]:
# Get the blast matches and the peaks files.

# glob.glob() already returns a list of the matching paths,
# so no element-by-element copy is needed.
match_list = glob.glob(blast_matches)
peaks_list = glob.glob(peaks)

In [None]:
# Go over each BLAST matches file and compare
# to each peaks file.

# The R processes are started side by side, as before, but are now
# kept so that we can wait for them and check how they exited.
# Arguments are passed as a list (no shell), so paths containing
# spaces or shell metacharacters are handed to Rscript unchanged.
comparisons = []

for m in match_list:
    for p in peaks_list:
        comparisons.append(
            subprocess.Popen(
                [
                    'Rscript',
                    '../../library/R/blast_to_peaks.r',
                    '-b', m,
                    '-p', p,
                    '-w', output_folder
                ]
            )
        )

# Wait for every comparison to finish (wait() returns the exit code),
# then report the ones that failed instead of ignoring them.
failed = [c.args for c in comparisons if c.wait() != 0]

if failed:
    raise RuntimeError(
        'blast_to_peaks.r failed for: ' + repr(failed)
    )