# Lab 4

In [19]:
# Imports (should be alphabetized)

# Relative imports first
from library.python import OSUtils
from library.python import GenomeUtils
from library.python import RequestUtils

# Globbing (file pattern searching)
import glob

# File I/O
import os

# Plotting
import matplotlib.pyplot as plt
import numpy as np

# # Regular expressions
# import re

# System calls
import subprocess

# Batch IDs
import uuid

In [20]:




# --- !!! IMPORTANT !!! --- #




# Set the home directory.
home_directory = '/home/aeros/'





In [21]:




# --- !!! IMPORTANT !!! --- #




# Set the working directory.

# Path must be absolute, cannot use '~/analyses'.
os.chdir(home_directory + 'analyses/lab_4/')





In [22]:
# First, define a batch ID.
# Note the str(batch_id) typecast.
batch_id = str(uuid.uuid4())

In [23]:
# Create a variable to hold the batch folder.
batch_folder = os.getcwd() + '/batches/' + batch_id

# Create the batch folder and enter it.
os.makedirs(batch_folder)
os.chdir(batch_folder)

In [24]:
# Make sure we're in the right folder.
print(os.getcwd())

/home/aeros/analyses/lab_4/batches/6a164690-a690-405d-8b61-56a4608e1d21


# Step 1 - Get the data from NIH

In [25]:
# Instantiate
OU = OSUtils.OSUtils()
RU = RequestUtils.RequestUtils()

In [26]:
# The URL to try.
dwnld = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE116334&format=file'

In [27]:
# Set the output folder for this step.

# Notice that we do not os.chdir(os.getcwd() + '/nih_pull/')
# because we want to stay in the top-level folder
# during our pipeline run.  Same logic goes for all
# following steps.
output_folder = os.getcwd() + '/001_nih_pull/'

In [28]:
# Establish where to download URLs.
url_path = OU.pathalize(
    pth = output_folder
)

In [29]:
# Create the output folder.
os.mkdir(url_path)

In [30]:
# Can we download the URL?
# If so, download the URL and decompress it.
if RU.is_downloadable(url = dwnld):
    
    # Download.
    RU.download_url(
        where = url_path,
        url = dwnld
    )
    
    # Decompress the tar.
    OU.decompress_tar(
        source_file = url_path + RU.filename_from_url(
            url = dwnld
        ),
        where = output_folder
    )
    
    # Decompress each of the .gz files.
    for f in OU.file_list_by_extension(
        p = OU.homogenize_path(
                p = output_folder
            ), 
        xtnsn = 'gz'
        ):

            OU.decompress_gzip(
                source_file = f,
                where = url_path
            )

# Step 2 - Find peaks by category

In [31]:
# Set the output folder for this step.
output_folder = os.getcwd() + '/002_peaks_by_category/'

In [32]:
# Create the output folder.
os.mkdir(output_folder)

In [33]:
# TODO: fix later to remove race condition!

# Let's use the R script from our library
# to find the peaks by category.

# This step may take a second.

# Note that there are other subprocess commands
# that we can use to delay the next step in the script
# until this one finishes.
subprocess.Popen("Rscript " + home_directory + "analyses/git_repos/pib792/lab_4/library/R/peaks_by_category.r -c ctrl\(.*?\)narrowPeak$ -t salt\(.*?\)narrowPeak$ -l ./001_nih_pull/ -w " + output_folder, shell = True)

<Popen: returncode: None args: 'Rscript /home/aeros/analyses/git_repos/pib79...>

# Step 3 - Use BLAST to assess the Mosi and random sequences

In [34]:
# Set the output folder for this step.
output_folder = os.getcwd() + '/003_blast/'

In [35]:
# Create the output folder.
os.mkdir(output_folder)

In [36]:
# Set where the BLAST database is.
blastdb = home_directory + 'analyses/genomes/a_thaliana/blastdbs/blastdbs'

In [37]:
# Instantiate
GU = GenomeUtils.GenomeUtils()

In [38]:
# We have to tell the interpreter where BLAST is
# because it was a custom build.
GU.blast_path = home_directory + 'built/ncbi-blast-2.13.0+/'

In [39]:
# Write the Mosi sequence to file.
GU.write_fastas(
    custom_name = 'Mosi_sequence',
    sequences = ['AGCCTAGCT'],
    where = output_folder
)

In [40]:
# Write some random sequences to file.
GU.write_fastas(
    sequences = GU.random_motif(
        n = 20,
        t = 5
    ),
    where = output_folder
)

In [41]:
# Define the FASTA files.
fasta_files = glob.glob(output_folder + '*.fa')

In [42]:
# Try the sequences against BlastDB.

# Use the sequence as the output file name.
for sequence in fasta_files:
    GU.call_blast(
        db_path = blastdb,
        name = sequence.split('/')[-1],
        sequence = sequence,
        write_to = output_folder
    )

In [43]:
# TODO: Fix later to remove race condition!

# Parse the BLAST output.
for sequence in fasta_files:
    GU.parse_blast(
        where = sequence + '.BLAST.results',
        write_to = output_folder
    )

# Step 4 - See what matches between a given sequence and the experimental data

In [44]:
# Set the output folder for this step.
output_folder = os.getcwd() + '/004_treatment_match/'

In [45]:
# Create the output folder.
os.mkdir(output_folder)

In [46]:
# Define where the BLAST matches are as well
# as the peaks file.
blast_matches = os.getcwd() + '/003_blast/*.matches'
peaks = os.getcwd() + '/002_peaks_by_category/*.peaks'

In [47]:
# Get the blast matches and the peaks files.
match_list = []

for m in glob.glob(blast_matches):
    match_list.append(m)

peaks_list = []

for p in glob.glob(peaks):
    peaks_list.append(p)

In [48]:
# Go over each BLAST matches file and compare
# to each peaks file.

# Note that the message "Error in seq.default(start, stop) : 'from' must be of length 1"
# indicates that BLAST couldn't find any matches for
# the given sequence m in match_list.
for m in match_list:
    for p in peaks_list:
        subprocess.Popen("Rscript " + home_directory + "analyses/git_repos/pib792/lab_4/library/R/blast_to_peaks.r -b " + m + " -p " + p + " -w " + output_folder, shell = True)

Error in seq.default(start, stop) : 'from' must be of length 1
Calls: [ -> [.data.table -> seq -> seq.default
Execution halted
Error in seq.default(start, stop) : 'from' must be of length 1
Calls: [ -> [.data.table -> seq -> seq.default
Execution halted
Error in seq.default(start, stop) : 'from' must be of length 1
Calls: [ -> [.data.table -> seq -> seq.default
Execution halted
