This notebook extracts the Stago reads from each sequencing run and writes them to a FASTA file for BLAST analysis.

In general, it involves two simple steps:
1. extract the read IDs from the final dataframe of each flowcell
2. write them to a txt file

In [None]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import argparse
import subprocess as sub
from ete3 import NCBITaxa
# Instantiate the NCBITaxa helper imported from ete3.
# NOTE(review): `ncbi` is not referenced in any of the cells visible here —
# confirm whether it is still needed before removing it.
ncbi = NCBITaxa()

In [None]:
# Root directory of the sequencing run to process (replicate 4 serves as the example).
BASEDIR = '/home/yiheng/data/20170617_replicate4'

# The last path component of BASEDIR is the run folder name, which is also used
# in the name of the summary dataframe; the text after its final underscore is
# the replicate label.
folder_name = os.path.split(BASEDIR)[1]
column_name = folder_name.rsplit('_', 1)[-1]

In [None]:
# Sanity-check the run directory: it is expected to contain exactly these entries.
# str.split() without an argument collapses runs of whitespace, so no empty
# strings have to be removed afterwards.
folder_list = 'analysis  basecalled_data  scripts  tracking  workspace'.split()

# Compare as sets so the order returned by os.listdir does not matter, and report
# which entries differ rather than only flagging that something is off.
# This is a warning only; it does not stop the notebook.
found_entries = set(os.listdir(os.path.abspath(BASEDIR)))
missing_entries = sorted(set(folder_list) - found_entries)
unexpected_entries = sorted(found_entries - set(folder_list))
if missing_entries or unexpected_entries:
    print("Something wrong with basefolder. check it please.")
    if missing_entries:
        print("  missing: %s" % ', '.join(missing_entries))
    if unexpected_entries:
        print("  unexpected: %s" % ', '.join(unexpected_entries))

In [None]:
# Build the path of the per-run summary table inside the analysis folder and
# load it as a tab-separated dataframe.
dataframe = os.path.join(BASEDIR, 'analysis', 'summary_df_{}.tab'.format(folder_name))
sum_df = pd.read_csv(dataframe, delimiter='\t')

In [None]:
# Barcodes that belong to each replicate.
BARCODES_BY_REPLICATE = {
    'replicate1': ['barcode06', 'barcode02', 'barcode03', 'barcode04', 'barcode05'],
    'replicate2': ['barcode07', 'barcode08', 'barcode09', 'barcode10', 'barcode11'],
    'replicate3': ['barcode01', 'barcode02', 'barcode03', 'barcode04', 'barcode05'],
    'replicate4': ['barcode01', 'barcode02', 'barcode03', 'barcode04', 'barcode05'],
}

# show every column when a dataframe is displayed
pd.set_option('display.max_columns', None)

# Replace missing rg blast hits with False for easier handling.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `sum_df.sseqid_rg` is a chained in-place operation that is not guaranteed
# to update `sum_df` itself.
sum_df['sseqid_rg'] = sum_df['sseqid_rg'].fillna(False)

# Keep the reads that have an rg blast hit and passed every filter flag.
# The explicit `== True` comparisons are kept so that missing flags count as False.
rgblasthit_reads = sum_df[(sum_df.sseqid_rg != False)
                          & (sum_df.passes_filtering == True)
                          & (sum_df.pc_survived == True)
                          & (sum_df.nl_survived == True)]

# Fail with a clear message for an unknown replicate instead of leaving
# `barcode_list` / `filtered_barcode_reads` undefined (or stale) for later cells.
if column_name not in BARCODES_BY_REPLICATE:
    raise ValueError('No barcode set defined for %r; expected one of %s'
                     % (column_name, sorted(BARCODES_BY_REPLICATE)))

barcode_list = list(BARCODES_BY_REPLICATE[column_name])
# restrict the blast-hit reads to the barcodes of this replicate
filtered_barcode_reads = rgblasthit_reads[rgblasthit_reads.barcode_arrangement.isin(barcode_list)]

In [None]:
# Of the barcode-filtered reads, keep those whose rg blast subject id contains 'Stago'.
is_stago_hit = filtered_barcode_reads['sseqid_rg'].str.contains('Stago')
stago_reads = filtered_barcode_reads.loc[is_stago_hit]

In [None]:
# Write the Stago read ids, one per line, to <BASEDIR>/analysis/<replicate>_stagohit.txt.
# index=False and header=False are spelled out so the file holds nothing but read
# ids: the default for `header` differs between pandas versions, and a header
# line would otherwise be written as if it were a read id.
stago_reads.read_id.to_csv(os.path.join(BASEDIR, 'analysis', '%s_stagohit.txt' % column_name),
                           index=False, header=False)

In [None]:
# Gather the rghityes fasta files of every barcode into one fasta file in the
# analysis folder; that combined file is the input for the read-name filter.
# The fasta file name prefix depends on the replicate.
if column_name in ('replicate3', 'replicate4'):
    cmd = r'cat %s/workspace/%s/Hu_%s_albacore202.chopped.rghityes.*.fasta >> %s/analysis/%s_rghityes.fasta'
elif column_name in ('replicate1', 'replicate2'):
    cmd = r'cat %s/workspace/%s/Wagga_%s_albacore202.chopped.rghityes.*.fasta >> %s/analysis/%s_rghityes.fasta'
else:
    # without this, `cmd` would be undefined (or left over from an earlier run)
    raise ValueError('No fasta file prefix defined for %r' % column_name)

# The command appends (>>) to the combined file, so start from a clean slate;
# otherwise re-running this cell would duplicate every read in it.
combined_fasta = os.path.join(BASEDIR, 'analysis', '%s_rghityes.fasta' % folder_name)
if os.path.exists(combined_fasta):
    os.remove(combined_fasta)

for barcode in barcode_list:
    cat_command = cmd % (BASEDIR, barcode, column_name, BASEDIR, folder_name)
    # check_output raises CalledProcessError if cat exits with a non-zero status
    cmd_stderr = sub.check_output(cat_command, shell=True, stderr=sub.STDOUT)
    # log the command that was actually run, not the unfilled template
    print(cat_command)

In [None]:
# Run the filterbyname.sh script on the combined rghityes fasta file, using the
# Stago read id list as the `names` file, and write the result to
# <BASEDIR>/analysis/<replicate>_stago.fasta.
filter_command = r'filterbyname.sh in=%s/analysis/%s_rghityes.fasta out=%s/analysis/%s_stago.fasta names=%s/analysis/%s_stagohit.txt include=t'
filled_filter_command = filter_command % (BASEDIR, folder_name, BASEDIR, column_name, BASEDIR, column_name)
# stderr is merged into the captured output; a non-zero exit raises CalledProcessError
filter_command_stderr = sub.check_output(filled_filter_command, shell=True, stderr=sub.STDOUT)
# log the command that was actually run, not the unfilled template
print(filled_filter_command)