## Fit-seq: Tracking allele counts 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sort_seq as ss
import glob
from Bio import SeqIO
from bisect import bisect 
import bisect
import time
import numba

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [None]:
# Directory that holds the sequencing files (relative to this notebook)
path = '../sequencing/20171114_fitseq_rel_2/'

# FASTQ file names, one per sampled day
file_day_0 = "20160519_MG1655_prel_day0_rel.qcfilt.endATGG.bin1.fastq"
file_day_1 = "20160519_MG1655_prel_day1_rel.qcfilt.endATGG.bin2.fastq"
file_day_3 = "20160519_MG1655_prel_day3_rel.qcfilt.endATGG.bin3.fastq"

# Full paths handed to the counting functions
fname_day_0 = path + file_day_0
fname_day_1 = path + file_day_1
fname_day_3 = path + file_day_3

In [None]:
def nuctup2array(tup):
    """
    Turn a tuple of nucleotides into a NumPy array of integer codes.

    inputs ~~

    tup : tuple (or other iterable) of strings made of the letters
          'A', 'C', 'G', 'T'. The pieces are concatenated before encoding,
          so they may be single bases or longer fragments.

    outputs ~~

    vec_array : 1-D integer np.array with A -> 0, C -> 1, G -> 2, T -> 3

    Raises KeyError if a character other than A/C/G/T is found.
    """
    # NOTE(review): left un-jitted, like the sibling nuc2vec helpers whose
    # numba decorator is commented out; the dict lookup and string join
    # below are plain-Python operations.
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # dtype is given explicitly so an empty input still yields an int array
    vec_array = np.array([mapping[base] for base in ''.join(tup)], dtype=int)

    return vec_array

In [None]:
#@numba.jit(nopython=True)
def nuc2vec_py(nuc_array):
    """
    Turn a nucleotide sequence into a string of digits.

    inputs ~~

    nuc_array : iterable of the characters 'A', 'C', 'G', 'T'
                (e.g. a plain string such as 'ACGT')

    outputs ~~

    vec_array : str of the same length with A -> '0', C -> '1',
                G -> '2', T -> '3' (e.g. 'ACGT' -> '0123')

    Raises KeyError if a character other than A/C/G/T is found.
    """
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # Join the digits directly; no need to build a NumPy array and cast it
    # to str for every sequence.
    vec_array = ''.join(str(mapping[x]) for x in nuc_array)

    return vec_array


#@numba.jit(nopython=True)
def nuc2vec(nuc_array):
    """
    Encode the first element of `nuc_array` as a list of integer codes.

    Only `nuc_array[0]` is read, so the sequence to encode must be the first
    item of the container passed in.

    inputs ~~

    nuc_array : indexable container whose first element is an iterable of
                the characters 'A', 'C', 'G', 'T'

    outputs ~~

    list of ints with A -> 0, C -> 1, G -> 2, T -> 3

    Raises KeyError if a character other than A/C/G/T is found.
    """
    codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    first_seq = nuc_array[0]

    return list(map(codes.__getitem__, first_seq))

def vec2nuc(arr):
    """
    Turn a sequence of integer codes back into nucleotide letters.

    Only `arr[0]` is read, so the codes to decode must be the first item of
    the container passed in.

    inputs ~~

    arr : indexable container whose first element is an iterable of the
          integers 0, 1, 2, 3

    outputs ~~

    np.array of single-letter strings with 0 -> 'A', 1 -> 'C', 2 -> 'G',
    3 -> 'T'

    Raises KeyError if a value other than 0-3 is found.
    """
    letters = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}

    first_row = arr[0]
    decoded = [letters[code] for code in first_row]

    return np.array(decoded)


def get_seqs_py(fname):
    """
    Read the raw sequences out of a FASTQ file.

    Assumes the standard four-line FASTQ layout (header, sequence, '+'
    separator, quality string), so the sequence is the second line of each
    record. Reading by line position rather than splitting the text on '+'
    keeps the parse correct when '+' shows up inside a quality string.

    inputs ~~

    fname : path to the FASTQ file to read

    outputs ~~

    raw_seqs : np.array of str, one entry per record, in file order
               (an empty array if the file has no records)
    """
    # Open the file that was asked for (not a module-level default), and
    # stream it line by line instead of loading the whole text at once.
    with open(fname) as file:
        raw_seqs = [line.rstrip('\n')
                    for line_number, line in enumerate(file)
                    if line_number % 4 == 1]

    # dtype=str keeps the result a string array even when it is empty
    return np.array(raw_seqs, dtype=str)

In [None]:
def get_allele_counts_day_zero(fname):
    
    """
    Get the allele counts for Day 0 
    
    inputs ~~

    fname : path to where the file lives
    
    outputs ~~

    df : DataFrame sorted by 'seq', with columns
         'seq'    : the sequence encoded as a digit string by nuc2vec_py
         'counts' : number of reads carrying that sequence
         'day'    : 0 for every row
         'allele' : integer tag for the sequence; tags follow the
                    value_counts order, so 0 is the most abundant sequence
    """
    
    t0 = time.time()
    
    #Count each unique sequence once and reuse the result for both columns
    seq_counts = pd.Series(get_seqs_py(fname)).value_counts()
    
    #reset_index exposes the row position in the counts ordering, which
    #becomes the allele tag
    df = pd.DataFrame({'seq': seq_counts.index,
                       'counts': seq_counts.values})
    df = df.reset_index().rename(columns={'index': 'allele'})
    
    #Encode each nucleotide string as a digit string
    df['seq'] = df['seq'].apply(nuc2vec_py)
    
    #Sort the values to make the bisection later
    df.sort_values(by='seq', inplace=True)
    
    #Make a column to keep track of the day 
    df['day'] = 0
    
    df = df[['seq', 'counts', 'day', 'allele']]
    
    t1 = time.time()
    
    print('Making the dataframe took ', t1-t0, ' seconds')
    
    return df

In [None]:
def get_allele_counts_post_day_zero(fname, day_number):
    
    """
    Get the allele counts for the days after day 0. 
    
    inputs ~~

    fname : path to where the file lives

    day_number : value written to the 'day' column of every row
    
    outputs ~~

    df : DataFrame sorted by 'seq', with columns
         'seq'    : the sequence encoded as a digit string by nuc2vec_py
         'counts' : number of reads carrying that sequence
         'day'    : day_number for every row
         No 'allele' column is added here.
    """
    
    t0 = time.time()
    
    #Count each unique sequence once and reuse the result for both columns
    seq_counts = pd.Series(get_seqs_py(fname)).value_counts()
    
    df = pd.DataFrame({'seq': seq_counts.index,
                       'counts': seq_counts.values})
    
    #Encode each nucleotide string as a digit string
    df['seq'] = df['seq'].apply(nuc2vec_py)
    
    df.sort_values(by='seq', inplace=True)
    
    #Make a column to keep track of the day 
    df['day'] = day_number
    
    t1 = time.time()
    
    print('The code ran in ', t1-t0, ' seconds')
    
    return df

In [None]:
def get_allele_barcodes_post_day_zero(day_zero_df, post_df):
    
    """
    In order to keep track of the alleles, this function maps the seqs from 
    experiments post-day zero and assigns them the same tag as the day zero 
    experiment. 
    
    inputs~~
    
    day_zero_df : DataFrame containing the counts for each allele in day 0.
                  Must have a 'seq' column and an 'allele' column.
    
    post_df : DataFrame containing the counts for each allele post day 0, 
              without allele tags. Must have a 'seq' column.
    
    outputs~~~
    
    post_df : the same DataFrame object, modified in place, with a new 
              'allele' column. A sequence that was seen on day 0 gets its 
              day-zero tag; any other sequence gets 
              (number of day-zero rows + its row position in post_df).
    
    NOTE(review): tags given to sequences that are absent from day 0 are only
    unique within a single call, and they stay clear of the day-zero tags 
    only if those run from 0 to n-1 -- confirm before comparing new alleles 
    across days.
    """
    
    t0 = time.time()
    n_barcodes_day_zero = day_zero_df.shape[0]
    
    #Hash table from sequence to its day-zero tag: one constant-time lookup 
    #per sequence instead of scanning the whole day-zero array every time, 
    #and it returns the actual tag rather than a position in the sorted array
    tag_by_seq = dict(zip(day_zero_df['seq'].values,
                          day_zero_df['allele'].values))
    
    #barcodes for post day zero df sequences 
    post_day_zero_allele_bc = []
    
    for i, post_df_seq in enumerate(post_df['seq'].values):
        
        tag = tag_by_seq.get(post_df_seq)
        
        if tag is None:
            
            #Sequence not present on day zero: give it a tag past the 
            #day-zero range
            tag = n_barcodes_day_zero + i
            
        post_day_zero_allele_bc.append(tag)
            
    post_df['allele'] = post_day_zero_allele_bc
    
    t1 = time.time()
    
    print('The code took ', t1-t0, ' seconds to run.')
    
    return post_df


## Making the master dataframe

With these functions in place, we can now get the counts for each allele across time. Let's go ahead and do that for day 0. You'll notice that the seqs consist of a sequence of numbers 0-3. After all is said and done, we'll have to convert back to DNA sequences using the `vec2nuc` function.

In [None]:
# Count the unique sequences in the day-0 file and tag each one as an allele
df_day_zero = get_allele_counts_day_zero(fname_day_0)

In [None]:
# Show the first rows of the day-0 table
df_day_zero.head()

In [None]:
# (rows, columns) of the day-0 table; one row per unique sequence
df_day_zero.shape

Nice, now we can get the counts for each sequence on subsequent days, and then match the allele annotations. 

In [None]:
# Count the unique sequences in the day-1 file; every row is labelled day = 1
df_1 = get_allele_counts_post_day_zero(fname_day_1, 1)

In [None]:
# Show the last rows of the day-1 table
df_1.tail()

As you can see, we now have the counts for each individual sequence; we just have to match each sequence to the allele annotations from day zero. We're making use of the `bisect` module, but it might be a better idea to use some other type of pattern matching. An alternative idea is to take the log of each number, and convert the number back after matching. Numbers that are fewer than 70 digits long are the ones that will need 0s added on the left. Last time I ran this code it took > 3 hrs to run...

In [None]:
#Get the "barcodes" a.k.a. allele annotations for the day one dataframe,
#using the day zero dataframe as the reference
df_day_one = get_allele_barcodes_post_day_zero(df_day_zero, df_1)


In [None]:
# Count the unique sequences in the day-3 file; every row is labelled day = 3
df_3 = get_allele_counts_post_day_zero(fname_day_3, 3)


In [None]:

# Attach allele annotations to the day-3 table, using day zero as the reference
df_day_three = get_allele_barcodes_post_day_zero(df_day_zero, df_3)