# Trying to understand mpathic and how to make it work with stan

In [11]:
import re
import sys

import pandas as pd

In [12]:
%load_ext blackcellmagic

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


In [31]:
# Whitespace-delimited table of sequence counts. `sep=r"\s+"` is the
# non-deprecated spelling of `delim_whitespace=True`.
df = pd.read_csv(
    "../data/RegSeq/sequence_counts/ykgEarcAdataset_alldone_with_large",
    sep=r"\s+",
)
df

Unnamed: 0,ct,ct_0,ct_1,seq
0,4.0,4.0,0.0,ACAATTTCACCATAAAATGTCGGCGTTGCCGAAAGAAATAAAATGA...
1,19.0,13.0,6.0,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
2,5.0,5.0,0.0,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
3,11.0,11.0,0.0,ACGATTATCCCATAAAATGTGAACGATGCCGAAAGAAATAAAATTA...
4,2.0,2.0,0.0,ACGATTTACCCGCAAAACGGGAGCGACGCCGCAAGAAACAAAATTA...
...,...,...,...,...
908,8.0,4.0,4.0,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
909,4.0,4.0,0.0,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
910,5.0,1.0,4.0,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...
911,7.0,7.0,0.0,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...


The function which is called to run the inference is `learn_model.main`. Let's just copy the code of the function and then try to understand what every line is doing.

In [None]:
def main(
    df,
    lm='IM',
    modeltype='MAT',
    LS_means_std=None,
    db=None,
    iteration=30000,
    burnin=1000,
    thin=10,
    runnum=0,
    initialize='LS',
    start=0,
    end=None,
    foreground=1,
    background=0,
    alpha=0,
    pseudocounts=1,
    test=False,
    drop_library=False,
    verbose=False
):
    """Fit a sequence model to a counts dataset (verbatim copy of mpathic's `learn_model.main`).

    The frame is cleaned (zero-count rows dropped, sequences clipped to
    ``start:end``, collapsed with ``utils.collapse_further``, off-length
    sequences removed), a matrix is fitted with the method selected by ``lm``,
    passed through ``gauge.fix_matrix`` / ``gauge.fix_neighbor`` and returned
    as a dataframe with a ``pos`` column followed by one ``val_*`` column per
    character.

    Args:
        df: dataset with a ``ct`` column, ``ct_*`` columns and one sequence column.
        lm: learning method; the body has branches for 'ER', 'LS' and 'IM'.
        modeltype: 'MAT' or 'NBR'.
        LS_means_std: optional argument for ``io.load_meanstd`` (only read when lm == 'LS').
        db, iteration, burnin, thin, runnum: forwarded to ``MaximizeMI_memsaver`` (lm == 'IM').
        initialize: starting matrix for lm == 'IM'; 'rand' or 'LS'.
        start, end: slice bounds applied to every sequence.
        foreground, background, pseudocounts: forwarded to ``Berg_von_Hippel`` (lm == 'ER').
        alpha: forwarded to ``Compute_Least_Squares`` (and to the recursive LS call).
        test: not referenced anywhere in this body.
        drop_library: when true and lm == 'LS', attempts to drop ``ct_0``.
        verbose: forwarded to ``MaximizeMI_memsaver`` and the recursive LS call.

    Returns:
        The dataframe returned by ``qc.validate_model(output_df, fix=True)``.

    Raises:
        SortSeqError: if the number of sequence columns is not exactly one, or
            from the ``drop_library`` branch.
    """
    
    # Determine dictionary
    seq_cols = qc.get_cols_from_df(df,'seqs')
    
    if not len(seq_cols)==1:
        raise SortSeqError('Dataframe has multiple seq cols: %s'%str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)
    
    '''Check to make sure the chosen dictionary type correctly describes
         the sequences. An issue with this test is that if you have DNA sequence
         but choose a protein dictionary, you will still pass this test bc A,C,
         G,T are also valid amino acids'''
    #set name of sequences column based on type of sequence
    type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    # lin_seq_dict / lin_inv_dict are not used again in this function.
    lin_seq_dict,lin_inv_dict = utils.choose_dict(dicttype,modeltype='MAT')
    #wtseq = utils.profile_counts(df.copy(),dicttype,return_wtseq=True,start=start,end=end)
    #wt_seq_dict_list = [{inv_dict[np.mod(i+1+seq_dict[w],len(seq_dict))]:i for i in range(len(seq_dict)-1)} for w in wtseq]
    # Copy of seq_dict without the entry whose index is the largest one.
    # Note the loop names are swapped relative to convention: `v` is the
    # dictionary key (the character) and `k` is its integer index.
    par_seq_dict = {v:k for v,k in seq_dict.items() if k != (len(seq_dict)-1)}
    #drop any rows with ct = 0
    df = df[df.loc[:,'ct'] != 0]
    df.reset_index(drop=True,inplace=True)
    
    #If there are sequences of different lengths, then print error but continue
    if len(set(df[seq_col_name].apply(len))) > 1:
         sys.stderr.write('Lengths of all sequences are not the same!')
    #select target sequence region
    df.loc[:,seq_col_name] = df.loc[:,seq_col_name].str.slice(start,end)
    df = utils.collapse_further(df)
    col_headers = utils.get_column_headers(df)
    #make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)
    #create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True,drop=True)
    #Drop any sequences with incorrect length
    if not end:
        '''is no value for end of sequence was supplied, assume first seq is
            correct length'''
        # NOTE(review): the sequences were already sliced with (start, end)
        # above, so subtracting `start` again looks like double counting; with
        # start > 0 this would make seqL shorter than every sequence - confirm.
        seqL = len(df[seq_col_name][0]) - start
    else:
        seqL = end-start
    df = df[df[seq_col_name].apply(len) == (seqL)]
    df.reset_index(inplace=True,drop=True)
    #Do something different for each type of learning method (lm)
    if lm == 'ER':
        emat = Berg_von_Hippel(
            df,dicttype,foreground=foreground,background=background,
            pseudocounts=pseudocounts)
    if lm == 'LS':
        '''First check that is we don't have a penalty for ridge regression,
            that we at least have all possible base values so that the analysis
            will not fail'''
        if LS_means_std: #If user supplied preset means and std for each bin
            means_std_df = io.load_meanstd(LS_means_std)

            #change bin number to 'ct_number' and then use as index
            labels = list(means_std_df['bin'].apply(add_label))
            std = means_std_df['std']
            std.index = labels
            #Change Weighting of each sequence by dividing counts by bin std
            df[labels] = df[labels].div(std)
            means = means_std_df['mean']
            means.index = labels
        else:
            means = None
        #drop all rows without counts
        df['ct'] = df[col_headers].sum(axis=1)
        df = df[df.ct != 0]        
        df.reset_index(inplace=True,drop=True)
        ''' For sort-seq experiments, bin_0 is library only and isn't the lowest
            expression even though it is will be calculated as such if we proceed.
            Therefore is drop_library is passed, drop this column from analysis.'''
        if drop_library:
            try:     
                # NOTE(review): no axis/columns argument is given, so pandas
                # presumably looks for a *row* labelled 'ct_0' here - confirm.
                # The bare `except` below also catches the SortSeqError raised
                # inside this `try`, replacing its message.
                df.drop('ct_0',inplace=True)
                col_headers = utils.get_column_headers(df)
                if len(col_headers) < 2:
                    raise SortSeqError(
                        '''After dropping library there are no longer enough 
                        columns to run the analysis''')
            except:
                raise SortSeqError('''drop_library option was passed, but no ct_0
                    column exists''')
        #parameterize sequences into 3xL vectors
                               
        raveledmat,batch,sw = utils.genweightandmat(
                                  df,par_seq_dict,dicttype,means=means,modeltype=modeltype)
        #Use ridge regression to find matrix.       
        emat = Compute_Least_Squares(raveledmat,batch,sw,alpha=alpha)

    if lm == 'IM':
        seq_mat,wtrow = numerics.dataset2mutarray(df.copy(),modeltype)
        #this is also an MCMC routine, do the same as above.
        # NOTE(review): `emat_0` is only assigned for initialize in
        # {'rand', 'LS'} (and, for 'rand', modeltype in {'MAT', 'NBR'}); any
        # other value reaches MaximizeMI_memsaver with `emat_0` unbound.
        if initialize == 'rand':
            if modeltype == 'MAT':
                emat_0 = utils.RandEmat(len(df[seq_col_name][0]),len(seq_dict))
            elif modeltype == 'NBR':
                emat_0 = utils.RandEmat(len(df['seq'][0])-1,len(seq_dict))
        elif initialize == 'LS':
            emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
            # Recursive call: a least-squares fit on the already clipped data
            # (hence start=0, end=None) supplies the starting matrix.
            emat_0_df = main(df.copy(),lm='LS',modeltype=modeltype,alpha=alpha,start=0,end=None,verbose=verbose)
            emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))   
            #pymc doesn't take sparse mat        
        emat = MaximizeMI_memsaver(
                seq_mat,df.copy(),emat_0,wtrow,db=db,iteration=iteration,burnin=burnin,
                thin=thin,runnum=runnum,verbose=verbose)
    #now format the energy matrices to get them ready to output
    if (lm == 'IM' or lm == 'memsaver'):       
        if modeltype == 'NBR':
             emat_typical = gauge.fix_neighbor(np.transpose(emat))
        elif modeltype == 'MAT':
             emat_typical = gauge.fix_matrix(np.transpose(emat))
    
    elif lm == 'ER': 
        '''the emat for this format is currently transposed compared to other formats
        it is also already a data frame with columns [pos,val_...]'''
        emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        emat_typical = emat[emat_cols]
        emat_typical = (gauge.fix_matrix((np.array(emat_typical))))
        
    else: #must be Least squares
        emat_typical = utils.emat_typical_parameterization(emat,len(seq_dict))        
        if modeltype == 'NBR':
             emat_typical = gauge.fix_neighbor(np.transpose(emat_typical))
        elif modeltype == 'MAT':
             emat_typical = gauge.fix_matrix(np.transpose(emat_typical))
    
    em = pd.DataFrame(emat_typical)
    em.columns = val_cols
    #add position column
    # The NBR range is one element shorter than the MAT range.
    if modeltype == 'NBR':
        pos = pd.Series(range(start,start - 1 + len(df[seq_col_name][0])),name='pos') 
    else:
        pos = pd.Series(range(start,start + len(df[seq_col_name][0])),name='pos')    
    output_df = pd.concat([pos,em],axis=1)

    # Validate model and return
    output_df = qc.validate_model(output_df,fix=True)
    return output_df

### Data preparation

The first line of the function uses a function from the `qc` submodule.

In [4]:
def get_cols_from_df(df, col_types):
    """
    Collect the names of all columns of `df` that `is_col_type` accepts for
    `col_types`, and return them as an alphabetically sorted list.
    """
    matching = filter(lambda name: is_col_type(name, col_types), df.columns)
    return sorted(matching)

Here we need another function, which is found in the same submodule, and we also need to define a dictionary.

In [16]:
# Regular expression that a column name must match to count as each column type.
col_patterns = {
    'seqs'  :   r'^seq$|^seq_rna$|^seq_pro$',
    'tag'   :   r'^tag$',
    'cts'   :   r'^ct',
    'ct_'   :   r'^ct_',
    'ct'    :   r'^ct$',
    'file'  :   r'^file$',
    'bin'   :   r'^bin$',
    'pos'   :   r'^pos$',
    'val'   :   r'^val$',
    'vals'  :   r'^val_|^val$',
    'info'  :   r'^info$',
    'infos' :   r'^info$|^info_err$',
    'errs'  :   r'_err$',
    'freq_' :   r'^freq_',
    'wts'   :   r'^wt$|^wt_rna$|^wt_pro$',
    'mut'   :   r'^mut$',
    'muts'  :   r'^mut$|^mut_err$',
    'mean'  :   r'^mean$',
    'std'   :   r'^std$',
    'lr'    :   r'^left$|^right$',
    'contig':   r'^contig$',
    'ori'   :   r'^ori$'
}

def is_col_type(col_name, col_types="all"):
    """
    Check whether `col_name` is a valid column name for the given column type(s).

    Args:
        col_name: the column name to test.
        col_types: a key of `col_patterns` (single type), a list of such keys
            (multiple types), or "all" (default) to test against every type.

    Returns:
        True if `col_name` matches the pattern of at least one requested type,
        otherwise False.

    Raises:
        SortSeqError: if `col_types` is neither a string nor a list.
        KeyError: if a requested type is not a key of `col_patterns`.
    """
    # Normalise col_types into a list of keys of col_patterns
    if isinstance(col_types, list):
        col_types_list = col_types
    elif isinstance(col_types, str):
        if col_types == "all":
            # Iterate over the type *names*; the previous code used the
            # patterns themselves as keys, which raised KeyError.
            col_types_list = list(col_patterns.keys())
        else:
            col_types_list = [col_types]
    else:
        raise SortSeqError("col_types is not a string or a list.")

    # Resolve every type first so that an unknown type always raises KeyError,
    # even if an earlier type already matched.
    patterns = [col_patterns[col_type] for col_type in col_types_list]

    # True if any pattern matches
    return any(re.search(pattern, col_name) for pattern in patterns)

Let's redefine the function `main` and only run the first line.

In [17]:
def main(df):
    """First step of `learn_model.main`: return the sequence column name(s) found in `df`."""
    return get_cols_from_df(df, "seqs")


main(df)

['seq']

OK, that works. Let's proceed with the next line, which simply checks that the data frame contains exactly one sequence column. The following line is `dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]`, which simply looks up the sequence type in a dictionary.

In [18]:
# Sequence type ("dna", "rna" or "protein") implied by each sequence-like column name.
colname_to_seqtype_dict = dict(
    seq="dna",
    seq_rna="rna",
    seq_pro="protein",
    tag="dna",
    wt="dna",
    wt_rna="rna",
    wt_pro="protein",
)

In the following line, `seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)`, we choose a dictionary that translates letters into numbers. Which dictionary is chosen depends on the type of sequence we are looking at, and on the model, which can be either an energy matrix (`modeltype="MAT"`) or a neighbor model (`modeltype="NBR"`).

In [20]:
def choose_dict(dicttype, modeltype="MAT"):
    """Get the numbering dictionaries for DNA, RNA or protein sequences.

    Args:
        dicttype: "dna", "rna" or "protein".
        modeltype: "MAT" for single characters, or "NBR" for all ordered pairs
            of characters (pair made of characters i and z gets index
            i * alphabet_size + z).

    Returns:
        (seq_dict, inv_dict): `seq_dict` maps each character (or character
        pair) to its integer index, `inv_dict` is the inverse mapping.

    Raises:
        SortSeqError: for an unknown `modeltype` or `dicttype`.
    """
    # Characters of each alphabet, listed in index order.
    alphabets = {
        "dna": "ACGT",
        "rna": "ACGU",
        "protein": "*ACDEFGHIKLMNPQRSTVWY",
    }

    if modeltype not in ("MAT", "NBR"):
        raise SortSeqError("Unkonwn model type: {}".format(modeltype))
    if dicttype not in alphabets:
        raise SortSeqError("Unkonwn dicttype: {}".format(dicttype))

    # The single-character dictionaries are needed for both model types; the
    # "NBR" branch previously read them before they were ever assigned.
    seq_dict = {char: idx for idx, char in enumerate(alphabets[dicttype])}
    inv_dict = {idx: char for char, idx in seq_dict.items()}

    if modeltype == "NBR":
        n_chars = len(seq_dict)
        seq_dict = {
            inv_dict[i] + inv_dict[z]: i * n_chars + z
            for i in range(n_chars)
            for z in range(n_chars)
        }
        inv_dict = {idx: pair for pair, idx in seq_dict.items()}

    return seq_dict, inv_dict

In [24]:
def main(df, modeltype="MAT"):
    """Run `learn_model.main` up to the choice of the character dictionaries and return them."""
    # Exactly one sequence column is allowed
    seq_cols = get_cols_from_df(df, "seqs")
    if len(seq_cols) != 1:
        raise SortSeqError("Dataframe has multiple seq cols: %s" % str(seq_cols))

    # Sequence type follows from the column name
    dicttype = colname_to_seqtype_dict[seq_cols[0]]

    char_to_index, index_to_char = choose_dict(dicttype, modeltype=modeltype)
    return char_to_index, index_to_char


main(df)

({'A': 0, 'C': 1, 'G': 2, 'T': 3}, {0: 'A', 1: 'C', 2: 'G', 3: 'T'})

So far so good. The next two lines are `type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}` and `seq_col_name = type_name_dict[dicttype]` where the name of the sequence column is just taken again?

In [25]:
def main(df, modeltype="MAT"):
    """Run `learn_model.main` up to the lookup of the sequence column name and return that name."""
    # Exactly one sequence column is allowed
    seq_cols = get_cols_from_df(df, "seqs")
    if len(seq_cols) != 1:
        raise SortSeqError("Dataframe has multiple seq cols: %s" % str(seq_cols))

    # Sequence type follows from the column name
    dicttype = colname_to_seqtype_dict[seq_cols[0]]

    # Dictionaries are not returned at this step, but the call is kept so that
    # invalid inputs still raise as before.
    char_to_index, index_to_char = choose_dict(dicttype, modeltype=modeltype)

    name_for_type = {"dna": "seq", "rna": "seq_rna", "protein": "seq_pro"}
    return name_for_type[dicttype]


main(df)

'seq'

The next line creates a dictionary from the sequence dictionary containing all entries but the last. This seems to be needed only for least-squares inference, and I don't yet understand what it is needed for.

In [27]:
def main(df, modeltype="MAT"):
    """Run `learn_model.main` up to the reduced character dictionary and return it."""
    # Exactly one sequence column is allowed
    seq_cols = get_cols_from_df(df, "seqs")
    if len(seq_cols) != 1:
        raise SortSeqError("Dataframe has multiple seq cols: %s" % str(seq_cols))

    # Sequence type follows from the column name
    dicttype = colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = choose_dict(dicttype, modeltype=modeltype)

    name_for_type = {"dna": "seq", "rna": "seq_rna", "protein": "seq_pro"}
    seq_col_name = name_for_type[dicttype]

    # Same mapping as seq_dict, minus the character carrying the highest index
    last_index = len(seq_dict) - 1
    reduced = {}
    for char, index in seq_dict.items():
        if index != last_index:
            reduced[char] = index
    return reduced


main(df)

{'A': 0, 'C': 1, 'G': 2}

The next two lines are only removing sequences from the dataframe which were not observed in the experiment, and should not be there in the first place, but I guess this is a nice way to be sure about that.

In [30]:
def main(df, modeltype="MAT"):
    """Run `learn_model.main` up to the removal of zero-count rows and return the filtered frame."""
    # Exactly one sequence column is allowed
    seq_cols = get_cols_from_df(df, "seqs")
    if len(seq_cols) != 1:
        raise SortSeqError("Dataframe has multiple seq cols: %s" % str(seq_cols))

    # Sequence type follows from the column name
    dicttype = colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = choose_dict(dicttype, modeltype=modeltype)

    name_for_type = {"dna": "seq", "rna": "seq_rna", "protein": "seq_pro"}
    seq_col_name = name_for_type[dicttype]

    # Same mapping as seq_dict, minus the character carrying the highest index
    last_index = len(seq_dict) - 1
    par_seq_dict = {char: index for char, index in seq_dict.items() if index != last_index}

    # Keep only sequences that were actually observed, renumbered from 0
    observed = df["ct"] != 0
    return df[observed].reset_index(drop=True)


main(df)

Unnamed: 0,ct,ct_0,ct_1,seq
0,4.0,4.0,0.0,ACAATTTCACCATAAAATGTCGGCGTTGCCGAAAGAAATAAAATGA...
1,19.0,13.0,6.0,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
2,5.0,5.0,0.0,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
3,11.0,11.0,0.0,ACGATTATCCCATAAAATGTGAACGATGCCGAAAGAAATAAAATTA...
4,2.0,2.0,0.0,ACGATTTACCCGCAAAACGGGAGCGACGCCGCAAGAAACAAAATTA...
...,...,...,...,...
908,8.0,4.0,4.0,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
909,4.0,4.0,0.0,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
910,5.0,1.0,4.0,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...
911,7.0,7.0,0.0,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...


Next the function checks that there is only one sequence length. This is also confirmed in prior steps of our analysis, so it should not be needed at this step, however, it might not be included in every data processing pipeline, so better keep it here.

    #If there are sequences of different lengths, then print error but continue
    if len(set(df[seq_col_name].apply(len))) > 1:
         sys.stderr.write('Lengths of all sequences are not the same!')

In [32]:
def main(
    df,
    modeltype="MAT",
):
    """Run `learn_model.main` up to the sequence-length check.

    Args:
        df: dataset with a `ct` column and exactly one sequence column.
        modeltype: "MAT" or "NBR", forwarded to `choose_dict`.

    Returns:
        A new dataframe without the rows whose `ct` is 0, re-indexed from 0.
        The input frame is not modified.

    Raises:
        SortSeqError: if the number of sequence columns is not exactly one.
    """
    # Determine dictionary
    seq_cols = get_cols_from_df(df, "seqs")

    if len(seq_cols) != 1:
        raise SortSeqError("Dataframe has multiple seq cols: %s" % str(seq_cols))
    dicttype = colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = choose_dict(dicttype, modeltype=modeltype)

    type_name_dict = {"dna": "seq", "rna": "seq_rna", "protein": "seq_pro"}
    seq_col_name = type_name_dict[dicttype]

    # seq_dict without the character that carries the highest index
    par_seq_dict = {
        char: index for char, index in seq_dict.items() if index != len(seq_dict) - 1
    }

    # Drop any rows with ct = 0. Building a fresh frame (instead of an
    # in-place reset on a filtered slice) keeps this cell idempotent.
    df = df[df["ct"] != 0].reset_index(drop=True)

    # If there are sequences of different lengths, print an error but continue.
    # Needs `import sys` in the notebook's import cell.
    if df[seq_col_name].map(len).nunique() > 1:
        sys.stderr.write("Lengths of all sequences are not the same!")
    return df


main(df)

Unnamed: 0,ct,ct_0,ct_1,seq
0,4.0,4.0,0.0,ACAATTTCACCATAAAATGTCGGCGTTGCCGAAAGAAATAAAATGA...
1,19.0,13.0,6.0,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
2,5.0,5.0,0.0,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
3,11.0,11.0,0.0,ACGATTATCCCATAAAATGTGAACGATGCCGAAAGAAATAAAATTA...
4,2.0,2.0,0.0,ACGATTTACCCGCAAAACGGGAGCGACGCCGCAAGAAACAAAATTA...
...,...,...,...,...
908,8.0,4.0,4.0,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
909,4.0,4.0,0.0,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
910,5.0,1.0,4.0,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...
911,7.0,7.0,0.0,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...


In the next two lines, we clip bases off the sequence, depending on the arguments given to the function. Then, the dataframe is reorganized into a common format.

    #select target sequence region
    df.loc[:,seq_col_name] = df.loc[:,seq_col_name].str.slice(start,end)
    df = utils.collapse_further(df)

In [35]:
def collapse_further(df):
    """Sum the rows of a clipped dataset that share the same sequence.

    Args:
        df: dataframe with one column whose name contains "seq" and any number
            of columns whose names contain "ct".

    Returns:
        A new dataframe with one row per distinct sequence (sorted by
        sequence, as produced by `groupby`), holding the summed count columns
        in their original order followed by the sequence column.

    Raises:
        IndexError: if no column name contains "seq".
    """
    # automatically pick the column name for the sequences
    seq_col_name = [x for x in df.columns if "seq" in x][0]
    ct_columns = [x for x in df.columns if "ct" in x]

    output_df = df.groupby(seq_col_name).sum().reset_index()

    # reorder columns so we have 'ct' first and 'seq' last
    output_df = output_df[ct_columns + [seq_col_name]]

    # An evaluated 'val' column would now be incorrect, so drop it if present.
    # errors="ignore" replaces the former bare `except: pass`, which also hid
    # unrelated failures.
    return output_df.drop(columns="val", errors="ignore")


def main(
    df,
    modeltype="MAT",
    start=0,
    end=None
):
    """Run `learn_model.main` up to the clipping and collapsing of the sequences.

    Args:
        df: dataset with a `ct` column and exactly one sequence column.
        modeltype: "MAT" or "NBR", forwarded to `choose_dict`.
        start, end: slice bounds applied to every sequence.

    Returns:
        The frame returned by `collapse_further` for the clipped data. The
        input frame is not modified.

    Raises:
        SortSeqError: if the number of sequence columns is not exactly one.
    """
    # Determine dictionary
    seq_cols = get_cols_from_df(df, "seqs")

    if len(seq_cols) != 1:
        raise SortSeqError("Dataframe has multiple seq cols: %s" % str(seq_cols))
    dicttype = colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = choose_dict(dicttype, modeltype=modeltype)

    type_name_dict = {"dna": "seq", "rna": "seq_rna", "protein": "seq_pro"}
    seq_col_name = type_name_dict[dicttype]

    # seq_dict without the character that carries the highest index
    par_seq_dict = {
        char: index for char, index in seq_dict.items() if index != len(seq_dict) - 1
    }

    # Drop any rows with ct = 0 (fresh frame rather than an in-place reset on
    # a filtered slice).
    df = df[df["ct"] != 0].reset_index(drop=True)

    # If there are sequences of different lengths, print an error but continue.
    # Needs `import sys` in the notebook's import cell.
    if df[seq_col_name].map(len).nunique() > 1:
        sys.stderr.write("Lengths of all sequences are not the same!")

    # Select target sequence region. `.assign` returns a new frame, so no
    # value is written into a slice of the caller's data
    # (avoids SettingWithCopyWarning).
    df = df.assign(**{seq_col_name: df[seq_col_name].str.slice(start, end)})
    df = collapse_further(df)
    return df


main(df)

Unnamed: 0,ct,ct_0,ct_1,seq
0,4.0,4.0,0.0,ACAATTTCACCATAAAATGTCGGCGTTGCCGAAAGAAATAAAATGA...
1,19.0,13.0,6.0,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
2,5.0,5.0,0.0,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
3,11.0,11.0,0.0,ACGATTATCCCATAAAATGTGAACGATGCCGAAAGAAATAAAATTA...
4,2.0,2.0,0.0,ACGATTTACCCGCAAAACGGGAGCGACGCCGCAAGAAACAAAATTA...
...,...,...,...,...
908,8.0,4.0,4.0,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
909,4.0,4.0,0.0,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
910,5.0,1.0,4.0,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...
911,7.0,7.0,0.0,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...


Again, this did not change anything, but only because the dataframe is already in the right format. The next lines only do further formatting of the data table.

    col_headers = utils.get_column_headers(df)
    #make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)
    #create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True,drop=True)
    #Drop any sequences with incorrect length
    if not end:
        '''is no value for end of sequence was supplied, assume first seq is
            correct length'''
        seqL = len(df[seq_col_name][0]) - start
    else:
        seqL = end-start
    df = df[df[seq_col_name].apply(len) == (seqL)]
    df.reset_index(inplace=True,drop=True)

In [36]:
def get_column_headers(df, exptype=None):
    """Return, in column order, the names of all columns of `df` containing 'ct_'.

    `exptype` is accepted but not used.
    """
    return list(filter(lambda column: 'ct_' in column, df.columns))

In [39]:
def main(
    df,
    modeltype="MAT",
    start=0,
    end=None
):
    """Run the complete data-preparation part of `learn_model.main`.

    Args:
        df: dataset with a `ct` column, `ct_*` columns and exactly one
            sequence column.
        modeltype: "MAT" or "NBR", forwarded to `choose_dict`.
        start, end: slice bounds applied to every sequence.

    Returns:
        A new dataframe with zero-count rows removed, sequences clipped to
        `start:end` and collapsed, `ct_*` columns cast to int, and sequences of
        unexpected length removed. The input frame is not modified.

    Raises:
        SortSeqError: if the number of sequence columns is not exactly one.
    """
    # Determine dictionary
    seq_cols = get_cols_from_df(df, "seqs")

    if len(seq_cols) != 1:
        raise SortSeqError("Dataframe has multiple seq cols: %s" % str(seq_cols))
    dicttype = colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = choose_dict(dicttype, modeltype=modeltype)

    type_name_dict = {"dna": "seq", "rna": "seq_rna", "protein": "seq_pro"}
    seq_col_name = type_name_dict[dicttype]

    # seq_dict without the character that carries the highest index
    par_seq_dict = {
        char: index for char, index in seq_dict.items() if index != len(seq_dict) - 1
    }

    # Drop any rows with ct = 0 (fresh frame rather than an in-place reset on
    # a filtered slice).
    df = df[df["ct"] != 0].reset_index(drop=True)

    # If there are sequences of different lengths, print an error but continue.
    # Needs `import sys` in the notebook's import cell.
    if df[seq_col_name].map(len).nunique() > 1:
        sys.stderr.write("Lengths of all sequences are not the same!")

    # Select target sequence region. `.assign` returns a new frame, so no
    # value is written into a slice of the caller's data
    # (avoids SettingWithCopyWarning).
    df = df.assign(**{seq_col_name: df[seq_col_name].str.slice(start, end)})
    df = collapse_further(df)
    col_headers = get_column_headers(df)
    # make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)
    # create vector of column names (used by later steps of the original function)
    val_cols = ["val_" + inv_dict[i] for i in range(len(seq_dict))]
    df = df.reset_index(drop=True)

    # Drop any sequences with incorrect length
    if not end:
        # No end was supplied: assume the first sequence has the correct
        # length. The sequences are already clipped at this point, so `start`
        # must NOT be subtracted again; doing so (as in the copied original)
        # removes every row whenever start > 0.
        seqL = len(df[seq_col_name][0])
    else:
        seqL = end - start
    df = df[df[seq_col_name].map(len) == seqL].reset_index(drop=True)

    return df


main(df)

Unnamed: 0,ct,ct_0,ct_1,seq
0,4.0,4,0,ACAATTTCACCATAAAATGTCGGCGTTGCCGAAAGAAATAAAATGA...
1,19.0,13,6,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
2,5.0,5,0,ACGAGTTCCCCATAAAATTTGAGCGATGCCGAAAGAAATAAAAGTA...
3,11.0,11,0,ACGATTATCCCATAAAATGTGAACGATGCCGAAAGAAATAAAATTA...
4,2.0,2,0,ACGATTTACCCGCAAAACGGGAGCGACGCCGCAAGAAACAAAATTA...
...,...,...,...,...
908,8.0,4,4,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
909,4.0,4,0,TTGATTTTCCCATTAAACATGCCCGATGCCGAAAGACATAAAATTA...
910,5.0,1,4,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...
911,7.0,7,0,TTGTTTTTCCCAGAAAATGTAAGTCACGTCGACAGAAATAAAATTA...


### Inference setup

Now that we understood all lines which format the data, we can start looking at the inference part of the code. For now, we start with the information maximization method. The first line there is:

    seq_mat,wtrow = numerics.dataset2mutarray(df.copy(), modeltype)
    
As we look at this function, we will need to do a little detour.

#### => `dataset2mutarray`

In [None]:
def dataset2mutarray(dataset_df, modeltype, chunksize=1000, rowsforwtcalc=100):
    """Encode every sequence of a dataset as a sparse row with the wild-type entries zeroed.

    Verbatim copy of the mpathic function of the same name.

    Args:
        dataset_df: dataframe containing a sequence column.
        modeltype: 'MAT' or 'NBR'; selects the `mpathic.fast` encoder.
        chunksize: number of rows encoded per loop iteration.
        rowsforwtcalc: number of leading rows passed to `profile_mut` to
            determine the wild-type sequence (capped at the number of rows).

    Returns:
        (mutarray_csr, wtrow): a CSR matrix with one row per dataset row and
        one column per feature, in which the columns flagged by `wtrow` are
        set to 0; and `wtrow`, the flattened boolean encoding of the
        wild-type sequence.

    Raises:
        SortSeqError: if `modeltype` is neither 'MAT' nor 'NBR'.
    """

    # Determine the type of model and set seq2array function appropriately
    if modeltype=='MAT':
        seqs2array = mpathic.fast.seqs2array_for_matmodel
    elif modeltype=='NBR':
        seqs2array = mpathic.fast.seqs2array_for_nbrmodel
    else:
        raise SortSeqError('Unknown model type: %s'%modeltype)

    # Determine seqtype, etc.
    seqcol = qc.get_cols_from_df(dataset_df,'seqs')[0]
    seqtype = qc.colname_to_seqtype_dict[seqcol]
    wtcol = qc.seqtype_to_wtcolname_dict[seqtype]

    # Compute the wt sequence
    # Only the first `rowsforwtcalc` rows are used for this.
    rowsforwtcalc = min(rowsforwtcalc,dataset_df.shape[0])
    dataset_head_df = dataset_df.head(rowsforwtcalc)
    mut_df = profile_mut(dataset_head_df)
    wtseq = ''.join(list(mut_df[wtcol]))
    # Boolean mask over features: True where the wild-type encoding is non-zero.
    wtrow = seqs2array([wtseq], seq_type=seqtype).ravel().astype(bool)
    numfeatures = len(wtrow)

    # Process dataframe in chunks
    # startrow/endrow are inclusive row bounds of the current chunk.
    startrow = 0
    endrow = startrow+chunksize-1
    numrows = dataset_df.shape[0]

    # Fill in mutarray (a lil matrix) chunk by chunk
    mutarray_lil = lil_matrix((numrows,numfeatures),dtype=int)
    matrix_filled = False
    while not matrix_filled:

        if startrow >= numrows:
            # Nothing left to encode (also covers an empty dataset).
            matrix_filled = True
            continue
        elif endrow >= numrows:
            # Last, possibly shorter, chunk: clamp to the final row.
            endrow = numrows-1
            matrix_filled = True

        # Compute seqarray
        seqlist = list(dataset_df[seqcol][startrow:(endrow+1)])
        seqarray = seqs2array(seqlist, seq_type=seqtype)

        # Remove wt entries
        tmp = seqarray.copy()
        tmp[:,wtrow] = 0

        # Store results from this chunk
        mutarray_lil[startrow:(endrow+1),:] = tmp

        # Increment rows
        startrow = endrow+1
        endrow = startrow + chunksize - 1

    # Convert to csr matrix
    mutarray_csr = mutarray_lil.tocsr()

    # Return vararray as well as binary representation of wt seq
    return mutarray_csr, wtrow

#### => `mpathic.fast.seqs2array_for_matmodel`

The first part uses code written in Cython (note the `cdef` declarations), I think. So from here on we probably need to switch to Julia to proceed; there we won't need C or anything similar and can write everything in native Julia. From here on we therefore just describe what the code is supposed to do instead of running it line by line.

    def seqs2array_for_matmodel(list seq_list, str seq_type, bool safe=True):
    """
    Converts a list of sequences (all of which must be the same length) to a numpy array to be used for matrix model evaluation
    """
Just define some parameters

    cdef np.ndarray[DTYPE_t, ndim=2] mat
    cdef str c, seq
    cdef int num_seqs, seq_length, num_chars, i, n, k
    cdef dict c_to_i_dict

Self explaining line

    # Validate seq_type if in safe mode
    if safe and (not seq_type in qc.seqtypes):
        raise SortSeqError('Invalid seq_type: %s.'%seq_type)

Just another dictionary which translates letters from a certain sequence type into integers

    # Get character dictionary
    c_to_i_dict = qc.char_to_mat_index_dicts[seq_type]
    num_chars = len(c_to_i_dict)

Initialize a matrix in which each sequence is translated into an array of 1's and 0's, with the ith to ith+alphabet_size positions being for the ith letter in the sequence, and having a 1 at the position in this subsequence depending on the dictionary.

    # Initialize matrix
    num_seqs = len(seq_list)
    seq_length = len(seq_list[0])
    mat = np.zeros([num_seqs,num_chars*seq_length], dtype=DTYPE)

    # Fill matrix row by row
    for n, seq in enumerate(seq_list):

        # Validate sequence composition if in safe mode
        if safe and qc.seqerr_re_dict[seq_type].search(seq):
            raise SortSeqError(\
                'Invalid character found in %s sequence.'%seq_type)

        # Validate sequence length if in safe mode
        if safe and len(seq)!=seq_length:
            raise SortSeqError('Sequences are not all the same length.')

        # Fill in array
        for i, c in enumerate(seq):
            k = c_to_i_dict[c]
            mat[n,num_chars*i+k] = 1
    return mat


#### `dataset2mutarray`<=

Going back to the `dataset2mutarray` function. After selecting the `seqs2array` function, the next three lines are 

    # Determine seqtype, etc.
    seqcol = qc.get_cols_from_df(dataset_df,'seqs')[0]
    seqtype = qc.colname_to_seqtype_dict[seqcol]
    wtcol = qc.seqtype_to_wtcolname_dict[seqtype]
    
where we already used the function in the first line and defined the dictionary in the second, leaving us only with defining the dictionary in the third line.

In [40]:
# Name of the wild-type column used for each sequence type.
seqtype_to_wtcolname_dict = dict(
    dna="wt",
    rna="wt_rna",
    protein="wt_pro",
)

In the next lines, we try to find the wild-type sequence. The first two lines select at most `rowsforwtcalc` rows from the top of the data frame. The third line calls the function `profile_mut` on these rows, which we will investigate below.

    # Compute the wt sequence
    rowsforwtcalc = min(rowsforwtcalc,dataset_df.shape[0])
    dataset_head_df = dataset_df.head(rowsforwtcalc)
    mut_df = profile_mut(dataset_head_df)
   

#### => `profile_mut`

The first step is to validate the data set. In this pipeline, however, this has already been done multiple times, so we can ignore it for now. Below we go through the function piece by piece.
    
    def main(dataset_df, bin=None, start=0, end=None, err=False):
        """
        Computes the mutation rate (0.0 to 1.0) at each position. Mutation rate is defined as 1.0 minus the maximum character frequency at a position. Errors are estimated using bionomial uncertainty

        Arguments:
            dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
            bin (int): A bin number specifying which counts to use
            start (int): An integer specifying the sequence start position
            end (int): An integer specifying the sequence end position

        Returns:
            freq_df (pd.DataFrame): A dataframe containing results. 
        """

Validate data set. We don't have to look at the details here, since the data sets will be in the right format already.

        # Validate dataset_df
        qc.validate_dataset(dataset_df)

Returns a dataframe, giving the counts of each letter at each position.

        # Compute counts
        counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

Get all count columns.

        # Create columns for profile_freqs table
        ct_cols = [c for c in counts_df.columns if qc.is_col_type(c,'ct_')]

Create new data frame to store mutation rates and initiate with positions.

        # Record positions in new dataframe
        mut_df = counts_df[['pos']].copy()

Look for the base with the highest count at each position and assume it to be the wild-type base. Then divide its count by the total number of counts to get the frequency of the wild-type base. Finally, subtract the wild-type frequency from 1.0, giving the mutation rate. Store the mutation rate in the data frame.

        # Compute mutation rate across counts
        max_ct = counts_df[ct_cols].max(axis=1)
        sum_ct = counts_df[ct_cols].sum(axis=1)
        mut = 1.0 - (max_ct/sum_ct)
        mut_df['mut'] = mut

If wanted, compute error (which we see from the function call is not done).

        # Computation of error rate is optional
        if err:
            mut_err = np.sqrt(mut*(1.0-mut)/sum_ct)
            mut_df['mut_err'] = mut_err

Figure out which alphabet is looked at. (I feel like we do this a lot and it could be done once and given as keyword argument, which is set as default to None, in which case it is looking the alphabet up.)

        # Figure out which alphabet the cts dataframe specifies
        alphabet = ''.join([c.split('_')[1] for c in ct_cols])
        seqtype = qc.alphabet_to_seqtype_dict[alphabet]
        wt_col = qc.seqtype_to_wtcolname_dict[seqtype]

Store wild type base at each position.

        # Compute WT base at each position
        mut_df[wt_col] = 'X'
        for col in ct_cols:
            indices = (counts_df[col]==max_ct).values
            mut_df.loc[indices,wt_col] = col.split('_')[1]

Validate that the data frame has the right format.

        # Validate as counts dataframe
        mut_df = qc.validate_profile_mut(mut_df,fix=True)
        return mut_df

#### `dataset2mutarray` <=

Now we go back to the function `dataset2mutarray`. The next line assembles the wild type sequence by joining the wild type base found at each position.

    wtseq = ''.join(list(mut_df[wtcol]))
  
Use the function we read earlier to translate the wild type sequence to an array of `false` and `true`.

    wtrow = seqs2array([wtseq], seq_type=seqtype).ravel().astype(bool)
    numfeatures = len(wtrow)

In the following section, we set up how many rows are processed at once. It seems that this is preferable to processing the entire table at once, maybe due to smaller but repeated memory allocations (?).
    
    # Process dataframe in chunks
    startrow = 0
    endrow = startrow+chunksize-1
    numrows = dataset_df.shape[0]
    
Then we create a scipy `lil_matrix`, which is a structure used to create sparse matrices incrementally.

    # Fill in mutarray (a lil matrix) chunk by chunk
    mutarray_lil = lil_matrix((numrows,numfeatures),dtype=int)
    matrix_filled = False
    
In the following `while` loop, we compute the mutation array for chunks of the dataframe, until the entire matrix is filled. Since the wildtype is computed each time, it has to be removed in each iteration (this could be done better). 

    while not matrix_filled:

        if startrow >= numrows:
            matrix_filled = True
            continue
        elif endrow >= numrows:
            endrow = numrows-1
            matrix_filled = True

        # Compute seqarray
        seqlist = list(dataset_df[seqcol][startrow:(endrow+1)])
        seqarray = seqs2array(seqlist, seq_type=seqtype)

        # Remove wt entries
        tmp = seqarray.copy()
        tmp[:,wtrow] = 0

        # Store results from this chunk
        mutarray_lil[startrow:(endrow+1),:] = tmp

        # Increment rows
        startrow = endrow+1
        endrow = startrow + chunksize - 1
        
Finally, the matrix is transformed to a compressed sparse row (csr) matrix, which improves the computation time.

    # Convert to csr matrix
    mutarray_csr = mutarray_lil.tocsr()

    # Return vararray as well as binary representation of wt seq
    return mutarray_csr, wtrow
    
So now we know how the dataframe is prepared for maximization of information.

#### `learn_model.main` <=

Next, we initialize the model. The model is set up differently depending on the model we are using. 
    
    if initialize == 'rand':
        if modeltype == 'MAT':
            emat_0 = utils.RandEmat(len(df[seq_col_name][0]),len(seq_dict))
        elif modeltype == 'NBR':
            emat_0 = utils.RandEmat(len(df['seq'][0])-1,len(seq_dict))
    elif initialize == 'LS':
        emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        emat_0_df = main(df.copy(),lm='LS',modeltype=modeltype,alpha=alpha,start=0,end=None,verbose=verbose)
        emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))   
         
        
We are only going to consider the line `emat_0 = utils.RandEmat(len(df[seq_col_name][0]),len(seq_dict))`, since this step initializes the energy matrix for our inference.

#### => `utils.RandEmat(len(df[seq_col_name][0]),len(seq_dict))`

The function initializing the energy matrix seems to be quite simple. We simply draw each parameter from the standard normal distribution.
   
    def RandEmat(L,Ldict):
        '''Make a random, gauge-fixed energy matrix of shape (Ldict, L).

        Arguments:
            L (int): number of sequence positions (columns).
            Ldict (int): number of characters in the alphabet (rows),
                e.g. 4 for DNA.

        Returns:
            emat_0 (np.ndarray): standard-normal draws passed through
                fix_matrix_gauge.
        '''
        # np.random.randn is what the deprecated scipy alias `sp.randn`
        # pointed to; calling NumPy directly avoids the removed alias.
        emat_0 = fix_matrix_gauge(np.random.randn(Ldict,L))
        return emat_0
        
Then, the values in each column are shifted such that the mean of each column is zero. Also, the matrix is rescaled such that the sum of variances of each column is equal to the length of the matrix. Here I wonder why that is important. Does that help with convergence?    
        
    def fix_matrix_gauge(emat):
        """Return a gauge-fixed copy of an energy matrix.

        Each column (columns correspond to positions) is shifted so that its
        mean is zero, and the whole matrix is rescaled so that the sum of the
        column variances equals the number of columns.

        Arguments:
            emat (array-like): 2-dimensional energy matrix. It is not
                modified; integer input is converted to float.

        Returns:
            np.ndarray: float matrix of the same shape in the fixed gauge.
        """
        emat = np.asarray(emat, dtype=float)
        # fix mean (the subtraction builds a new array, so the caller's
        # matrix is left untouched)
        centered = emat - emat.mean(axis=0, keepdims=True)
        # fix sum of variances equal to length of matrix
        svar = np.sum(np.var(centered, axis=0))
        return np.sqrt(centered.shape[1]) * centered / np.sqrt(svar)

#### `learn_model.main` <=

Now we can finally get to the meat of the code, the actual inference! Now here we should see why the whole preparation is done the way it is and how we can learn from this to Stan this baby.

    emat = MaximizeMI_memsaver(
        seq_mat,
        df.copy(),
        emat_0,
        wtrow,
        db=db,
        iteration=iteration,
        burnin=burnin,
        thin=thin,
        runnum=runnum,
        verbose=verbose
        )

#### => `MaximizeMI_memsaver`

Let's look at the function and go through it line by line
    
    def MaximizeMI_memsaver(
            seq_mat,df,emat_0,wtrow,db=None,burnin=1000,iteration=30000,thin=10,
            runnum=0,verbose=False):
        '''Performs MCMC MI maximization in the case where lm = memsaver

        Arguments:
            seq_mat: mutation array of the sequences; its number of rows is
                used as the number of sequences.
            df: dataset dataframe, wrapped as an observed pymc variable.
            emat_0: initial value of the energy matrix.
            wtrow: binary representation of the wild type sequence.
            db: if given, samples are stored in an sqlite database named
                '<db>_<runnum>.sql'.
            burnin: number of leading trace entries discarded before averaging.
            iteration: number of iterations passed to the sampler.
            thin: thinning interval passed to the sampler.
            runnum: run number used in the database file name.
            verbose: if False, the sampler's `sample` method is wrapped
                (presumably to silence its output).

        Returns:
            Mean of the 'emat' trace after discarding the burn-in entries.
        '''    
        # The bare string below is disabled code: an observed pymc stochastic
        # that would have wrapped seq_mat.
        '''
        @pymc.stochastic(observed=True,dtype=sp.sparse.csr_matrix)
        def sequences(value=seq_mat):
            return 0
        '''

First we just count how many sequences we are looking at.

        n_seqs = seq_mat.shape[0]
        
Next, we define a pymc variable, which is stochastic, but observed, meaning that it is fixed (data).
        
        @pymc.stochastic(observed=True,dtype=pd.DataFrame)
        def pymcdf(value=df):
            return 0
            
Next, we define another variable, which returns the log likelihood (i.e., the exponent of the likelihood): the number of sequences times the mutual information. Here we need to take a couple of detours.

        @pymc.stochastic(dtype=float)
        def emat(p=pymcdf,value=emat_0):         
            p['val'] = numerics.eval_modelmatrix_on_mutarray(np.transpose(value),seq_mat,wtrow)                     
            MI = EstimateMutualInfoforMImax.alt4(p.copy())  # New and improved
            return n_seqs*MI


#### =>`numerics.eval_modelmatrix_on_mutarray`

The function starts with a bunch of error checking, to verify that the input data is of the right format.
    
    def eval_modelmatrix_on_mutarray(modelmatrix, mutarray, wtrow):
        """Evaluate a model matrix on sequences encoded as a mutation array.

        Arguments:
            modelmatrix (np.ndarray): 2-dimensional matrix of model parameters.
            mutarray (csr_matrix): sparse array with one row per sequence.
            wtrow (np.ndarray): 1-dimensional binary representation of the
                wild type sequence, with as many entries as modelmatrix.

        Returns:
            vals (np.ndarray): one model prediction per row of mutarray.

        Raises:
            SortSeqError: if an argument has the wrong type or shape.
        """

        # Do error checking
        if not isinstance(modelmatrix,np.ndarray):
            raise SortSeqError('modelmatrix is not a np.ndarray')
        if not isinstance(wtrow,np.ndarray):
            raise SortSeqError('wtrow is not an np.ndarray')
        if not isinstance(mutarray,csr.csr_matrix):
            # The exception name was misspelled here (NameError instead of
            # SortSeqError), and was followed by an unreachable raise that
            # referenced an undefined `modeltype`.
            raise SortSeqError('mutarray is not a sparse csr_matrix')
        if len(wtrow.shape)!=1:
            raise SortSeqError('wtrow is not 1-dimensional')
        if len(modelmatrix.shape)!=2:
            raise SortSeqError('modelmatrix is not 2-dimensional')
        if wtrow.size!=modelmatrix.size:
            raise SortSeqError('wtrow does not match modelmatrix')

Here we flatten the array of the model matrix, and do a scalar product with the wildtype sequence. **This is way better than what I was doing, since scalar product should be much faster than evaluating the matrix the way I did.**

        # Compute constant contribution to model prediciton
        modelmatrix_vec = modelmatrix.ravel()
        const_val = np.dot(wtrow,modelmatrix_vec)

Here we do some reshaping to bring the wild type sequence into the format of the energy matrix.

        # Prepare matrix for scanning mutarray
        tmp_matrix = modelmatrix.copy()
        indices = wtrow.reshape(modelmatrix.shape).astype(bool)
        
Now we evaluate the energy matrix for the wild type and get its energy values. Then the wild type energy at each position is subtracted from the values in the matrix.
        
        wt_matrix_vals = tmp_matrix[indices]
        tmp_matrix -= wt_matrix_vals[:,np.newaxis]
        
Now we can evaluate the matrix for the sequences and compute the final energy values by adding the wild type energies.
        
        modelmatrix_for_mutarray = csr_matrix(np.matrix(tmp_matrix.ravel()).T)

        # Compute values
        mutarray_vals = mutarray*modelmatrix_for_mutarray
        vals = const_val + mutarray_vals.toarray().ravel()
        return vals

#### `MaximizeMI_memsaver` <=

#### =>`EstimateMutualInfoforMImax.alt4`

Now having the energies in hand, we need to perform the KDE.    
    
    def alt4(df, coarse_graining_level = 0.01):
        '''
        MI ESTIMATOR EDITED BY JBK 
        Used when lm=memsaver 
        REQUIRES TESTING AND PROFILING.

        Estimates the mutual information between the model predictions in
        the 'val' column of df and the count columns of df. Rows are ordered
        by 'val', their counts are distributed over a fixed number of groups
        holding equal total counts, the grouped counts are smoothed with a
        Gaussian filter, and the mutual information of the result is returned.

        Arguments:
            df (pd.DataFrame): dataset with count columns and a 'val' column.
            coarse_graining_level (float): if nonzero, 'val' is binned in
                steps of this size (in units of the standard deviation of
                'val') and counts are summed per bin before grouping.

        Returns:
            Mutual information computed from the smoothed grouped counts.
        '''
        
First, we initialize the number of groups (discrete energy bins) over which the KDE is evaluated, and count how many distinct sequences, and therefore observed binding energies, we have. We also count how many different 'batches' there are in the data set, i.e., DNA and cDNA in our case.

        n_groups=500
        n_seqs = len(df.index)
        binheaders = utils.get_column_headers(df)
        n_batches = len(binheaders)
        cts_grouped = sp.zeros([n_groups,n_batches])
        group_num = 0
        frac_empty = 1.0

        #copy dataframe
        tmp_df = df.copy(binheaders+['val'])

Here we can coarse grain the observed binding energies, to reduce the space and speed up computations.

        # Speed computation by coarse-graining model predictions
        if coarse_graining_level:
            assert type(coarse_graining_level)==float
            assert coarse_graining_level > 0
            vals = tmp_df['val'].values
            scale = np.std(vals)
            coarse_vals = np.floor((vals/scale)/coarse_graining_level)
            tmp_df['val'] = coarse_vals
            grouped = tmp_df.groupby('val')
            grouped_tmp_df = grouped.aggregate(np.sum)
            grouped_tmp_df.sort_index(inplace=True)
        else:
            grouped_tmp_df = tmp_df
            grouped_tmp_df.sort_values(by='val',inplace=True)
        # Get ct_xxx columns
        
Extract the columns for counts of DNA and cDNA, and then compute how many counts each group should contain (the total number of counts divided by the number of groups).
        
        ct_df = grouped_tmp_df[binheaders].astype(float)
        cts_per_group = ct_df.sum(axis=0).sum()/n_groups
        # Histogram counts in groups. This is a bit tricky
        group_vec = np.zeros(n_batches)
        
Iterate through every row of the data frame.

        for i,row in ct_df.iterrows():
        
Compute the fraction of DNA and cDNA counts

            row_ct_tot = row.sum()
            row_ct_vec = row.values
            row_frac_vec = row_ct_vec/row_ct_tot 

            while row_ct_tot >= cts_per_group*frac_empty:
                group_vec = group_vec + row_frac_vec*(cts_per_group*frac_empty)
                row_ct_tot -= cts_per_group*frac_empty

                # Only do once per group_num
                cts_grouped[group_num,:] = group_vec.copy() 
                # Reset for new group_num
                group_num += 1
                frac_empty = 1.0
                group_vec[:] = 0.0
            group_vec += row_frac_vec*row_ct_tot

            frac_empty -= row_ct_tot/cts_per_group
        if group_num == n_groups-1:
            cts_grouped[group_num,:] = group_vec.copy()
        elif group_num == n_groups:
            pass
        else:
            raise TypeError(\
                'group_num=%d does not match n_groups=%s'%(group_num,n_groups))
        # Smooth empirical distribution with gaussian KDE
        f_reg = scipy.ndimage.gaussian_filter1d(cts_grouped,0.04*n_groups,axis=0)

At this point we simply calculate the mutual information for the smoothed probability distribution.

        # Return mutual information
        return info.mutualinfo(f_reg)

    if db:
            dbname = db + '_' + str(runnum) + '.sql'
            M = pymc.MCMC([pymcdf,emat],db='sqlite',dbname=dbname)
        else:
            M = pymc.MCMC([pymcdf,emat])
        M.use_step_method(stepper.GaugePreservingStepper,emat)

        if not verbose:
            M.sample = shutthefuckup(M.sample)

        M.sample(iteration,thin=thin)
        emat_mean = np.mean(M.trace('emat')[burnin:],axis=0)
        return emat_mean