In [None]:
#@author Shray Alag
#@version 08.05.2020
#@mentor Abhi
#@runtime running the whole notebook takes less than 10 seconds, but remember to input your choice of which motif it
#should get the occurrences of
#Finds the occurrences of a motif (either inputted or found by MEME) and prints out the sequence logos. Displays all
#of the MEME motif sequences (in the sequence logo format) and the MEME motif information (e-value, etc.). Asks the user
#which of those motifs they would like to get the occurrences of and then reads through the fasta file. The user can also
#input a motif different from the one MEME recommends. The sequences before and after the occurrences 
#of the desired motif are then outputted as sequence logo pngs.




#VERY IMPORTANT: You need to install biopython (pip install biopython) and logomaker (pip install logomaker).
#The imports below also require modisco, scikit-learn, pandas, numpy and matplotlib.



#import statements
from modisco.visualization import viz_sequence
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from pandas import DataFrame
import logomaker as logo
from Bio.Seq import Seq
from Bio import SeqIO
import pandas as pd
import numpy as np
%matplotlib inline
import random
import time
import re
import os




#VERY IMPORTANT: Change BASE_DIR (and, if needed, the two file names built from it) to point at the data to use
BASE_DIR = "HT-SELEX/improved_html_SELEX_MEME_Results/ZNF436_GC40NCACTTC_Lysate_BatchAATA_Cycle3_R1/"
#fasta file that readInFasta scans for motif occurrences
fastaFile = BASE_DIR + "ZNF436_GC40NCACTTC_Lysate_BatchAATA_Cycle3_R1.fa"
#MEME text output that readFasta parses the motif matrices from
memeTextFileName = BASE_DIR + "meme_out/meme.txt"


#Shared notebook state, filled in by the cells below
motifList, infoMotifList = [], []      #motif matrices and their matching MEME info strings
firstMotifIndex = -2                   #index into motifList of the motif to search for
secondMotifIndex = -2
firstMotifStr = ""                     #prefix handed to defineKeyMarkersForMotifs
secondMotifStr = ""
bothOccuranceLength = {}               #sequence-from-motif-onwards -> number of times it was seen
doExtraAnalysis = True
global_time = time.time()              #timestamp taken when this cell runs

In [None]:
#This cell defines some useful methods for displaying/visualizing the sequences. Additionally, this cell outputs 
#the main motifs that MEME finds, so that the user can decide which of these motifs should be looked for in the 
#fasta file


def convert_to_ic_scale(pwm):
    """Row-normalise ``pwm`` and pass it to ``viz_sequence.ic_scale``.

    Each row of ``pwm`` is divided by its own sum; rows that sum to zero are
    divided by 1 instead, so they stay all-zero. The scaling itself is done by
    ``viz_sequence.ic_scale`` with the fixed background
    ``[0.27, 0.23, 0.23, 0.27]``.

    Returns whatever ``viz_sequence.ic_scale`` returns.
    """
    row_totals = np.sum(pwm, axis=1, keepdims=True)
    # Replace zero totals by 1 so the division below leaves empty rows at 0
    safe_totals = np.where(row_totals == 0, 1, row_totals)
    background = np.array([0.27, 0.23, 0.23, 0.27])
    return viz_sequence.ic_scale(pwm / safe_totals, background)




def setUp():
    """Parse the MEME output into the notebook's motif lists and draw a logo per motif.

    ``readFasta`` appends to the list it is given and to ``infoMotifList``, so
    both lists are emptied first; otherwise re-running this cell would add a
    second copy of every motif. The lists are cleared in place so that other
    references to them stay valid.

    The original body ended with ``firstMotifIndex, secondMotifIndex = -1, -1``.
    Without a ``global`` declaration that only bound two unused locals, so it
    has been removed.
    """
    motifList.clear()
    infoMotifList.clear()
    readFasta(motifList)
    displayLogos(motifList)
    
    
    
    
    
def setUpParam(fastaFileTemp, memeTextFileNameTemp):
    """Point the notebook at different input files, then run ``setUp``.

    Parameters
    ----------
    fastaFileTemp : path of the fasta file to scan.
    memeTextFileNameTemp : path of the MEME text output to parse.

    The two paths are stored in the module-level ``fastaFile`` and
    ``memeTextFileName``, which are the names the reader functions open.
    Without the ``global`` declaration the assignments only created locals
    and the readers kept using the old paths.
    """
    global fastaFile, memeTextFileName
    fastaFile = fastaFileTemp
    memeTextFileName = memeTextFileNameTemp
    print(fastaFile, memeTextFileName)
    setUp()
    
    
    
    
    
    
def setUpParamWithIndexes(fastaFileTemp, memeTextFileNameTemp, firstMotifIndexTemp, secondMotifIndex):
    """Set the motif indexes and the input files, then run the set-up.

    Parameters
    ----------
    fastaFileTemp, memeTextFileNameTemp : forwarded to ``setUpParam``.
    firstMotifIndexTemp : value stored in the module-level ``firstMotifIndex``.
    secondMotifIndex : value stored in the module-level ``secondMotifIndex``.

    The original body read ``secondMotifIndexTemp``, a name that does not
    exist, so every call raised NameError. It also only bound locals.
    """
    global firstMotifIndex
    firstMotifIndex = firstMotifIndexTemp
    # The parameter shares its name with the module-level variable, so a
    # ``global`` statement is not allowed here; write through globals() instead.
    globals()["secondMotifIndex"] = secondMotifIndex
    setUpParam(fastaFileTemp, memeTextFileNameTemp)
    
    
    
    
#reads through the MEME text output (despite the name, this does not open the fasta file)
def readFasta(motifs):
    """Parse the motif matrices out of the file named by ``memeTextFileName``.

    A matrix starts on the line after one beginning with "letter-probability"
    and ends at the next line beginning with "--------". Every line in between
    is split on whitespace and becomes one row of a DataFrame with the columns
    A, C, G, T (the values are kept as strings).

    Parameters
    ----------
    motifs : list that each parsed DataFrame is appended to (modified in place).

    Side effects
    ------------
    For every "letter-probability matrix:" line, the text from character 38
    onwards is appended to the module-level ``infoMotifList``.

    Returns None.
    """
    np_motif_array = []
    inMotif = False

    # The context manager closes the file even if parsing raises
    with open(memeTextFileName, "r") as memeReader:
        for row in memeReader:
            if inMotif and not row.startswith("------"):
                np_motif_array.append(row.strip().split())
            if row.startswith("letter-probability"):
                inMotif = True
            elif inMotif and row.startswith("--------"):
                motifs.append(pd.DataFrame(np_motif_array, columns=['A', 'C', 'G', 'T']))
                np_motif_array = []
                inMotif = False

            if row.startswith("letter-probability matrix:"):
                infoMotifList.append(row[38:])

    
    
    
    
#shows all of the sequence logos MEME found
def displayLogos(motifs):
    """Draw one sequence logo per motif, labelled with its number and MEME info.

    Parameters
    ----------
    motifs : sequence of DataFrames accepted by ``getSpecialEncoding``.
        ``infoMotifList[i]`` supplies the label text for ``motifs[i]``.

    As before, ``draw()`` is called once on the last logo after the loop. The
    call is now skipped when ``motifs`` is empty; previously that case raised
    NameError because no logo had been created.
    """
    logo_logo = None
    for i, motif in enumerate(motifs):
        logo_logo = logo.Logo(getSpecialEncoding(motif), color_scheme={'A': 'red','C': 'blue', 'G': 'orange', 'T': 'green'},baseline_width=0,
                           show_spines=False,
                           vsep=.005,
                           width=.80)
        # Longer motifs get a smaller font for their label
        logo_logo.ax.text(14, 0, "#" + str(i+1) + ": " + infoMotifList[i], fontsize=40-len(motif))
    if logo_logo is not None:
        logo_logo.draw()
    
    
    
    
    
    
#takes in a 2D Pandas Dataframe
def getSpecialEncoding(motifs):
    """Return a new A/C/G/T DataFrame with ``computationNeg`` applied to every cell.

    ``motifs`` must have the columns 'A', 'C', 'G' and 'T'. The result has one
    row per input row, in the same order, with a fresh default index.
    """
    bases = ['A', 'C', 'G', 'T']
    scaled = [
        tuple(computationNeg(position[base]) for base in bases)
        for _, position in motifs.iterrows()
    ]
    return pd.DataFrame(scaled, columns=bases)
        
    
    
    
    
#scaling algorithm. I need to change all of the logos to use convert_to_ic_scale
def computationNeg(coloumn):
    """Scale one value (anything ``float()`` accepts) for the logo display.

    Values above 0.8, 0.6 or 0.35 are multiplied by 4, 3.5 or 3 respectively
    (first matching threshold wins). Every other value has 0.10 subtracted.
    All results are divided by 1.75.
    """
    value = float(coloumn)
    if value >= 0.15:
        for floor, factor in ((0.8, 4), (0.6, 3.5), (0.35, 3)):
            if value > floor:
                return (value * factor) / 1.75
    # Below 0.15, or between 0.15 and 0.35 inclusive
    return (value - 0.10) / 1.75
    
    
    
    
    
#simpler method to get the data frames (calls roundedVersion). This is not used here
#takes in a 2D Pandas Dataframe
def getOneHot(motifs):
    """Return a new A/C/G/T DataFrame with ``roundedVersion`` applied to every cell.

    ``motifs`` must have the columns 'A', 'C', 'G' and 'T'. The result has one
    row per input row, in the same order, with a fresh default index.
    """
    bases = ['A', 'C', 'G', 'T']
    rounded = [
        tuple(roundedVersion(position[base]) for base in bases)
        for _, position in motifs.iterrows()
    ]
    return pd.DataFrame(rounded, columns=bases)
    
    
    
    
    
#simpler scaling algorithm that is not used
def roundedVersion(coloumn):
    """Return 1 when the value (converted with ``float()``) is above 0.8, otherwise 0."""
    return 1 if float(coloumn) > 0.8 else 0

    
    

def convert_to_one_hot(sequence, count):
    """Encode a DNA string as a count-weighted (len(sequence), 4) array.

    Parameters
    ----------
    sequence : str of A/C/G/T/N characters, in any case.
    count : weight of the sequence; each base's row holds ``float(count)`` in
        that base's column (column order A, C, G, T).

    Returns
    -------
    numpy float array with one row per character. An empty sequence gives an
    array of shape (0, 4) instead of failing.

    Raises
    ------
    KeyError for any character other than a, c, g, t or n.

    'n' spreads the weight evenly over the four bases, i.e. ``count / 4`` per
    column. It used to be a fixed 0.25 regardless of ``count``, which
    under-weighted ambiguous positions when the encoded sequences are summed.
    With ``count == 1`` the result is unchanged.
    """
    weight = float(count)
    spread = weight / 4
    convert_dict = {'a': (weight, 0.0, 0.0, 0.0),
                    'c': (0.0, weight, 0.0, 0.0),
                    'g': (0.0, 0.0, weight, 0.0),
                    't': (0.0, 0.0, 0.0, weight),
                    'n': (spread, spread, spread, spread)}
    sequence = sequence.lower()
    if len(sequence) == 0:
        return np.zeros((0, 4))
    return np.array([convert_dict[s] for s in sequence], dtype=float)





def convert_one_hot_to_seq(one_hot_arr):
    """Turn rows of one-hot values back into a string of A/C/G/T letters.

    Each row is converted to a tuple and looked up, so it must equal exactly
    (1,0,0,0), (0,1,0,0), (0,0,1,0) or (0,0,0,1); any other row raises KeyError.
    """
    letter_for = {(1,0,0,0):'A',(0,1,0,0):'C',(0,0,1,0):'G',(0,0,0,1):'T'}
    return ''.join(letter_for[tuple(one_hot_row)] for one_hot_row in one_hot_arr)




def concatenate(negProbabilities, key, count):
    """Append the ``convert_to_one_hot`` encoding of ``key`` below ``negProbabilities``.

    Returns a new array; the rows of ``negProbabilities`` come first, followed
    by the encoded rows (joined along axis 0).
    """
    encoded_key = convert_to_one_hot(key, count)
    stacked = np.concatenate((negProbabilities, encoded_key), axis=0)
    return stacked


#There are several set-up methods (setUp, setUpParam, setUpParamWithIndexes). Choose the one that matches the 
#parameters you already know.
setUp() #this just reads in the input files and displays the top 10 motifs MEME found.

In [None]:
#This cell asks the user which of the MEME motifs they want to find (out of motifs #1-#10).
#defineKeyMarkersForMotifs then defines the key characteristics of the chosen motif as a search pattern, and 
#readInFasta goes through the fasta file and searches for matches of that pattern.


#The number entered is 1-based; subtracting 1 turns it into an index into motifList.
#NOTE(review): the input is not validated - int() raises ValueError on non-numeric text and the range is not checked.
firstMotifIndex = int(input("Please enter the number (1 to 10) of the first motif you would like to get the co-occurance of: ")) - 1
# secondMotifIndex = int(input("Please enter the number (1 to 10) of the second motif you would like to get the co-occurance of: ")) - 1
# ^no co-occurrence needed in this notebook (only looking for 1 motif, not when 2 motifs occur together)
print("Thanks! Running getting motif for #" + str(firstMotifIndex + 1))    




#defines key characteristics to look for
def defineKeyMarkersForMotifs(firstMotifStr):
    """Build a search pattern from the motif at ``motifList[firstMotifIndex]``.

    For every row (position) of the motif, each base whose value is above 0.8
    contributes its letter; a row with no such base contributes "." (any
    character when the result is used as a regular expression).

    Parameters
    ----------
    firstMotifStr : str that the pattern is appended to (normally "").

    Returns the resulting string, which is also printed.

    Changes from the original:
    * Values are read with ``.iloc`` because integer keys on a label-indexed
      Series (``row[i]``) are deprecated in pandas. The column order
      A, C, G, T is the one ``readFasta`` creates.
    * Letters are appended directly rather than writing digits 1-4 and
      replacing them afterwards. That replacement also rewrote any digits in
      the ``firstMotifStr`` passed in; this version leaves the prefix alone.
    """
    bases = "ACGT"
    for _, row in motifList[firstMotifIndex].iterrows():
        strongBases = [base for position, base in enumerate(bases)
                       if float(row.iloc[position]) > 0.8]
        firstMotifStr = firstMotifStr + ("".join(strongBases) if strongBases else ".")

    print(firstMotifStr)
    return firstMotifStr
        
    
    
    
    
    
#goes through the fasta file and records the occurrences
#here, bothOccuranceLength records the string from the motif occurrence to the end of the kmer sequence
#while posThirteenSequence (ignore the name) records the string from the start of the sequence up to and
#including the motif occurrence.
def readInFasta(firstMotifStr, bothOccuranceLength):
    """Count motif matches in the fasta file named by ``fastaFile``.

    Every record is upper-cased and searched with ``firstMotifStr`` as a
    regular expression. If nothing matches, the reverse complement is searched
    instead and, on a match, replaces the sequence for the steps below. Only
    the first match per record is used.

    Parameters
    ----------
    firstMotifStr : regular-expression pattern to look for.
    bothOccuranceLength : dict mapping "sequence from the match start to the
        end" to a count. It is updated in place, so counts already in it are
        kept and added to.

    Returns
    -------
    (bothOccuranceLength, posThirteenSequence) : two new dicts ordered by
        ascending count. ``posThirteenSequence`` maps "sequence from its start
        to the end of the match" to a count and skips matches at position 0.

    Changes from the original: the fasta file is closed when reading finishes,
    and the end of the match comes from ``match.end()`` rather than
    ``start + len(pattern)``. The two are equal for patterns made only of
    letters and ".", but only ``end()`` is right when the pattern contains
    other regex syntax.
    """
    posThirteenSequence = {}
    counter = 0
    relevantCounter = 0
    print("length of motif is", len(firstMotifStr))

    motifPattern = re.compile(firstMotifStr)

    with open(fastaFile) as fastaHandle:
        for fasta in SeqIO.parse(fastaHandle, 'fasta'):
            counter = counter + 1
            sequence = str(fasta.seq).upper()
            firstFound = motifPattern.search(sequence)

            #checks reverse complement
            if not firstFound:
                sequence = str(Seq(sequence).reverse_complement())
                firstFound = motifPattern.search(sequence)

            if firstFound:
                relevantCounter = relevantCounter + 1
                fromMotif = sequence[firstFound.start():]
                bothOccuranceLength[fromMotif] = bothOccuranceLength.get(fromMotif, 0) + 1

                # A match at position 0 has nothing in front of it, so it is not recorded here
                if firstFound.start() != 0:
                    upToMotif = sequence[:firstFound.end()]
                    posThirteenSequence[upToMotif] = posThirteenSequence.get(upToMotif, 0) + 1

    #sort the sequences by occurrence
    bothOccuranceLength = dict(sorted(bothOccuranceLength.items(), key=lambda item: item[1]))
    posThirteenSequence = dict(sorted(posThirteenSequence.items(), key=lambda item: item[1]))
    print(counter)
    print("number of times a relevant motif was found is", relevantCounter)

    return bothOccuranceLength, posThirteenSequence



# "C.CC.CC.CC..G..G.CC.CC..GATT"
# AATC..GG.GG.C..C..GG.GG.GG.G


#IMPORTANT: IF YOU WOULD LIKE TO CHECK THE OCCURRENCE OF A SPECIFIC INPUTTED STRING, then replace temp1 with that 
#string in bothOccuranceLength, posThirteenSequence = readInFasta(temp1, bothOccuranceLength)
#Input the string like the two examples above ("." stands for any base).

    
temp1 = defineKeyMarkersForMotifs(firstMotifStr)
bothOccuranceLength, posThirteenSequence = readInFasta(temp1, bothOccuranceLength)

In [None]:
#This cell is to plot the results of the previous few cells. 



#IMPORTANT: Some of the following code may seem convoluted, but that is because in many scenarios the lengths of the 
#sequences captured differ (the motif may come at the 1st index or in the middle, so the length of the sequence varies)





#Adds up the count frequencies for each base pair
#The matrix has at least 50 rows (the original fixed size) and grows when a captured sequence is longer than that;
#with the fixed size such a sequence made the padding/addition below fail. The extra top rows stay all-zero for
#shorter data and are removed again at the end of this block, so the result is unchanged in that case.
rows, cols = (50, 4)
rows = max(rows, max((len(key) for key in posThirteenSequence), default=0))
thirteenList = np.zeros((rows, cols))



count = 0
for key in posThirteenSequence:
    one_hot_result = convert_to_one_hot(key, posThirteenSequence.get(key))
    missing_rows = rows - len(one_hot_result)
    if missing_rows > 0:
        #pad on top so that every sequence is aligned at its end (where the motif match ends)
        extra = np.zeros((missing_rows, cols))
        one_hot_result = np.vstack((extra, one_hot_result))
    thirteenList = thirteenList + one_hot_result
    count = count + 1





#normalizes the list (each row sums to 1) and takes out all of the empty rows
thirteenList = normalize(thirteenList, axis=1, norm='l1')
thirteenList = thirteenList[~np.all(thirteenList == 0, axis=1)]







#IMPORTANT: THE FIRST TWO PLOTS USE MY SCALING ALGORITHM (getSpecialEncoding). THE SECOND ONE IS THE REVERSE
#COMPLEMENT OF THE FIRST.
#IMPORTANT: THE LAST TWO PLOTS USE THE INFORMATION CONTENT SCALING ALGORITHM (convert_to_ic_scale). AGAIN, THE LAST
#PLOT (4TH ONE) IS THE REVERSE COMPLEMENT OF THE THIRD.

#column labels and keyword arguments shared by the four logos of this cell
_ACGT = ['A', 'C', 'G', 'T']
_LOGO_STYLE = dict(color_scheme={'A': 'red','C': 'blue', 'G': 'orange', 'T': 'green'},
                   baseline_width=0,
                   show_spines=False,
                   vsep=.005,
                   width=.80)

#plot 1: custom scaling
logo_logo = logo.Logo(getSpecialEncoding(pd.DataFrame(np.asarray(thirteenList), columns=_ACGT)), **_LOGO_STYLE)


#np.flip without an axis reverses the rows and the columns; with the columns ordered A, C, G, T that is the
#reverse complement
thirteenListFlipped = np.flip(thirteenList)
#NOTE(review): absolute, machine-specific output path - adjust before running elsewhere
np.save("/home/shray/ZNF436_numpy.npy", thirteenListFlipped)
#plot 2: custom scaling, reverse complement
logo_logo = logo.Logo(getSpecialEncoding(pd.DataFrame(np.asarray(thirteenListFlipped), columns=_ACGT)), **_LOGO_STYLE)





##Logos below use the convert to ic scale
#plot 3: information content scaling
logo_logo = logo.Logo(pd.DataFrame(convert_to_ic_scale(np.asarray(thirteenList)), columns=_ACGT), **_LOGO_STYLE)


#plot 4: information content scaling, reverse complement
logo_logo = logo.Logo(pd.DataFrame(convert_to_ic_scale(np.asarray(thirteenListFlipped)), columns=_ACGT), **_LOGO_STYLE)

In [None]:
#This cell plots the motif found by TF-MoDISco, loaded from znf436.npy. Both complements are plotted below using the
#information content scale

#column labels and keyword arguments shared by both logos of this cell
_ACGT = ['A', 'C', 'G', 'T']
_LOGO_STYLE = dict(color_scheme={'A': 'red','C': 'blue', 'G': 'orange', 'T': 'green'},
                   baseline_width=0,
                   show_spines=False,
                   vsep=.005,
                   width=.80)

modisco_motif = np.load("znf436.npy")
logo_logo = logo.Logo(pd.DataFrame(convert_to_ic_scale(np.asarray(modisco_motif)), columns=_ACGT), **_LOGO_STYLE)


#np.flip without an axis reverses the rows and the columns (reverse complement for A, C, G, T columns)
modisco_motif_flipped = np.flip(modisco_motif)
logo_logo = logo.Logo(pd.DataFrame(convert_to_ic_scale(np.asarray(modisco_motif_flipped)), columns=_ACGT), **_LOGO_STYLE)

In [None]:
#Just to get the probabilities: prints the row-normalized matrix built from posThirteenSequence
print(thirteenList)

In [None]:
#This cell is also to plot but this one plots the results of bothOccuranceLength. This is very similar to the 
#one above.
#The matrix has at least 40 rows (the original fixed size) and grows when a captured sequence is longer than that;
#with the fixed size such a sequence made the padding/addition below fail. Results for data that fits in 40 rows
#are unchanged.


rows, cols = (40, 4)
rows = max(rows, max((len(key) for key in bothOccuranceLength), default=0))
negProbabilitiesFullList = np.zeros((rows, cols))




count = 0
for key in bothOccuranceLength:
    one_hot_result = convert_to_one_hot(key, bothOccuranceLength.get(key))
    missing_rows = rows - len(one_hot_result)
    if missing_rows > 0:
        #pad at the bottom so that every sequence is aligned at its start (where the motif match begins)
        extra = np.zeros((missing_rows, cols))
        one_hot_result = np.vstack((one_hot_result, extra))
    negProbabilitiesFullList = negProbabilitiesFullList + one_hot_result
    count = count + 1

    


#IMPORTANT: THE FIRST TWO PLOTS USE MY SCALING ALGORITHM (getSpecialEncoding). THE SECOND ONE IS THE REVERSE
#COMPLEMENT OF THE FIRST.
#IMPORTANT: THE LAST TWO PLOTS USE THE INFORMATION CONTENT SCALING ALGORITHM (convert_to_ic_scale). AGAIN, THE LAST
#PLOT (4TH ONE) IS THE REVERSE COMPLEMENT OF THE THIRD.

#column labels and keyword arguments shared by the four logos of this cell
_ACGT = ['A', 'C', 'G', 'T']
_LOGO_STYLE = dict(color_scheme={'A': 'red','C': 'blue', 'G': 'orange', 'T': 'green'},
                   baseline_width=0,
                   show_spines=False,
                   vsep=.005,
                   width=.80)

#each row is scaled to sum to 1 (all-zero rows stay zero)
negProbabilitiesFullList = normalize(negProbabilitiesFullList, axis=1, norm='l1')
#plot 1: custom scaling
logo_logo = logo.Logo(getSpecialEncoding(pd.DataFrame(np.asarray(negProbabilitiesFullList), columns=_ACGT)), **_LOGO_STYLE)



#np.flip without an axis reverses the rows and the columns (reverse complement for A, C, G, T columns)
negProbabilitiesFullListReverse = np.flip(negProbabilitiesFullList)



#plot 2: custom scaling, reverse complement
logo_logo = logo.Logo(getSpecialEncoding(pd.DataFrame(np.asarray(negProbabilitiesFullListReverse), columns=_ACGT)), **_LOGO_STYLE)


##################################################################



##Logos below use the convert to ic scale
#plot 3: information content scaling
logo_logo = logo.Logo(pd.DataFrame(convert_to_ic_scale(np.asarray(negProbabilitiesFullList)), columns=_ACGT), **_LOGO_STYLE)


#plot 4: information content scaling, reverse complement
logo_logo = logo.Logo(pd.DataFrame(convert_to_ic_scale(np.asarray(negProbabilitiesFullListReverse)), columns=_ACGT), **_LOGO_STYLE)

In [None]:
#print probabilities: the row-normalized matrix built from bothOccuranceLength

print(negProbabilitiesFullList)