In [2]:
#requirements
import pandas as pd

#for tokenizer
import morfessor
import math

# Repression Prediction via LLM
The linear regression didn't perform very well. Let's try this again, but with an LLM trained on the sequence data. To get peak performance, we need to build a tokenizer specifically for this model. The tokenizer should be able to identify key motifs in the sequence. 

In [3]:
# Train a Morfessor model that is later used to split sequences into "word" tokens.
# MorfessorIO is used here only to parse the training corpus file.
io = morfessor.MorfessorIO()
# Wrapped in list() so the whole corpus is read into memory in this cell.
train_data = list(io.read_corpus_file('data_owen/morphemes.txt'))
# Untrained baseline model; it is populated and trained further down in this cell.
model = morfessor.BaselineModel()

def sigmoid(x):
    """Scaled, shifted logistic curve: 5 / (1 + e^-(x - 3.188)).

    The value rises monotonically from 0 towards an upper asymptote of 5 and
    equals 2.5 at the midpoint x = 3.188. In this notebook it is handed to
    Morfessor as the `count_modifier` when the corpus is loaded.
    """
    offset_from_midpoint = x - 3.188
    decay = math.e ** (-offset_from_midpoint)
    return 5 / (1 + decay)

# Load the corpus into the model, with `sigmoid` supplied as the count modifier
# (presumably applied to each compound's corpus count -- confirm against the
# Morfessor documentation).
model.load_data(train_data, count_modifier=sigmoid)
# Batch-train on the loaded data. As the last expression of the cell, the
# return value is what the notebook displays.
# NOTE(review): the recorded output is (2, 0.0); a second element of exactly
# 0.0 looks suspicious -- confirm the corpus was non-empty and training ran.
model.train_batch()

.
.


(2, 0.0)

In [4]:
def tokenize_by_word(seq, model):
    """Segment `seq` with a trained model and return the pieces space-separated.

    `model.viterbi_segment` is called with `addcount=1` and `maxlen=7`; the
    first element of its result (the list of segments) is joined with single
    spaces into one string.
    """
    segmentation = model.viterbi_segment(seq, addcount=1, maxlen=7)
    pieces = segmentation[0]
    return " ".join(pieces)

In [10]:
def tokenize_by_sliding_window(seq, rbp_dict, data_type):
    """Keep the 7-mers of `seq` whose score exceeds a per-dataset threshold.

    Every length-7 window of `seq` is looked up in `rbp_dict`. Windows that
    are present and whose score is strictly greater than the dataset's
    threshold are emitted in sequence order (overlapping windows and repeated
    7-mers are all kept).

    Parameters
    ----------
    seq : str
        Sequence to scan.
    rbp_dict : dict
        Maps a 7-mer to a dict holding its score under the dataset's column
        name ('Mean_RBP' for 'dmitri', 'ENCFF266PHU' for 'owen').
    data_type : str
        Either 'dmitri' (threshold 3.32) or 'owen' (threshold 0.954).

    Returns
    -------
    str
        The kept 7-mers, each followed by one space (so a non-empty result
        ends with a trailing space); "" when nothing passes or `seq` is
        shorter than 7 characters.

    Raises
    ------
    ValueError
        If `data_type` is not one of the known datasets. Previously this
        fell through with no score column or threshold and failed later
        with an unrelated KeyError/TypeError, or silently returned "".
    """
    window = 7
    # (score column, threshold) for each supported dataset.
    settings = {
        'dmitri': ('Mean_RBP', 3.32),
        'owen': ('ENCFF266PHU', 0.954),
    }
    if data_type not in settings:
        raise ValueError(
            "unknown data_type %r; expected 'dmitri' or 'owen'" % (data_type,)
        )
    key, threshold = settings[data_type]

    kept = []
    for start in range(len(seq) - window + 1):
        subseq = seq[start:start + window]
        if subseq in rbp_dict and rbp_dict[subseq][key] > threshold:
            kept.append(subseq)
    # Each token keeps its trailing space so the output matches the format
    # already written to the token files.
    return "".join(subseq + " " for subseq in kept)

In [22]:
#format data by 'morfessor word tokens'
def format_data_word_tokens(model):
    """Build the Morfessor-tokenised dataset and write it to disk.

    Reads "data/Rabani_expression_A+.csv" (columns 'id', '1h', '6h') and
    "data/Rabani_oligos.csv" (columns 'id', 'Sequence'). For every expression
    row whose id has a sequence, the sequence is tokenised with
    `tokenize_by_word` and paired with (6h - 1h) / 2.

    Outputs:
      * "data/token_data_word_token.txt" -- one token string per line.
      * "data/seq_er_word_token.csv" -- columns 'Token' and 'Expression Rate'.

    Parameters
    ----------
    model
        Trained segmentation model, passed through to `tokenize_by_word`.

    Progress is printed as dots, with a line break every 400 rows.
    """
    exp_data = pd.read_csv("data/Rabani_expression_A+.csv")
    seq_data = pd.read_csv("data/Rabani_oligos.csv")

    # Index the sequences by id once instead of re-scanning seq_data for every
    # expression row. setdefault keeps the first sequence listed for an id,
    # and rows with a missing id are dropped so a missing id never matches.
    seq_by_id = {}
    known = seq_data.dropna(subset=['id'])
    for oligo_id, oligo_seq in zip(known['id'], known['Sequence']):
        seq_by_id.setdefault(oligo_id, oligo_seq)

    #generate ER
    er_list = []
    token_list = []
    rows = zip(exp_data['id'], exp_data['1h'], exp_data['6h'])
    for count, (seq_id, level_1h, level_6h) in enumerate(rows):
        # Skip expression rows that have no sequence.
        if seq_id not in seq_by_id:
            continue
        er_list.append((level_6h - level_1h) / 2)
        token_list.append(tokenize_by_word(seq_by_id[seq_id], model))
        # Progress indicator; count is the row position, matched or not.
        if count % 400 == 0:
            print("")
        elif count % 5 == 0:
            print(".", end ="")

    #print token_list to text file for tokenizer training
    with open("data/token_data_word_token.txt", "w") as txt_file:
        txt_file.writelines(line + "\n" for line in token_list)
    df = pd.DataFrame({'Token':token_list,'Expression Rate':er_list})
    df.to_csv("data/seq_er_word_token.csv",index=False)

In [13]:
#format data by 'sliding window with mean rbp threshold'
def format_data_slideing_window(data_type):
    """Build the sliding-window-tokenised dataset and write it to disk.

    Reads "data/Rabani_expression_A+.csv" (columns 'id', '1h', '6h'),
    "data/Rabani_oligos.csv" (columns 'id', 'Sequence') and the score table
    for `data_type`. For every expression row whose id has a sequence, the
    sequence is tokenised with `tokenize_by_sliding_window`. Rows that yield
    at least one token are kept and paired with (6h - 1h) / 2; rows that
    yield none are dropped and reported with a "*".

    Outputs (with <data_type> substituted):
      * "data/token_data_sliding_window_<data_type>.txt" -- one token string
        per line.
      * "data/seq_er_sliding_window_<data_type>.csv" -- columns 'Token' and
        'Expression Rate'.

    Parameters
    ----------
    data_type : str
        'dmitri' (scores from the 'Mean_RBP' column, keyed by 'Motif') or
        'owen' (scores from 'ENCFF266PHU', keyed by 'Sequence').

    Raises
    ------
    ValueError
        If `data_type` is not recognised. This is checked before any file is
        read or written; previously the score table stayed None and the
        failure surfaced inside the tokenizer.

    Progress is printed as dots, with a line break every 5000 rows.
    """
    # (score file, 7-mer column, score column) for each supported dataset.
    sources = {
        'dmitri': ("data_dmitri/ZF_7N_PCBP2_vs_Adar-1.csv", 'Motif', 'Mean_RBP'),
        'owen': ("data_owen/Bind_n_seq_comparison/Enrichments.csv", 'Sequence', 'ENCFF266PHU'),
    }
    if data_type not in sources:
        raise ValueError(
            "unknown data_type %r; expected 'dmitri' or 'owen'" % (data_type,)
        )
    rbp_path, motif_col, score_col = sources[data_type]

    exp_data = pd.read_csv("data/Rabani_expression_A+.csv")
    seq_data = pd.read_csv("data/Rabani_oligos.csv")
    # {7-mer: {score_col: value}}, the shape tokenize_by_sliding_window expects.
    rbp_dict = pd.read_csv(rbp_path)[[motif_col, score_col]].set_index(motif_col).T.to_dict()

    # Index the sequences by id once instead of re-scanning seq_data for every
    # expression row. setdefault keeps the first sequence listed for an id,
    # and rows with a missing id are dropped so a missing id never matches.
    seq_by_id = {}
    known = seq_data.dropna(subset=['id'])
    for oligo_id, oligo_seq in zip(known['id'], known['Sequence']):
        seq_by_id.setdefault(oligo_id, oligo_seq)

    #generate ER
    er_list = []
    token_list = []
    rows = zip(exp_data['id'], exp_data['1h'], exp_data['6h'])
    for count, (seq_id, level_1h, level_6h) in enumerate(rows):
        # Skip expression rows that have no sequence.
        if seq_id not in seq_by_id:
            continue
        tokens = tokenize_by_sliding_window(seq_by_id[seq_id], rbp_dict, data_type)
        if len(tokens) > 0:
            er_list.append((level_6h - level_1h) / 2)
            token_list.append(tokens)
        else:
            # No 7-mer passed the threshold: the row is left out of the output.
            print("*", end ="")
        # Progress indicator; count is the row position, matched or not.
        if count % 5000 == 0:
            print("")
        elif count % 200 == 0:
            print(".", end ="")

    #print token_list to text file for tokenizer training
    with open("data/token_data_sliding_window_"+data_type+".txt", "w") as txt_file:
        txt_file.writelines(line + "\n" for line in token_list)
    df = pd.DataFrame({'Token':token_list,'Expression Rate':er_list})
    df.to_csv("data/seq_er_sliding_window_"+data_type+".csv",index=False)

In [14]:
format_data_slideing_window('owen')

.*.*......................
........................
........................
........................
....*....................
........................
...................*.*....
...........*.*..........*..
........................
*........................
........................
........................
...............*.........
........................
........................
........................
........................
...................*.....
