In [81]:
%reset

In [2]:
import os
os.chdir('../')

from data_handler import *

import numpy as np
from textwrap import wrap

%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
def learn_probs(data_train, substring_length):
    '''Learn the probabilities by counting 
    the number of time a substring was implied in a binding sequence'''

    counts = {}
    for _, row in data_train.iterrows():
        sequence = row['Sequence']
        binding = row['Bound']
        substrings = wrap(sequence, substring_length)
        for substr in substrings:
            if substr in counts.keys():
                counts[substr]['count'] = counts[substr]['count'] + 1
                counts[substr]['binding'] = counts[substr]['binding'] + int(binding)
            else:
                counts[substr] = {'count': 1, 'binding': int(binding)}

    probs = {}
    for substr in counts.keys():
        probs[substr] = counts[substr]['binding']/counts[substr]['count']
        
    return probs

In [4]:
# for sequence in data_test['Sequence']:
#     for t in wrap(sequence, substring_length):
#         if t not in probs.keys(): print(t)

In [20]:
def compute_binding_prob(sequence, prob_dict, substring_length):
    proba = 1
    for substring in wrap(sequence, substring_length):
        if substring in prob_dict.keys():
            proba *= prob_dict[substring]
        else:
            proba *= 0.5
    return 1 - proba

In [21]:
## Loading training data
tr0 = load_data(0, 'tr')
tr1 = load_data(1, 'tr')
tr2 = load_data(2, 'tr')

## Loading test data
te0 = load_data(0, 'te')
te1 = load_data(1, 'te')
te2 = load_data(2, 'te')

**Set 0**

In [22]:
for substring_length in [3,4,5,6]:

    probs = learn_probs(tr0, substring_length)

    sequence_prob = []
    for _, row in tr0.iterrows():
        seq = row['Sequence']
        binding = row['Bound']
        sequence_prob.append(compute_binding_prob(seq, probs, substring_length))

    predictions = np.array(sequence_prob)>np.mean(sequence_prob)
    acc = 100*np.mean(predictions == tr0['Bound'])
    print('Substring Length: {} - Accuracy: {:.2f}%'.format(substring_length, acc))

Substring Length: 3 - Accuracy: 35.60%
Substring Length: 4 - Accuracy: 30.20%
Substring Length: 5 - Accuracy: 22.85%
Substring Length: 6 - Accuracy: 27.50%


In [24]:
for substring_length in [3,4,5,6]:

    probs = learn_probs(tr1, substring_length)

    sequence_prob = []
    for _, row in tr1.iterrows():
        seq = row['Sequence']
        binding = row['Bound']
        sequence_prob.append(compute_binding_prob(seq, probs, substring_length))

    predictions = np.array(sequence_prob)>np.mean(sequence_prob)
    acc = 100*np.mean(predictions == tr1['Bound'])
    print('Substring Length: {} - Accuracy: {:.2f}%'.format(substring_length, acc))

Substring Length: 3 - Accuracy: 39.60%
Substring Length: 4 - Accuracy: 36.65%
Substring Length: 5 - Accuracy: 33.00%
Substring Length: 6 - Accuracy: 31.10%


In [25]:
for substring_length in [3,4,5,6]:

    probs = learn_probs(tr2, substring_length)

    sequence_prob = []
    for _, row in tr2.iterrows():
        seq = row['Sequence']
        binding = row['Bound']
        sequence_prob.append(compute_binding_prob(seq, probs, substring_length))

    predictions = np.array(sequence_prob)>np.mean(sequence_prob)
    acc = 100*np.mean(predictions == tr2['Bound'])
    print('Substring Length: {} - Accuracy: {:.2f}%'.format(substring_length, acc))

Substring Length: 3 - Accuracy: 39.65%
Substring Length: 4 - Accuracy: 34.45%
Substring Length: 5 - Accuracy: 27.20%
Substring Length: 6 - Accuracy: 27.10%
