In [33]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
import os
# Switch the working directory to the parent folder; presumably this is what
# makes `data_handler` importable below -- TODO confirm.
# NOTE(review): the path is relative, so re-running this cell moves up one
# more directory level each time.
os.chdir('../')

from data_handler import *

import numpy as np
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [3]:
## Training data for datasets 0, 1 and 2
tr0, tr1, tr2 = (load_data(k, 'tr') for k in range(3))

## Test data for the same three datasets
te0, te1, te2 = (load_data(k, 'te') for k in range(3))

In [9]:
def create_vocab(alphabet, substring_length):
    '''
    Enumerate every word of length substring_length that can be written with
    the given alphabet and number them in generation order
    (itertools.product order). There are |alphabet|^substring_length words.

    Input:
        alphabet: iterable of letters (strings) to combine
        substring_length: length of each generated word

    Output:
        vocab2index: dictionary mapping each word to its integer index
        index2vocab: dictionary mapping each integer index back to its word
    '''
    words = map(''.join, itertools.product(alphabet, repeat = substring_length))

    # Number the words first, then invert the numbering.
    index2vocab = dict(enumerate(words))
    vocab2index = {word: position for position, word in index2vocab.items()}

    return vocab2index, index2vocab


def is_neighbour(alpha, beta, mismatch):
    '''
    Tell whether beta lies in the mismatch neighbourhood of alpha, in the
    sense of Leslie et al.
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.86.7384&rep=rep1&type=pdf

    The two words are compared position by position (pairs are formed with
    zip, so only the common prefix length is compared).

    Input:
        alpha: first word
        beta: second word
        mismatch: maximum number of differing positions allowed
    Output
        Boolean: True if at most `mismatch` positions differ, False otherwise
    '''
    differing_positions = sum(1 for left, right in zip(alpha, beta) if left != right)
    return bool(differing_positions <= mismatch)
    
def compute_neighbours(vocab2index, mismatch):
    '''
    Precompute, for every word of the vocabulary, the list of vocabulary words
    that is_neighbour accepts for it. Every ordered pair of words is tested.

    Input:
        vocab2index: vocabulary (its keys are the words)
        mismatch: tolerance of mismatch passed to is_neighbour
    Output:
        Dictionary mapping each word to the list of its neighbours, listed in
        vocabulary iteration order.
    '''
    words = list(vocab2index)

    return {
        centre: [candidate for candidate in words
                 if is_neighbour(centre, candidate, mismatch)]
        for centre in words
    }


def create_mismatch_feature(sequence, substring_length, vocab2index, neighbours):
    '''
    Mismatch kernel feature vector, following Leslie et al.
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.86.7384&rep=rep1&type=pdf

    A window of length substring_length slides over the sequence one position
    at a time; each window adds 1 to the entry of every word listed in
    neighbours[window].

    Input:
        sequence: DNA sequence to process
        substring_length: length of the vocabulary words (window size)
        vocab2index: mapping from vocabulary word to its index in the embedding
        neighbours: mapping from each word to the list of its neighbouring words
    Output:
        Numpy array: integer embedding with one entry per vocabulary word
        (all zeros when the sequence is shorter than substring_length)
    '''
    embedding = np.zeros(len(vocab2index), dtype = 'int')

    # Number of window start positions; range() treats a negative count as empty.
    n_windows = len(sequence) - substring_length + 1
    windows = (sequence[offset:offset + substring_length] for offset in range(n_windows))

    for window in windows:
        for word in neighbours[window]:
            embedding[vocab2index[word]] += 1

    return embedding

In [7]:
# Alphabet: the letters that create_vocab combines into vocabulary words
alphabet = ['A', 'C', 'G', 'T']

In [8]:
# Word (window) length and number of mismatching positions tolerated when
# deciding whether two words are neighbours.
substring_length = 5
mismatch_tol = 3

# Bind the second mapping to a real name: assigning to `_` at notebook top
# level shadows IPython's last-output variable, which then stops updating.
vocab2index, index2vocab = create_vocab(alphabet, substring_length)
neighbours = compute_neighbours(vocab2index, mismatch_tol)


#Example
create_mismatch_feature(tr0['Sequence'][10], substring_length, vocab2index, neighbours)

array([42, 38, 38, ..., 44, 41, 43])

In [54]:
data = tr1
Y = data['Bound']

# Feature matrix: one row per sequence, one column per vocabulary word.
X = np.zeros((len(data), len(vocab2index)))
for idx, seq in enumerate(data['Sequence']):
    X[idx] = create_mismatch_feature(
        seq, substring_length, vocab2index, neighbours
    )

In [56]:
# Logistic Regression: grid over the penalty type and the inverse
# regularisation strength C, scored by 5-fold cross-validation.
for penal in ['l1', 'l2']:
    for regu in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]:
        # The solver is pinned because recent scikit-learn defaults to lbfgs,
        # which rejects penalty='l1'; liblinear supports both penalties.
        model = LogisticRegression(penalty = penal, C = regu, solver = 'liblinear', random_state = 777)
        score = np.mean(cross_val_score(model, X, Y, cv = 5))
        print('Logistic Regression - {0} penalty - C: {1} - Mean Cross Validation Score: {2:.2f}%'.format(penal, regu, 100*score))

Logistic Regression - l1 penalty - C: 0.001 - Mean Cross Validation Score: 63.95%
Logistic Regression - l1 penalty - C: 0.005 - Mean Cross Validation Score: 74.75%
Logistic Regression - l1 penalty - C: 0.01 - Mean Cross Validation Score: 79.70%
Logistic Regression - l1 penalty - C: 0.05 - Mean Cross Validation Score: 80.60%
Logistic Regression - l1 penalty - C: 0.1 - Mean Cross Validation Score: 79.50%
Logistic Regression - l1 penalty - C: 0.5 - Mean Cross Validation Score: 77.70%
Logistic Regression - l1 penalty - C: 1 - Mean Cross Validation Score: 77.70%
Logistic Regression - l2 penalty - C: 0.001 - Mean Cross Validation Score: 81.90%
Logistic Regression - l2 penalty - C: 0.005 - Mean Cross Validation Score: 79.80%
Logistic Regression - l2 penalty - C: 0.01 - Mean Cross Validation Score: 79.05%
Logistic Regression - l2 penalty - C: 0.05 - Mean Cross Validation Score: 77.85%
Logistic Regression - l2 penalty - C: 0.1 - Mean Cross Validation Score: 77.45%
Logistic Regression - l2 penal

In [55]:
# Sanity check: X was allocated as (len(data), len(vocab2index))
X.shape

(2000, 1024)