In [None]:
%reset

In [None]:
import os
os.chdir('../')

from data_handler import *
from kernel_methods import *
from metrics import *

import numpy as np
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Letters of the DNA alphabet; create_vocab builds the word vocabulary from them
alphabet = list('ACGT')

In [None]:
## Training data for sets 0, 1 and 2 (loaded in that order)
tr0, tr1, tr2 = (load_data(set_idx, 'tr') for set_idx in range(3))

## Test data for the same three sets
te0, te1, te2 = (load_data(set_idx, 'te') for set_idx in range(3))

# Create the mismatch kernel functions

In [None]:
def create_vocab(alphabet, substring_length):
    '''
    Enumerate every word of length substring_length over the alphabet and give
    each one an integer index. The vocabulary holds |alphabet|^substring_length
    words, indexed in the order produced by itertools.product.

    Input:
        alphabet: letters available in the alphabet
        substring_length: length of the words

    Output:
        vocab2index: dictionary mapping each word of the vocabulary to its index (integer)
        index2vocab: dictionary mapping each index back to its word
    '''
    words = map(''.join, itertools.product(alphabet, repeat = substring_length))

    # Number the words first, then invert the mapping.
    index2vocab = dict(enumerate(words))
    vocab2index = {word: idx for idx, word in index2vocab.items()}

    return vocab2index, index2vocab


def is_neighbour(alpha, beta, mismatch):
    '''
    Tell whether word beta lies in the mismatch-neighbourhood of word alpha,
    as defined by Leslie et al.
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.86.7384&rep=rep1&type=pdf

    Input:
        alpha: first word
        beta: second word
        mismatch: tolerance of mismatch (maximum number of differing positions)
    Output:
        Boolean: True if beta is in the mismatch-neighbourhood of alpha
    '''
    # Count the positions at which the two words differ; zip stops at the
    # end of the shorter word.
    differences = 0
    for left, right in zip(alpha, beta):
        if left != right:
            differences += 1

    return bool(differences <= mismatch)
    
def compute_neighbours(vocab2index, mismatch):
    '''
    Pre-compute, for every word of the vocabulary, the list of vocabulary
    words that is_neighbour accepts for the given tolerance. Every pair of
    words is compared, so the cost is quadratic in the vocabulary size.

    Input:
        vocab2index: vocabulary (only its keys are used)
        mismatch: tolerance of mismatch
    Output:
        Dictionary mapping each word of the vocabulary to the list of its neighbours.
    '''
    words = list(vocab2index)

    return {
        word: [other for other in words if is_neighbour(word, other, mismatch)]
        for word in words
    }


def create_mismatch_feature(sequence, substring_length, vocab2index, neighbours, normalize = False):
    '''
    Mismatch kernel feature as described by Leslie et al.
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.86.7384&rep=rep1&type=pdf

    Every substring of length substring_length in the sequence adds one count
    to the coordinate of each of its neighbours.

    Input:
        sequence: DNA sequence to process
        substring_length: length of the vocabulary words
        vocab2index: mapping of each vocabulary word to its index
        neighbours: list of neighbours for each word of the vocabulary
        normalize: if True, scale the embedding to unit Euclidean norm
    Output:
        Numpy array: sequence embedding (integer counts, or floats when normalize is True)

    A substring that is not a key of neighbours (for instance one containing a
    letter outside the alphabet) makes the lookup fail.
    '''
    embedding = np.zeros(len(vocab2index), dtype = 'int')

    for start in range(len(sequence) - substring_length + 1):
        end = start + substring_length
        substring = sequence[start:end]
        for neighbour in neighbours[substring]:
            embedding[vocab2index[neighbour]] += 1

    if normalize:
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding = embedding / norm
        else:
            # A sequence shorter than substring_length gives an all-zero
            # embedding; dividing by its zero norm would fill it with NaNs.
            embedding = embedding.astype(float)

    return embedding


def mismatch_kernel(sequenceA, sequenceB, substring_length, vocab2index, neighbours, normalize = False):
    '''
    Mismatch kernel. Optional normalization as described in Leslie et al.

    Input:
        sequenceA: first sequence
        sequenceB: second sequence
        substring_length: length of the vocabulary words
        vocab2index: mapping of each vocabulary word to its index
        neighbours: list of neighbours for each word of the vocabulary
        normalize: forwarded to create_mismatch_feature for both sequences
    Output:
        Dot product of the mismatch embeddings of the two sequences.
    '''
    # The embeddings were previously stored under misspelled names
    # ("embedingA"/"embedingB"), so the dot product raised a NameError.
    embeddingA = create_mismatch_feature(sequenceA, substring_length, vocab2index, neighbours, normalize)
    embeddingB = create_mismatch_feature(sequenceB, substring_length, vocab2index, neighbours, normalize)

    return np.dot(embeddingA, embeddingB)

# Test these functions

In [None]:
# DNA alphabet
alphabet = list('ACGT')

# Word length and mismatch tolerance used by the kernel cell below
substring_length, mismatch_tol = 3, 1

# Build the vocabulary once and pre-compute the neighbours of each word
vocab2index, _ = create_vocab(alphabet, substring_length)
neighbours = compute_neighbours(vocab2index, mismatch_tol)


# Example
# create_mismatch_feature(tr0['Sequence'][10], substring_length, vocab2index, neighbours)

# Kernel

In [None]:
# lbda is the single argument given to kernelSVM
# (presumably the regularization strength; confirm in kernel_methods).
lbda = 0.005
kSVM = kernelSVM(lbda)

data = tr0

# Train on the selected set with the normalized mismatch kernel.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; both return the underlying numpy array.
kSVM.train(data['Sequence'].values,
           data['Bound'].values,
           kernel_fct = lambda seq_A, seq_B: mismatch_kernel(seq_A, seq_B, substring_length, vocab2index, neighbours, normalize = True))

#Test
# te0 = load_data(0, 'te')
# predictions = kSVM.predict(te0['Sequence'], stringsData = False)

# Grid Search With Mismatch Kernel Features + Logistic Regression

In [None]:
def gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize):
    '''
    Grid search over mismatch-kernel features fed to a logistic regression.

    For every (substring length, mismatch tolerance) pair, the sequences are
    embedded with create_mismatch_feature. Every (penalty, C) pair is then
    scored with the mean of sklearn's cross_val_score over 5 folds. Progress
    and each improvement of the best score are printed.

    Relies on the module-level `alphabet` to build the vocabulary.

    Input:
        data: table with a 'Sequence' column and a 'Bound' column (the labels)
        sub_lengths: substring lengths to try
        tols: mismatch tolerances to try
        regularizations: values of the C parameter of LogisticRegression to try
        penalties: penalties to try (e.g. 'l1', 'l2')
        normalize: forwarded to create_mismatch_feature
    Output:
        max_score: best mean cross-validation score found (0 if no configuration scored above 0)
        max_model: the LogisticRegression with the best settings (None if no
                   configuration scored above 0). cross_val_score fits clones,
                   so this estimator is not fitted.
        max_params: dictionary with the substring length, mismatch tolerance,
                    penalty and C of the best configuration (None if none)
    '''
    max_score = 0
    max_model = None
    max_params = None

    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    Y = data['Bound'].values

    for substring_length in sub_lengths:
        vocab2index, _ = create_vocab(alphabet, substring_length)
        for mismatch_tol in tols:
            print('--Normalize: {0} - Substring Length: {1} - Mismatch Tolerance: {2}'.format(normalize, substring_length, mismatch_tol))
            neighbours = compute_neighbours(vocab2index, mismatch_tol)

            # One row per sequence, one column per vocabulary word.
            X = np.zeros((len(data), len(vocab2index)))
            for idx, seq in enumerate(data['Sequence']):
                X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours, normalize)

            for penal in penalties:
                for regu in regularizations:
                    # liblinear supports both 'l1' and 'l2'; the default solver
                    # of recent scikit-learn releases (lbfgs) rejects 'l1'.
                    clf = LogisticRegression(C = regu, penalty = penal, solver = 'liblinear')
                    # SIMON: this part is buggy... because of sign_label, I think...
#                     score = kfold(data = X, 
#                                   labels = Y, 
#                                   n_folds = 5,
#                                   train_method = clf.fit, 
#                                   pred_method = clf.predict,
#                                   metric = m_binary,
#                                   verbose = True)
                    # So the sklearn cross-validation is used instead.
                    score = np.mean(cross_val_score(clf, X, Y, cv = 5))

                    if score > max_score:
                        max_score = score
                        max_model = clf
                        max_params = {'substring_length': substring_length,
                                      'mismatch_tol': mismatch_tol,
                                      'penalty': penal,
                                      'C': regu}
                        print('----Increase in Score. Penal: {0} - Regu: {1}. Mean Val Score: {2:.2f}'.format(penal, regu, 100*score))

    # The search result used to be discarded; returning it lets callers reuse
    # the best configuration. Callers that ignore the return value are unaffected.
    return max_score, max_model, max_params

**Set 0**

In [None]:
# Grid search on set 0: first with normalized features, then with raw counts
data = tr0

sub_lengths = list(range(3, 7))
tols = [1, 2]
regularizations = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
penalties = ['l1', 'l2']

gd_log_reg(data = data, sub_lengths = sub_lengths, tols = tols,
           regularizations = regularizations, penalties = penalties, normalize = True)
print('\n')
gd_log_reg(data = data, sub_lengths = sub_lengths, tols = tols,
           regularizations = regularizations, penalties = penalties, normalize = False)

print('\nFinished !')

**Set 1**

In [None]:
# Grid search on set 1: first with normalized features, then with raw counts
data = tr1

sub_lengths = list(range(3, 7))
tols = [1, 2]
regularizations = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
penalties = ['l1', 'l2']

gd_log_reg(data = data, sub_lengths = sub_lengths, tols = tols,
           regularizations = regularizations, penalties = penalties, normalize = True)
gd_log_reg(data = data, sub_lengths = sub_lengths, tols = tols,
           regularizations = regularizations, penalties = penalties, normalize = False)

print('\nFinished !')

**Set 2**

In [None]:
# Grid search on set 2: first with normalized features, then with raw counts
data = tr2

sub_lengths = list(range(3, 7))
tols = [1, 2]
regularizations = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
penalties = ['l1', 'l2']

gd_log_reg(data = data, sub_lengths = sub_lengths, tols = tols,
           regularizations = regularizations, penalties = penalties, normalize = True)
gd_log_reg(data = data, sub_lengths = sub_lengths, tols = tols,
           regularizations = regularizations, penalties = penalties, normalize = False)

print('\nFinished !')