In [33]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [7]:
import os
# Move the working directory up one level before the project-local imports below
# (presumably so that data_handler / kernel_methods / metrics resolve -- confirm).
# NOTE: the path is relative, so re-running this cell moves up one more level each time.
os.chdir('../')

from data_handler import *
from kernel_methods import *
from metrics import *

import numpy as np
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
## Training data for datasets 0, 1 and 2
tr0, tr1, tr2 = [load_data(dataset_id, 'tr') for dataset_id in range(3)]

## Test data for the same three datasets
te0, te1, te2 = [load_data(dataset_id, 'te') for dataset_id in range(3)]

# Create the mismatch kernel functions

In [3]:
def create_vocab(alphabet, substring_length):
    '''
    Enumerate every word of length substring_length that can be written with
    the alphabet, and index them. The vocabulary holds
    |alphabet|^substring_length words, in itertools.product order.

    Input:
        alphabet: letters available in the alphabet
        substring_length: length of each word

    Output:
        vocab2index: dictionary mapping each word to its integer index
        index2vocab: dictionary mapping each integer index back to its word
    '''
    words = map(''.join, itertools.product(alphabet, repeat=substring_length))

    # Number the words first, then invert that mapping for the reverse lookup
    index2vocab = dict(enumerate(words))
    vocab2index = {word: position for position, word in index2vocab.items()}

    return vocab2index, index2vocab


def is_neighbour(alpha, beta, mismatch):
    '''
    Tell whether word beta lies in the mismatch neighbourhood of word alpha,
    as defined by Leslie et al.
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.86.7384&rep=rep1&type=pdf

    Input:
        alpha: first word
        beta: second word
        mismatch: maximum number of differing positions tolerated
    Output:
        Boolean: True if the words differ in at most `mismatch` positions
    '''
    # Positions are compared pairwise; zip stops at the shorter word
    differing = 0
    for letter_a, letter_b in zip(alpha, beta):
        if letter_a != letter_b:
            differing += 1
    return bool(differing <= mismatch)
    
def compute_neighbours(vocab2index, mismatch):
    '''
    Precompute, for every word of the vocabulary, the list of vocabulary words
    that fall within its mismatch neighbourhood (every pair of words is compared).

    Input:
        vocab2index: vocabulary (its keys are the words)
        mismatch: tolerance of mismatch
    Output:
        Dictionary mapping each word to the list of its neighbours.
    '''
    words = list(vocab2index)

    return {
        centre: [candidate for candidate in words
                 if is_neighbour(centre, candidate, mismatch)]
        for centre in words
    }


def create_mismatch_feature(sequence, substring_length, vocab2index, neighbours):
    '''
    Mismatch kernel feature map as described by Leslie et al.
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.86.7384&rep=rep1&type=pdf

    Input:
        sequence: DNA sequence to process
        substring_length: length of the vocabulary words
        vocab2index: mapping from each vocabulary word to its index
        neighbours: mapping from each vocabulary word to the list of its neighbours
    Output:
        Numpy integer array with one count per vocabulary word (sequence embedding)
    '''
    counts = np.zeros(len(vocab2index), dtype = 'int')

    # Slide a window of substring_length letters along the sequence; each
    # window adds one to the entry of every word in its neighbourhood.
    n_windows = len(sequence) - substring_length + 1
    for offset in range(n_windows):
        window = sequence[offset:offset + substring_length]
        for word in neighbours[window]:
            counts[vocab2index[word]] += 1

    return counts


def mismatch_kernel(sequenceA, sequenceB, substring_length, vocab2index, neighbours, normalize = False):
    '''
    Mismatch kernel between two sequences: the dot product of their mismatch
    feature embeddings. Optional normalization as described in Leslie et al.

    Input:
        sequenceA: first sequence
        sequenceB: second sequence
        substring_length: length of the vocabulary words
        vocab2index: mapping from each vocabulary word to its index
        neighbours: mapping from each vocabulary word to the list of its neighbours
        normalize: if True, divide the dot product by the product of the two
            embedding norms
    Output:
        Kernel value. With normalize = True the result is 0.0 when either
        embedding is all zeros, instead of dividing by zero.
    '''
    embeddingA = create_mismatch_feature(sequenceA, substring_length, vocab2index, neighbours)
    embeddingB = create_mismatch_feature(sequenceB, substring_length, vocab2index, neighbours)

    similarity = np.dot(embeddingA, embeddingB)
    if not normalize:
        return similarity

    norm_product = np.linalg.norm(embeddingA) * np.linalg.norm(embeddingB)
    if norm_product == 0:
        # An all-zero embedding (e.g. a sequence shorter than substring_length)
        # would otherwise give 0/0 = NaN
        return 0.0
    return similarity / norm_product

# Test these functions

In [32]:
# DNA alphabet
alphabet = list('ACGT')

# Word length and number of tolerated mismatches
substring_length, mismatch_tol = 5, 1

# Index the vocabulary and precompute each word's mismatch neighbourhood
vocab2index, _ = create_vocab(alphabet, substring_length)
neighbours = compute_neighbours(vocab2index, mismatch_tol)


# Example usage:
# create_mismatch_feature(tr0['Sequence'][10], substring_length, vocab2index, neighbours)

# Kernel

In [None]:
# Parameter handed to kernelSVM (presumably the regularisation strength -- confirm)
lbda = 0.005
kSVM = kernelSVM(lbda)

data = tr0

# Train on the dataset selected above with the normalized mismatch kernel.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
kSVM.train(data['Sequence'].values,
           data['Bound'].values,
           kernel_fct = lambda seq_A, seq_B: mismatch_kernel(seq_A, seq_B, substring_length, vocab2index, neighbours, normalize = True))

#Test
# te0 = load_data(0, 'te')
# predictions = kSVM.predict(te0['Sequence'], stringsData = False)

Building kernel matrix from 2000x2000 samples...


In [20]:
# Scale every row of the feature matrix X to unit Euclidean norm.
# X must already exist when this cell runs (it is built in the cells further down).
row_sums = np.linalg.norm(X, axis = 1)
X = X / row_sums.reshape(-1, 1)

(2000,)

In [29]:
clf = LogisticRegression(C = 0.05)


data = tr0

# Feature matrix: one mismatch embedding per sequence (one row each)
X = np.zeros((len(data), len(vocab2index)))
for idx, seq in enumerate(data['Sequence']):
    X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours)
# Optional row normalisation, currently disabled:
# row_sums = np.linalg.norm(X, axis = 1)
# X = X / row_sums[:, np.newaxis]
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0
Y = data['Bound'].values


# 5-fold cross validation; the value returned by kfold is the cell output
kfold(data = X, 
      labels = Y, 
      n_folds = 5,
      train_method = clf.fit, 
      pred_method = clf.predict,
      metric = m_binary)

Engaging n-fold cross validation with 5 folds on 2000 items
Fold 0, Match rate: 0.71
Fold 1, Match rate: 0.68
Fold 2, Match rate: 0.67
Fold 3, Match rate: 0.67
Fold 4, Match rate: 0.72
Done! Average Match rate is 0.69


0.6880000000000001

In [16]:
data = tr0

# Feature matrix: one mismatch embedding per sequence (one row each)
X = np.zeros((len(data), len(vocab2index)))
for idx, seq in enumerate(data['Sequence']):
    X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours)
Y = data['Bound']

# Logistic Regression: grid over the penalty type and the parameter C.
# solver = 'liblinear' is set explicitly because it accepts both 'l1' and 'l2';
# the default solver of scikit-learn >= 0.22 ('lbfgs') rejects penalty = 'l1'.
for penal in ['l1', 'l2']:
    # Wider grid (disabled): [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    for regu in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]:
        model = LogisticRegression(penalty = penal, C = regu, solver = 'liblinear', random_state = 777)
        score = np.mean(cross_val_score(model, X, Y, cv = 5))
        print('Logistic Regression - {0} penalty - C: {1} - Mean Cross Validation Score: {2:.2f}%'.format(penal, regu, 100*score))

Logistic Regression - l1 penalty - C: 0.0001 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.0005 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.001 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.005 - Mean Cross Validation Score: 52.85%
Logistic Regression - l1 penalty - C: 0.01 - Mean Cross Validation Score: 65.40%
Logistic Regression - l1 penalty - C: 0.05 - Mean Cross Validation Score: 72.10%
Logistic Regression - l2 penalty - C: 0.0001 - Mean Cross Validation Score: 70.75%
Logistic Regression - l2 penalty - C: 0.0005 - Mean Cross Validation Score: 73.05%
Logistic Regression - l2 penalty - C: 0.001 - Mean Cross Validation Score: 73.35%
Logistic Regression - l2 penalty - C: 0.005 - Mean Cross Validation Score: 72.25%
Logistic Regression - l2 penalty - C: 0.01 - Mean Cross Validation Score: 71.25%
Logistic Regression - l2 penalty - C: 0.05 - Mean Cross Validation Score: 68.75%


In [17]:
data = tr1

# Feature matrix: one mismatch embedding per sequence (one row each)
X = np.zeros((len(data), len(vocab2index)))
for idx, seq in enumerate(data['Sequence']):
    X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours)
Y = data['Bound']

# Logistic Regression: grid over the penalty type and the parameter C.
# solver = 'liblinear' is set explicitly because it accepts both 'l1' and 'l2';
# the default solver of scikit-learn >= 0.22 ('lbfgs') rejects penalty = 'l1'.
for penal in ['l1', 'l2']:
    # Wider grid (disabled): [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    for regu in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]:
        model = LogisticRegression(penalty = penal, C = regu, solver = 'liblinear', random_state = 777)
        score = np.mean(cross_val_score(model, X, Y, cv = 5))
        print('Logistic Regression - {0} penalty - C: {1} - Mean Cross Validation Score: {2:.2f}%'.format(penal, regu, 100*score))

Logistic Regression - l1 penalty - C: 0.0001 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.0005 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.001 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.005 - Mean Cross Validation Score: 72.20%
Logistic Regression - l1 penalty - C: 0.01 - Mean Cross Validation Score: 79.50%
Logistic Regression - l1 penalty - C: 0.05 - Mean Cross Validation Score: 84.40%
Logistic Regression - l2 penalty - C: 0.0001 - Mean Cross Validation Score: 79.50%
Logistic Regression - l2 penalty - C: 0.0005 - Mean Cross Validation Score: 84.45%
Logistic Regression - l2 penalty - C: 0.001 - Mean Cross Validation Score: 85.25%
Logistic Regression - l2 penalty - C: 0.005 - Mean Cross Validation Score: 84.75%
Logistic Regression - l2 penalty - C: 0.01 - Mean Cross Validation Score: 84.45%
Logistic Regression - l2 penalty - C: 0.05 - Mean Cross Validation Score: 82.80%


In [18]:
data = tr2

# Feature matrix: one mismatch embedding per sequence (one row each)
X = np.zeros((len(data), len(vocab2index)))
for idx, seq in enumerate(data['Sequence']):
    X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours)
Y = data['Bound']

# Logistic Regression: grid over the penalty type and the parameter C.
# solver = 'liblinear' is set explicitly because it accepts both 'l1' and 'l2';
# the default solver of scikit-learn >= 0.22 ('lbfgs') rejects penalty = 'l1'.
for penal in ['l1', 'l2']:
    # Wider grid (disabled): [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    for regu in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]:
        model = LogisticRegression(penalty = penal, C = regu, solver = 'liblinear', random_state = 777)
        score = np.mean(cross_val_score(model, X, Y, cv = 5))
        print('Logistic Regression - {0} penalty - C: {1} - Mean Cross Validation Score: {2:.2f}%'.format(penal, regu, 100*score))

Logistic Regression - l1 penalty - C: 0.0001 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.0005 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.001 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.005 - Mean Cross Validation Score: 54.85%
Logistic Regression - l1 penalty - C: 0.01 - Mean Cross Validation Score: 58.25%
Logistic Regression - l1 penalty - C: 0.05 - Mean Cross Validation Score: 63.25%
Logistic Regression - l2 penalty - C: 0.0001 - Mean Cross Validation Score: 62.75%
Logistic Regression - l2 penalty - C: 0.0005 - Mean Cross Validation Score: 64.30%
Logistic Regression - l2 penalty - C: 0.001 - Mean Cross Validation Score: 64.35%
Logistic Regression - l2 penalty - C: 0.005 - Mean Cross Validation Score: 63.55%
Logistic Regression - l2 penalty - C: 0.01 - Mean Cross Validation Score: 63.50%
Logistic Regression - l2 penalty - C: 0.05 - Mean Cross Validation Score: 62.85%
