# Spectrum Kernel Development
Described in Slides 341 -> 344.

In [None]:
%reset

In [1]:
import os
os.chdir('../')

from data_handler import *

import numpy as np
import pdb

# Create Spectrum Kernel functions

In [2]:
def create_dictionary(training_sequences, substring_length):
    '''Build the vocabulary of every substring of a given length seen in the training sequences.

    Each window of substring_length consecutive characters counts as one word,
    e.g. "ABCD" holds two words of length 3: "ABC" and "BCD".

    Input:
        training_sequences: iterable of training sequences
        substring_length: number of characters in each word
    Output:
        word_2_index: dict mapping each distinct word to its position in the sorted vocabulary
    '''

    vocabulary = {
        sequence[offset:offset + substring_length]
        for sequence in training_sequences
        for offset in range(len(sequence) - substring_length + 1)
    }

    # Sorting makes the word -> index assignment deterministic.
    return {word: position for position, word in enumerate(sorted(vocabulary))}

In [3]:
def create_occ_feature(sequence, substring_length, dictionary, normalize=True):
    '''Create the spectrum kernel feature vector: occurrences in sequence of every word of the vocabulary.

    Input:
        sequence: DNA sequence to transform
        substring_length: length of the words stored in dictionary
        dictionary: trained vocabulary mapping every word seen in training to its index
        normalize: if true, convert the occurrence counts into frequencies
    Output:
        feature: array of length len(dictionary). Integer counts when normalize
            is false; float frequencies summing to 1 when normalize is true
            (all zeros if no vocabulary word occurs in sequence).
    '''

    feature = np.zeros(len(dictionary), dtype=int)

    for start in range(len(sequence) - substring_length + 1):
        # Words of the sequence that were never seen in training are ignored.
        index = dictionary.get(sequence[start:start + substring_length])
        if index is not None:
            feature[index] += 1

    if normalize:
        total = feature.sum()
        if total == 0:
            # No known word (or empty vocabulary): dividing would yield NaNs,
            # so return the zero vector with the same float dtype instead.
            return feature.astype(float)
        feature = feature / total

    return feature

In [4]:
def spetrum_kernel(sequence_A, sequence_B, substring_length, dictionary, normalize=False):
    '''Spectrum kernel of order substring_length between two sequences.

    Input:
        sequence_A: first sequence
        sequence_B: second sequence
        substring_length: length of the words in the vocabulary
        dictionary: vocabulary derived from training (word -> index)
        normalize: if true, compare word frequencies instead of raw occurrence counts
    Output:
        dot product of the occurrence feature vectors of sequence_A and sequence_B
    '''

    occurrences_A, occurrences_B = (
        create_occ_feature(sequence, substring_length, dictionary, normalize)
        for sequence in (sequence_A, sequence_B)
    )

    return np.dot(occurrences_A, occurrences_B)

# Test these functions

In [None]:
# Smoke test: evaluate the spectrum kernel on two sequences from load_data(1, 'tr').
data = load_data(1, 'tr')
sequences = data['Sequence']

substring_length = 3
dictionary = create_dictionary(sequences, substring_length)

seq_A = sequences[0]
seq_B = sequences[2]

# Raw occurrence counts (no normalization); as the last expression of the cell
# the value is displayed by the notebook.
spetrum_kernel(seq_A, seq_B, substring_length, dictionary, normalize=False)

# Kernel SVM with spectrum kernel

In [5]:
%run kernel_methods.py

In [6]:
# Instantiate the kernel SVM (kernelSVM is not defined in this notebook;
# presumably provided by kernel_methods.py).
# NOTE(review): lbda is presumably the regularization weight -- TODO confirm.
lbda = 0.5
kSVM = kernelSVM(lbda)

In [7]:
# Load the training data and build the vocabulary from all three training sets
tr0 = load_data(0, 'tr')
tr1 = load_data(1, 'tr')
tr2 = load_data(2, 'tr')
tr = pd.concat([tr0, tr1, tr2])

substring_length = 3
dictionary = create_dictionary(tr['Sequence'], substring_length)


def _build_occ_features(frame, substring_length, dictionary):
    '''Stack the raw (non-normalized) occurrence vector of every sequence of frame.

    Input:
        frame: DataFrame with a 'Sequence' column
        substring_length: length of the words in the vocabulary
        dictionary: vocabulary mapping each word to its index
    Output:
        2-D array with one row per sequence and one column per vocabulary word
    '''
    # Iterating the column directly works for any number of rows (no
    # hard-coded reshape) and avoids Series.as_matrix(), removed in pandas 1.0.
    return np.array([create_occ_feature(sequence, substring_length, dictionary, False)
                     for sequence in frame['Sequence']])


# Creating feature vectors
features_tr0 = _build_occ_features(tr0, substring_length, dictionary)
features_tr1 = _build_occ_features(tr1, substring_length, dictionary)
features_tr2 = _build_occ_features(tr2, substring_length, dictionary)

## Loading test data
te0 = load_data(0, 'te')
te1 = load_data(1, 'te')
te2 = load_data(2, 'te')

# Creating feature vectors (test words absent from the training vocabulary are ignored)
features_te0 = _build_occ_features(te0, substring_length, dictionary)
features_te1 = _build_occ_features(te1, substring_length, dictionary)
features_te2 = _build_occ_features(te2, substring_length, dictionary)

In [None]:
### Kernel SVM - From kernel ###
# Train directly on the raw sequences; the callable evaluates the spectrum
# kernel (raw counts) between two samples.
# Series.values replaces Series.as_matrix(), which was removed in pandas 1.0.
# NOTE(review): the kernel indexes seq_A[0] / seq_B[0], so each sample is
# presumably handed over wrapped in a length-1 container -- confirm against
# kernelSVM.train.
kSVM.train(tr0['Sequence'].values,
           tr0['Bound'].values,
           lambda seq_A, seq_B: spetrum_kernel(seq_A[0], seq_B[0], substring_length, dictionary, normalize = False))

# NOTE(review): predict receives a pandas Series whereas train received an
# ndarray -- confirm that kernelSVM.predict accepts both.
predictions = kSVM.predict(te0['Sequence'])

In [None]:
### Kernel SVM - Linear from features ###
# Train on the precomputed occurrence features of the first training set.
# Series.values replaces Series.as_matrix(), which was removed in pandas 1.0.
kSVM.train(features_tr0,
           tr0['Bound'].values,
           stringsData=False, solver='mosek')

# Test
predictions = kSVM.predict(features_te0, stringsData=False)

# Kernel kNN with spectrum kernel

In [21]:
### Kernel NN - Linear from features ###
# NOTE(review): k is presumably the number of neighbours used by kernelKNN -- TODO confirm.
k = 10
kNN = kernelKNN(k)

In [22]:
def _knn_prediction_frame(model, features_tr, labels_tr, features_te, first_id=0):
    '''Train model on one training set and predict the matching test set.

    Input:
        model: classifier exposing train(features, labels) and predict(features)
        features_tr: training feature matrix
        labels_tr: training labels
        features_te: test feature matrix
        first_id: value of the first index entry of the returned frame
    Output:
        (raw, frame): the raw output of model.predict and a one-column
        'Bound' DataFrame of the formatted predictions, indexed from first_id
    '''
    model.train(features_tr, labels_tr)
    raw = model.predict(features_te)
    frame = pd.DataFrame(
        data = format_preds(raw),
        columns = ['Bound'])
    if first_id:
        frame.index = frame.index + first_id
    return raw, frame


# Ids continue from one test set to the next; the offsets are derived from the
# actual number of predictions instead of assuming 1000 rows per test set.
# Series.values replaces Series.as_matrix(), which was removed in pandas 1.0.
kNN_te0_raw, kNN_te0 = _knn_prediction_frame(
    kNN, features_tr0, tr0['Bound'].values, features_te0)
kNN_te1_raw, kNN_te1 = _knn_prediction_frame(
    kNN, features_tr1, tr1['Bound'].values, features_te1,
    first_id=len(kNN_te0))
kNN_te2_raw, kNN_te2 = _knn_prediction_frame(
    kNN, features_tr2, tr2['Bound'].values, features_te2,
    first_id=len(kNN_te0) + len(kNN_te1))

frames = [kNN_te0, kNN_te1, kNN_te2]
kNN_te = pd.concat(frames)
kNN_te.index = kNN_te.index.set_names(['Id'])

kNN_te.to_csv('predictions/kNN10_spectralKernel_te.csv')

Building kernel matrix from 1000x2000 samples...
...done in 7.60s
Building kernel matrix from 1000x2000 samples...
...done in 7.71s
Building kernel matrix from 1000x2000 samples...
...done in 8.54s


In [18]:
### Kernel NN - Linear from features ###
# 5-fold cross-validation of a k=20 kernel kNN on the features of the first training set.
# Series.values replaces Series.as_matrix(), which was removed in pandas 1.0.
k = 20
kNN = kernelKNN(k)
kNN.assess(features_tr0, tr0['Bound'].values, n_folds=5, stringsData=False)

Engaging n-fold cross validation with 5 folds on 2000 items
Building kernel matrix from 400x1600 samples...
...done in 2.45s
Fold 0, Match rate: 0.90
Building kernel matrix from 400x1600 samples...
...done in 2.58s
Fold 1, Match rate: 0.88
Building kernel matrix from 400x1600 samples...
...done in 2.41s
Fold 2, Match rate: 0.90
Building kernel matrix from 400x1600 samples...
...done in 2.53s
Fold 3, Match rate: 0.90
Building kernel matrix from 400x1600 samples...
...done in 2.56s
Fold 4, Match rate: 0.90
Done! Average Match rate is 0.90


0.89587499999999998

In [19]:
# 5-fold cross-validation on the features of the second training set.
# Series.values replaces Series.as_matrix(), which was removed in pandas 1.0.
kNN.assess(features_tr1, tr1['Bound'].values, n_folds=5, stringsData=False)

Engaging n-fold cross validation with 5 folds on 2000 items
Building kernel matrix from 400x1600 samples...
...done in 2.73s
Fold 0, Match rate: 0.93
Building kernel matrix from 400x1600 samples...
...done in 2.57s
Fold 1, Match rate: 0.92
Building kernel matrix from 400x1600 samples...
...done in 2.55s
Fold 2, Match rate: 0.93
Building kernel matrix from 400x1600 samples...
...done in 2.58s
Fold 3, Match rate: 0.92
Building kernel matrix from 400x1600 samples...
...done in 2.48s
Fold 4, Match rate: 0.91
Done! Average Match rate is 0.92


0.92225000000000001

In [20]:
# 5-fold cross-validation on the features of the third training set.
# Series.values replaces Series.as_matrix(), which was removed in pandas 1.0.
kNN.assess(features_tr2, tr2['Bound'].values, n_folds=5, stringsData=False)

Engaging n-fold cross validation with 5 folds on 2000 items
Building kernel matrix from 400x1600 samples...
...done in 2.59s
Fold 0, Match rate: 0.89
Building kernel matrix from 400x1600 samples...
...done in 2.54s
Fold 1, Match rate: 0.88
Building kernel matrix from 400x1600 samples...
...done in 2.46s
Fold 2, Match rate: 0.89
Building kernel matrix from 400x1600 samples...
...done in 2.42s
Fold 3, Match rate: 0.88
Building kernel matrix from 400x1600 samples...
...done in 2.57s
Fold 4, Match rate: 0.89
Done! Average Match rate is 0.88


0.88400000000000001