# Spectrum Kernel Development
Described in Slides 341 -> 344.

In [None]:
%reset

In [None]:
import os
os.chdir('../')

from data_handler import *

import numpy as np

# Create Spectrum Kernel functions

In [None]:
def create_dictionary(training_sequences, substring_length):
    '''Build the vocabulary of all substrings of length substring_length found in the training sequences.
    For example, "ABCD" contains two words of length 3: "ABC" and "BCD".
    
    Input:
        training_sequences: array-like structure containing the training sequences
        substring_length: length of the substrings (words) to extract
    Output:
        word_2_index: mapping between a word and its index. The keys are all the words of length
            substring_length appearing in the training sequences; indices follow the sorted order of the words.
    '''
    
    # Slide a window of width substring_length over every sequence; the set removes duplicates.
    vocabulary = {
        sequence[offset:offset + substring_length]
        for sequence in training_sequences
        for offset in range(len(sequence) - substring_length + 1)
    }
    
    # Sorting first makes the word -> index assignment deterministic.
    return {word: position for position, word in enumerate(sorted(vocabulary))}

In [None]:
def create_occ_feature(sequence, substring_length, dictionary, normalize=True):
    '''Create the spectrum kernel feature vector of occurrences of every word in the dictionary/vocabulary.
    
    Input:
        sequence: DNA sequence to transform
        substring_length: length of the words counted in the sequence
        dictionary: already trained dictionary listing all the words appearing in training and their index
        normalize: if true, transform the occurrences into frequencies (the vector sums to 1)
    Output:
        feature: occurrence (or frequency) of each word of the dictionary/vocabulary in the sequence.
            When normalize is true and the sequence contains no word of the dictionary,
            a vector of float zeros is returned instead of NaNs.
    '''
    
    feature = np.zeros(len(dictionary), dtype=int)
    
    for start in range(len(sequence) - substring_length + 1):
        # Words seen only at test time are absent from the dictionary and are skipped.
        index = dictionary.get(sequence[start:start + substring_length])
        if index is not None:
            feature[index] += 1
            
    if normalize:
        total = feature.sum()
        if total == 0:
            # No known word in the sequence: dividing by zero would fill the vector with NaN.
            # Return float zeros so the dtype matches the normalized case.
            return feature.astype(float)
        feature = feature / total
        
    return feature

In [None]:
def spetrum_kernel(sequence_A, sequence_B, substring_length, dictionary, normalize=False):
    '''Evaluate the substring_length-spectrum kernel between two sequences.
    
    Both sequences are mapped to their word-occurrence vectors over the dictionary,
    and the kernel value is the dot product of these two vectors.
    
    Input:
        sequence_A: first sequence
        sequence_B: second sequence
        substring_length: length of the words in the vocabulary
        dictionary: vocabulary derived from training
        normalize: if true, use word frequencies instead of raw occurrence counts
    Output:
        kernel similarity between sequence_A and sequence_B
    '''
    
    occurrences_A, occurrences_B = (
        create_occ_feature(seq, substring_length, dictionary, normalize)
        for seq in (sequence_A, sequence_B)
    )
    
    return np.dot(occurrences_A, occurrences_B)

# Test these functions

In [None]:
# Load a dataset through the project's data_handler helper.
# NOTE(review): presumably dataset 1, training split -- confirm against load_data.
data = load_data(1, 'tr')

# Build the vocabulary of all length-3 substrings found in the loaded sequences.
substring_length = 3
dictionary = create_dictionary(data['Sequence'], substring_length)

# Pick two sequences to compare.
seq_A = data['Sequence'][0]
seq_B = data['Sequence'][2]

# Kernel value on raw (non-normalized) occurrence counts; as the last expression
# of the cell, its value is what the notebook displays.
spetrum_kernel(seq_A, seq_B, substring_length, dictionary, False)

# Training simple kernel SVM using spectrum kernel

In [None]:
from kernel_methods import *

In [None]:
# NOTE(review): lbda is presumably the SVM regularization strength -- confirm against kernelSVM.
lbda = 0.5
kSVM = kernelSVM(lbda)

In [None]:
# Load the training data and build the training vocabulary.
# NOTE(review): presumably dataset 0, training split -- confirm against load_data.
tr0 = load_data(0, 'tr')

substring_length = 3
dictionary = create_dictionary(tr0['Sequence'], substring_length)

# Train: the lambda wraps the spectrum kernel so the model only has to pass two sequences.
# It reads substring_length and dictionary from the notebook globals at call time,
# so re-assigning them later changes the kernel being evaluated.
# NOTE(review): meaning of stringsData = False is defined in kernel_methods -- confirm.
kSVM.run(tr0['Sequence'], 
         tr0['Bound'], 
         lambda seq_A, seq_B: spetrum_kernel(seq_A, seq_B, substring_length, dictionary, normalize = False), 
         stringsData = False)

# Test: predict on the matching test set.
# NOTE(review): presumably dataset 0, test split -- confirm against load_data.
te0 = load_data(0, 'te')
predictions = kSVM.predict(te0['Sequence'], stringsData = False)