# Spectrum Kernel Development
Described in Slides 341 -> 344.

In [None]:
%reset

In [1]:
import os
os.chdir('../')

from data_handler import *

import numpy as np

# Create Spectrum Kernel functions

In [2]:
def create_dictionary(training_sequences, substring_length):
    '''Create the dictionary/vocabulary of possible subsequeces of length substring_length from training sequences.
    "ABCD" contains two words of legnth 3: "ABC" and "BCD".
    
    Input:
        training_sequences: array like strucutre containing training sequences
        substring_length: length of substring in sequence
    Output:
        word_2_index: mapping between a word and its index. The keys are all the words of length substring_length appearing in training sequences. 
    '''
    
    unique_subsequences = set()

    for string in training_sequences:
        for start in range(len(string)-substring_length+1):
            end = start + substring_length
            substring = string[start:end]
            unique_subsequences.add(substring)
    
    #Creating the word_2_index mapping words and their index. The keys are all the words.
    unique_subsequences = sorted(unique_subsequences)
    word_2_index = dict()
    for idx, word in enumerate(unique_subsequences):
        word_2_index[word] = idx
        
    return word_2_index

In [3]:
def create_occ_feature(sequence, substring_length, dictionary, normalize=True):
    '''Create the spectrum kernel feature vector of occurences of every word in dictionary/vocabulary.
    
    Input:
        sequence: ADN sequence to transform
        dictionary: already trained dictionary listing all the words appearing in training and their index
        normalize: if true, transform the occurences in percentage (frequencies)
    Ouput:
        feature: occurence of each word in dictionary/vocabulary
    '''
    
    feature = np.zeros(len(dictionary), dtype = int)
    
    for start in range(len(sequence)-substring_length+1):
        end = start + substring_length
        substring = sequence[start:end]
        if substring in dictionary: #It is possible that some word in test are not appearing in training
            feature[dictionary[substring]] = feature[dictionary[substring]] + 1
            
    if normalize:
        feature = feature/feature.sum()
        
    return feature

In [4]:
def spetrum_kernel(sequence_A, sequence_B, substring_length, dictionary, normalize=False):
    '''substring_length-spectrum kernel
    
    Input:
        sequence_A: first sequence
        sequence_B: second sequence
        substring_length: length of word in vocabulary
        dictionary: vocabulary derived from training
        normalize: if true, transform the occurences in percentage (frequencies)
    Output:
        kernel similarity between sequence_A and sequence_B
    '''
    
    feature_A = create_occ_feature(sequence_A, substring_length, dictionary, normalize)
    feature_B = create_occ_feature(sequence_B, substring_length, dictionary, normalize)
    
    return np.dot(feature_A, feature_B)

# Test these functions

In [5]:
data = load_data(1, 'tr')

substring_length = 3
dictionary = create_dictionary(data['Sequence'], substring_length)

seq_A = data['Sequence'][0]
seq_B = data['Sequence'][2]

spetrum_kernel(seq_A, seq_B, substring_length, dictionary, False)

95

# Training simple kernel SVM using spectrum kernel

In [6]:
from kernel_methods import *

In [7]:
lbda = 0.5
kSVM = kernelSVM(lbda)

In [9]:
#Charge training data and create training vocabulary
tr0 = load_data(0, 'tr')

substring_length = 3
dictionary = create_dictionary(tr0['Sequence'], substring_length)

#Train
kSVM.train(tr0['Sequence'], 
           tr0['Bound'], 
           lambda seq_A, seq_B: spetrum_kernel(seq_A, seq_B, substring_length, dictionary, normalize = False), 
           stringsData = False)

#Test
te0 = load_data(0, 'te')
predictions = kSVM.predict(te0['Sequence'], stringsData = False)

  arr1 = arr1.reshape((len(arr1),1))
  arr2 = arr2.reshape((len(arr2),1))


Building kernel matrix from 2000x2000 samples...
...done in 24.90s
Building kernel matrix from 1000x2000 samples...
...done in 11.01s


# Optimizing simple classification models on spectrum features

In [33]:
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [34]:
data = load_data(0, 'tr')

Y = data['Bound']

In [None]:
for substring_length in [3, 4, 5, 6, 7]:
    print('Substring Length: {} -------------------------'.format(substring_length))
    dictionary = create_dictionary(data['Sequence'], substring_length)
    print('Vocabulary size: {}'.format(len(dictionary)))

    X = np.zeros((len(data), len(dictionary)))
    for idx, seq in enumerate(data['Sequence']):
        X[idx, :] = create_occ_feature(seq, substring_length, dictionary, normalize = False)
    
    max_score = 0
    max_model = None
    
    #Logistic Regrression
    for penal in ['l1', 'l2']:
        for regu in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5,10]:
            model = LogisticRegression(penalty = penal, C = regu, random_state = 777)
            score = np.mean(cross_val_score(model, X, Y, cv = 5))
            if score > max_score:
                max_score = score
                max_model = model
                print('Logistic Regression - {0} penalty - C: {1} - Mean Cross Validation Score: {2:.2f}%'.format(penal, regu, 100*score))

    #Random Forest
    for max_depth in [2,3,4,5]:
        for n_estim in [10,25,50,75,100,500]:
            model = RandomForestClassifier(max_depth = max_depth, n_estimators = n_estim, random_state = 777)
            score = np.mean(cross_val_score(model, X, Y, cv = 5))
            if score > max_score:
                max_score = score
                max_model = model
                print('Random Forest - {0} max_depth - n_estimators: {1} - Mean Cross Validation Score: {2:.2f}%'.format(max_depth, n_estim, 100*score))
    
    #XGBoost             
    for max_depth in [2,3,4,5]:
        for n_estim in [10,25,50,75,100,500]:
            model = XGBClassifier(max_depth = max_depth, n_estimators = n_estim, random_state = 777)
            score = np.mean(cross_val_score(model, X, Y, cv = 5))
            if score > max_score:
                max_score = score
                max_model = model
                print('Xgboost - {0} max_depth - n_estimators: {1} - Mean Cross Validation Score: {2:.2f}%'.format(max_depth, n_estim, 100*score))
    #SVM
    for kernel in ['linear', 'rbf']:
        for regu in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5,10]:
            model = SVC(kernel = kernel, C = regu, random_state = 777)
            score = np.mean(cross_val_score(model, X, Y, cv = 5))
            if score > max_score:
                max_score = score
                max_model = model
                print('SVM - {0} kernel - C: {1} - Mean Cross Validation Score: {2:.2f}%'.format(kernel, regu, 100*score))

    #MLP
    for architecture in [(10), (20), (50), (100), (10,10), (20,20), (50,50), (100,100)]:
        for alpha in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5,10]:
            model = MLPClassifier(hidden_layer_sizes = architecture, alpha = alpha)
            score = np.mean(cross_val_score(model, X, Y, cv = 5))
            if score > max_score:
                max_score = score
                max_model = model
                print('MLP - {0} architecture - regu: {1} - Mean Cross Validation Score: {2:.2f}%'.format(architecture, alpha, 100*score))
                
print('Best Model is: {}'.format(max_model))
print('It achieves {:.2f}% 5fold cross validation'.format(100*max_score))

Substring Length: 3 -------------------------
Vocabulary size: 64
Logistic Regression - l1 penalty - C: 0.001 - Mean Cross Validation Score: 50.00%
Logistic Regression - l1 penalty - C: 0.01 - Mean Cross Validation Score: 63.15%
Logistic Regression - l1 penalty - C: 0.05 - Mean Cross Validation Score: 66.80%
Logistic Regression - l1 penalty - C: 0.1 - Mean Cross Validation Score: 67.25%
Logistic Regression - l2 penalty - C: 0.005 - Mean Cross Validation Score: 67.40%
Logistic Regression - l2 penalty - C: 0.01 - Mean Cross Validation Score: 67.85%
MLP - 10 architecture - regu: 10 - Mean Cross Validation Score: 68.30%
MLP - 50 architecture - regu: 5 - Mean Cross Validation Score: 68.50%
MLP - 100 architecture - regu: 5 - Mean Cross Validation Score: 68.85%
