# Raw DNA strings

In this notebook we train a kernel SVM directly on raw string data (DNA sequences).

In [None]:
from kernel import *

from KernelSVM import KernelSVM

import numpy as np

from processing import train_test_split, load_y

import pandas as pd

# Load and pre-process data

In [None]:
dataset_index = '2'

# Load the DNA strings (one list entry per line of the file). The context
# manager guarantees the file is closed even if reading raises.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

print('First DNA string : ')
data[0]

In [None]:
# Hold-out split: samples 0-199 are used for fitting, samples 200-299 for scoring.
X_train, X_test = data[:200], data[200:300]
y_train, y_test = y[:200], y[200:300]
# Optional smaller subset, kept disabled:
#X_train, y_train, X_test, y_test = X_train[:75], y_train[:75], X_test[:25], y_test[:25]

# Train

### Kitchen sink

In [None]:
#### Test of the random kitchen sinks features
# For each (k, gamma), draw several random projections (W, b), lift the data
# with compute_conv_features, fit a linear SVM for every lambda in lambda_list
# and average the hold-out accuracy (in percent) over the random draws.
alpha_size = 4

M = 6*2048  # number of rows of W and of entries of b

kernel_parameters = {} # no parameters for linear kernel

n_lambda = 10
n_runs = 5  # number of random (W, b) draws averaged for each gamma
lambda_list = np.linspace(0.005, 0.01, n_lambda)

# The encoded sequences depend neither on k, gamma nor on the random draw,
# so they are computed once instead of once per run.
X_train_ = sequence_to_matrix(X_train)
X_test_ = sequence_to_matrix(X_test)

# Initialise every "best" value so the final print cannot raise a NameError
# when no configuration scores above 0.
best_score = 0
best_k = best_gamma = best_lambda = None

for k in [7]:
    for gamma in np.linspace(0.5, 1, 5):
        # Sum of the scores over the runs, one entry per lambda.
        score_avg_list = np.zeros(n_lambda)
        for run in range(n_runs):

            W = gamma*np.random.randn(M,4*k)
            b = (2*np.pi)*np.random.rand(M)

            X_lift_train = compute_conv_features(X_train_, W, b, alpha_size = alpha_size)
            print("Train set lifted")
            X_lift_test = compute_conv_features(X_test_, W, b, alpha_size = alpha_size)
            print("Test set lifted")

            # lambda_reg = 0 is only a placeholder: it is overwritten before each fit.
            clf = KernelSVM(lambda_reg = 0, loss="squared_hinge", kernel = linear, 
                        kernel_parameters = kernel_parameters, data_type='vector', threshold=0, verbose=False)

            for lambda_index, lambda_reg in enumerate(lambda_list):

                clf.lambda_reg = lambda_reg

                clf.fit(X_lift_train, y_train)
                y_pred_test = clf.pred(X_lift_test)

                score =  sum(y_test == y_pred_test)/len(y_test)*100
                #print('k : {}, Gamma : {}, Lambda : {}, Score: {}'.format(k,gamma, lambda_reg, score))

                score_avg_list[lambda_index] += score

        # Average over the random draws. Dividing by n_lambda, as before,
        # under-reported the score by a factor n_lambda / n_runs.
        score_avg_list = score_avg_list/n_runs
        score_avg = np.max(score_avg_list)
        lambda_opt_index = np.argmax(score_avg_list)
        lambda_opt = lambda_list[lambda_opt_index]

        if score_avg > best_score:
            best_score = score_avg
            best_gamma = gamma
            best_lambda = lambda_opt
            best_k = k

print('Best k : {}, best gamma : {}, best lambda : {}, best score: {}'.format(best_k, best_gamma, best_lambda, best_score))

### k-spectrum kernel

In [None]:
# Hold-out grid search over the spectrum-kernel parameter k and the SVM
# regularisation strength lambda_reg.

kernel_parameters_list = [{'k':6}, {'k':7}, {'k':8}]
lambda_grid = np.linspace(0.01,1, 10)

best_score, best_param = 0, None

for kernel_parameters in kernel_parameters_list:

    # One classifier per kernel setting; its lambda_reg is overwritten before every fit.
    svm = KernelSVM(kernel = spectrum_kernel, kernel_parameters = kernel_parameters,
                    loss='hinge', lambda_reg = 1, data_type='string')

    for lambda_reg in lambda_grid:

        svm.lambda_reg = lambda_reg

        # Fit on the training split and predict the held-out split.
        svm.fit(X_train, y_train)
        y_pred = svm.pred(X_test)

        # Fraction of held-out labels predicted correctly.
        n_correct = sum(y_test == y_pred)
        score = n_correct/len(y_test)
        print(kernel_parameters, lambda_reg, score)

        # Keep the first configuration that reaches the highest score.
        if score > best_score:
            best_score = score
            best_param = (lambda_reg, kernel_parameters)

print(best_score)
print(best_param)

# Submission

### Mixed (k-spectrum + kitchen sink)

In [None]:
# Prediction buffer: the following cells each fill one 1000-entry slice
# (dataset_index 0, 1 and 2).
y_pred = np.zeros(3000, dtype=float)

In [None]:
dataset_index = 0

## Training ##
# Load the DNA strings; the context manager closes the file even if reading raises.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

# The whole training file is used for fitting (no hold-out split here).
X_train, y_train = data, y

kernel_parameters = {'k':6}

clf =  KernelSVM(lambda_reg = 0.1, loss='hinge', kernel = spectrum_kernel, 
                kernel_parameters = kernel_parameters, data_type='string')

clf.fit(X_train, y_train)

## Prediction ##

with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
    X_test = [line.strip('\n') for line in f_]

# Write this dataset's 1000 predictions into its slice of the shared buffer.
y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(clf.pred(X_test), (1000,))

In [None]:
dataset_index = 1

alpha_size = 4

kernel_parameters = {}  # the linear kernel takes no parameters

## Training ##
# Load the DNA strings; the context manager closes the file even if reading raises.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

# The whole training file is used for fitting (no hold-out split here).
X_train, y_train = data, y

# Previously saved projection arrays for this dataset.
W = np.load('pickled_files/W_1.npy')
b = np.load('pickled_files/b_1.npy')

# Vectorize string (one-hot encoding)
X_train_ = sequence_to_matrix(X_train)

# Get features
X_lift_train = compute_conv_features(X_train_, W, b, alpha_size = alpha_size)

clf = KernelSVM(lambda_reg = 0.054210526315789466, loss="squared_hinge", kernel = linear, 
            kernel_parameters = kernel_parameters, data_type='vector', threshold=0, verbose=False)

clf.fit(X_lift_train, y_train)

## Prediction ##

with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
    X_test = [line.strip('\n') for line in f_]

# Vectorize string (one-hot encoding)
X_test_ = sequence_to_matrix(X_test)

# Get features (same W and b as for the training set)
X_lift_test = compute_conv_features(X_test_, W, b, alpha_size = alpha_size)

# Write this dataset's 1000 predictions into its slice of the shared buffer.
y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(clf.pred(X_lift_test), (1000,))

In [None]:
dataset_index = 2

alpha_size = 4

# Defined here so the cell does not depend on a value left behind by an
# earlier cell; the linear kernel takes no parameters.
kernel_parameters = {}

## Training ##
# Load the DNA strings; the context manager closes the file even if reading raises.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

# The whole training file is used for fitting (no hold-out split here).
X_train, y_train = data, y

# Previously saved projection arrays for this dataset.
W = np.load('pickled_files/W_2.npy')
b = np.load('pickled_files/b_2.npy')

# Vectorize string (one-hot encoding)
X_train_ = sequence_to_matrix(X_train)

# Get features
X_lift_train = compute_conv_features(X_train_, W, b, alpha_size = alpha_size)

clf = KernelSVM(lambda_reg = 0.16631578947368422, loss="squared_hinge", kernel = linear, 
            kernel_parameters = kernel_parameters, data_type='vector', threshold=0, verbose=False)

clf.fit(X_lift_train, y_train)

## Prediction ##

with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
    X_test = [line.strip('\n') for line in f_]

# Vectorize string (one-hot encoding)
X_test_ = sequence_to_matrix(X_test)

# Get features (same W and b as for the training set)
X_lift_test = compute_conv_features(X_test_, W, b, alpha_size = alpha_size)

# Write this dataset's 1000 predictions into its slice of the shared buffer.
y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(clf.pred(X_lift_test), (1000,))

In [None]:
# Map the predictions back so that y lies in 0 and 1, then write the submission
# file with an 'Id' index column and an integer 'Bound' column.
y_pred = (y_pred + 1) / 2
df = pd.DataFrame({'Bound': y_pred})
df.index.name = 'Id'
df['Bound'] = df['Bound'].astype(int)
df.to_csv('mixed.csv')

### k-spectrum

In [None]:
# Prediction buffer: one 1000-entry slice per dataset.
y_pred = np.zeros((3000,))

# k-spectrum: regularisation strength and kernel parameters used for each dataset
lambda_opt=[0.12,0.035,0.066]
kernel_parameters_opt=[{'k':6}, {'k':6}, {'k':4}]


for dataset_index in range(3):
    # Load the DNA strings; the context manager closes the file even if reading raises.
    with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
        data = [line.strip('\n') for line in f]

    # Load the target vector.
    y = load_y("data/train/Ytr{}.csv".format(dataset_index))
    y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
    y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

    # The model is fitted on the whole training file. X_test / y_test are a
    # slice of that same data (index 1500 onwards) and are not used below.
    X_train, y_train, X_test, y_test = data, y, data[1500:], y[1500:]

    lambda_reg=lambda_opt[dataset_index]
    kernel_parameters=kernel_parameters_opt[dataset_index]

    SVM = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, loss="hinge",
                    kernel_parameters = kernel_parameters, data_type='string')

    SVM.fit(X_train, y_train)

    with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
        X = [line.strip('\n') for line in f_]

    # Write this dataset's 1000 predictions into its slice of the shared buffer.
    y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(SVM.pred(X), (1000,))

y_pred = (y_pred+1)/2 # to have y in 0 and 1
df = pd.DataFrame(y_pred, columns=['Bound'])
df.index.name = 'Id'
df.Bound = df.Bound.astype(int)
df.to_csv('kspectrum.csv')