# Raw DNA strings

In this notebook we train SVM on string data (DNA strings).

In [1]:
from kernel import *

from KernelSVM import KernelSVM

import numpy as np

from processing import train_test_split, load_y

import pandas as pd

# Load and pre-process data

In [2]:
data = []

dataset_index = '2'

# load DNA strings 
f = open('data/train/Xtr{}.csv'.format(dataset_index), 'r')
data += [line.strip('\n') for line in f.readlines()]
f.close()

# load target vector 
y = (load_y("data/train/Ytr{}.csv".format(dataset_index)))
y = 2*y - 1 # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1) # 

print('First DNA string : ')
data[0]

First DNA string : 


'GAAAAAACGTTAGCAAACAAGGAACAAAGACAAAGCTGTCAACGGTCCATGGAATCTTGAAATTTAAATAATTGTTACACATTTTGTTTTGTTCTAACTGT'

In [3]:
X_train, y_train, X_test, y_test = data[:1500], y[:1500], data[1500:], y[1500:]
#X_train, y_train, X_test, y_test = X_train[:75], y_train[:75], X_test[:25], y_test[:25]

# Train

### Kitchen sink

In [None]:
#### TEST of the KITCHENS SINKs
alpha_size = 4

M = 4*2048

best_score = 0

kernel_parameters = {} # no parameters for linear kernel

""" 
X_lift_train = phi_sink(X_train, k, gamma, M)
print("Train set lifted")
X_lift_test = phi_sink(X_test, k, gamma, M)
print("Test set lifted")
"""
for k in [6, 7, 8, 9]:
    for gamma in [0.6, 0.8, 1.]:
        W = gamma*np.random.randn(M,4*k)
        b = (2*np.pi)*np.random.rand(M)

        X_train_ = sequence_to_matrix(X_train)
        X_test_ = sequence_to_matrix(X_test)

        X_lift_train = compute_conv_features(X_train_, W, b, alpha_size = alpha_size)
        print("Train set lifted")
        X_lift_test = compute_conv_features(X_test_, W, b, alpha_size = alpha_size)
        print("Test set lifted")

        clf = KernelSVM(lambda_reg = 0, loss="squared_hinge", kernel = linear, 
                    kernel_parameters = kernel_parameters, data_type='vector', threshold=0, verbose=False)

        for lambda_reg in np.linspace(0.01, 1, 20):

            clf.lambda_reg = lambda_reg

            clf.fit(X_lift_train, y_train)

            y_pred_test = clf.predict(X_lift_test)
            score =  sum(y_test == y_pred_test)/len(y_test)*100

            print('Gamma : {}, Lambda : {}, Score: {}'.format(gamma, lambda_reg, score))
            if score > best_score:
                best_score = score
                best_gamma = gamma
                best_lambda = lambda_reg
                best_k = k

print('Best k : {}, best gamma : {}, best lambda : {}, best score: {}'.format(best_k, best_gamma, best_lambda, best_score))

Train set lifted
Test set lifted
Gamma : 0.6, Lambda : 0.01, Score: 66.4
Gamma : 0.6, Lambda : 0.06210526315789474, Score: 64.60000000000001
Gamma : 0.6, Lambda : 0.11421052631578947, Score: 65.0
Gamma : 0.6, Lambda : 0.16631578947368422, Score: 64.2
Gamma : 0.6, Lambda : 0.21842105263157896, Score: 63.2
Gamma : 0.6, Lambda : 0.2705263157894737, Score: 63.6
Gamma : 0.6, Lambda : 0.32263157894736844, Score: 64.8
Gamma : 0.6, Lambda : 0.37473684210526315, Score: 65.0
Gamma : 0.6, Lambda : 0.4268421052631579, Score: 64.4
Gamma : 0.6, Lambda : 0.4789473684210527, Score: 63.6
Gamma : 0.6, Lambda : 0.5310526315789474, Score: 63.0
Gamma : 0.6, Lambda : 0.5831578947368421, Score: 63.6
Gamma : 0.6, Lambda : 0.6352631578947369, Score: 64.2
Gamma : 0.6, Lambda : 0.6873684210526316, Score: 64.0
Gamma : 0.6, Lambda : 0.7394736842105263, Score: 64.4
Gamma : 0.6, Lambda : 0.791578947368421, Score: 64.4
Gamma : 0.6, Lambda : 0.8436842105263158, Score: 64.0
Gamma : 0.6, Lambda : 0.8957894736842106, Sco

Gamma : 0.8, Lambda : 0.21842105263157896, Score: 68.4
Gamma : 0.8, Lambda : 0.2705263157894737, Score: 67.80000000000001
Gamma : 0.8, Lambda : 0.32263157894736844, Score: 68.4
Gamma : 0.8, Lambda : 0.37473684210526315, Score: 68.2
Gamma : 0.8, Lambda : 0.4268421052631579, Score: 67.60000000000001
Gamma : 0.8, Lambda : 0.4789473684210527, Score: 68.0
Gamma : 0.8, Lambda : 0.5310526315789474, Score: 68.0
Gamma : 0.8, Lambda : 0.5831578947368421, Score: 68.0
Gamma : 0.8, Lambda : 0.6352631578947369, Score: 68.0
Gamma : 0.8, Lambda : 0.6873684210526316, Score: 68.0
Gamma : 0.8, Lambda : 0.7394736842105263, Score: 67.60000000000001
Gamma : 0.8, Lambda : 0.791578947368421, Score: 67.4
Gamma : 0.8, Lambda : 0.8436842105263158, Score: 67.0
Gamma : 0.8, Lambda : 0.8957894736842106, Score: 66.2
Gamma : 0.8, Lambda : 0.9478947368421053, Score: 66.2
Gamma : 0.8, Lambda : 1.0, Score: 66.4
Train set lifted
Test set lifted
Gamma : 1.0, Lambda : 0.01, Score: 65.4
Gamma : 1.0, Lambda : 0.0621052631578

### k-spectrum kernel

In [4]:
# allow mismatch
kernel_parameters = {'k':7, 'm':1}

lambda_reg = 0.12
svm = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, 
                kernel_parameters = kernel_parameters, data_type='string', threshold=0)

# train svm
svm.fit(X_train, y_train)

y_pred = svm.pred(X_test)

print('Score : ')
score = sum(y_test == y_pred)/len(y_test)*100
print("lambda :", lambda_reg, " ", "score:",score)

Building kernel matrix:   1%|          | 14/1500 [00:02<04:13,  5.87it/s]


KeyboardInterrupt: 

In [None]:
#Grid search for parameters

kernel_parameters_list = [{'k':3, 'm':1},{'k':5, 'm':1},{'k':6, 'm':1}]
best_score = 0
best_param = None

for kernel_parameters in kernel_parameters_list:
    
    svm = KernelSVM(lambda_reg = 1, kernel = spectrum_kernel, 
                kernel_parameters = kernel_parameters, data_type='string')

    for lambda_reg in np.linspace(0.08,0.16, 3):
        
        svm.lambda_reg = lambda_reg
        
        # train svm
        svm.fit(X_train, y_train)

        y_pred = svm.pred(X_test)

        #print('Score : ')
        #sum(y_test == y_pred)/len(y_test)
        score = sum(y_test == y_pred)/len(y_test)
        print(kernel_parameters, lambda_reg, score)
        if score > best_score:
            best_score=score
            best_param = lambda_reg, kernel_parameters
            
print(best_score)
print(best_param)

Building kernel matrix:  67%|██████▋   | 671/1000 [02:01<00:59,  5.50it/s]

# Submission

In [19]:
y_pred = np.zeros((3000,))

lambda_opt=[0.12,0.035,0.066]
kernel_parameters_opt=[{'k':6}, {'k':6}, {'k':4}]

for dataset_index in range(3):
    # load DNA strings 
    data = []
    f = open('data/train/Xtr{}.csv'.format(dataset_index), 'r')
    data += [line.strip('\n') for line in f.readlines()]
    f.close()

    # load target vector 
    y = (load_y("data/train/Ytr{}.csv".format(dataset_index)))
    y = 2*y - 1 # transform y to lie in {-1, 1} instead of {0, 1}
    y = np.reshape(y.astype(float), -1) # 

    X_train, y_train, X_test, y_test = data, y, data[1500:], y[1500:]
    
    lambda_reg=lambda_opt[dataset_index]
    kernel_parameters=kernel_parameters_opt[dataset_index]

    SVM = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, 
                    kernel_parameters = kernel_parameters, data_type='string')
    
    SVM.fit(X_train, y_train)
    #y_loc = SVM.pred(X_test)
    #print(sum(y_test == y_loc)/len(y_test))
    
    #SVM.load('SVM_opt{}.npy'.format(i), 'Xtrain_{}.npy'.format(i) )
    
    
    f_ = open('data/test/Xte{}.csv'.format(dataset_index), 'r')
    X = [line.strip('\n') for line in f_.readlines()]
    f_.close()

    y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(SVM.pred(X), (1000,))

Building kernel matrix: 100%|██████████| 2000/2000 [00:31<00:00, 62.81it/s] 


     pcost       dcost       gap    pres   dres
 0: -5.0459e+01 -4.5490e+01  5e+03  7e+01  1e-14
 1: -4.5703e+01 -3.4125e+01  6e+02  8e+00  1e-14
 2: -2.2657e+01 -1.6444e+01  2e+02  2e+00  6e-15
 3: -9.5445e+00 -1.1559e+01  3e+01  3e-01  2e-15
 4: -5.7943e+00 -9.2809e+00  4e+00  4e-03  9e-16
 5: -6.1193e+00 -6.7273e+00  6e-01  6e-04  6e-16
 6: -6.2542e+00 -6.3671e+00  1e-01  5e-05  6e-16
 7: -6.2928e+00 -6.3026e+00  1e-02  3e-06  6e-16
 8: -6.2967e+00 -6.2971e+00  3e-04  8e-08  7e-16
 9: -6.2969e+00 -6.2969e+00  1e-05  2e-09  6e-16
10: -6.2969e+00 -6.2969e+00  3e-07  2e-11  7e-16
Optimal solution found.
[   0    1    2 ... 1996 1997 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:48,  9.18it/s]

Numbers of support vectors : 1915


Predicting values: 100%|██████████| 1000/1000 [01:51<00:00,  9.01it/s]
Building kernel matrix: 100%|██████████| 2000/2000 [00:30<00:00, 65.91it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.8829e+01 -4.0671e+01  5e+03  7e+01  6e-15
 1: -2.7091e+01 -3.5225e+01  5e+02  6e+00  6e-15
 2: -1.5188e+01 -2.6530e+01  1e+02  1e+00  4e-15
 3: -9.3484e+00 -2.4162e+01  4e+01  3e-01  2e-15
 4: -8.1452e+00 -1.5392e+01  9e+00  2e-02  1e-15
 5: -8.7394e+00 -9.9694e+00  1e+00  4e-03  1e-15
 6: -9.0512e+00 -9.2459e+00  2e-01  4e-04  1e-15
 7: -9.1109e+00 -9.1304e+00  2e-02  3e-05  1e-15
 8: -9.1175e+00 -9.1191e+00  2e-03  1e-06  1e-15
 9: -9.1181e+00 -9.1181e+00  6e-05  3e-08  1e-15
10: -9.1181e+00 -9.1181e+00  2e-06  4e-10  1e-15
Optimal solution found.
[   0    2    5 ... 1997 1998 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:44,  9.59it/s]

Numbers of support vectors : 1207


Predicting values: 100%|██████████| 1000/1000 [01:48<00:00,  9.22it/s]
Building kernel matrix: 100%|██████████| 2000/2000 [00:40<00:00, 49.08it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.9430e+03 -3.5752e+01  4e+04  2e+02  2e-12
 1: -1.0559e+02 -3.3006e+01  1e+03  6e+00  2e-12
 2: -1.2367e+01 -2.9323e+01  6e+01  2e-01  1e-13
 3: -9.4820e+00 -1.7739e+01  8e+00  4e-17  8e-15
 4: -1.0591e+01 -1.2302e+01  2e+00  3e-17  7e-15
 5: -1.1076e+01 -1.1465e+01  4e-01  3e-17  7e-15
 6: -1.1192e+01 -1.1290e+01  1e-01  3e-17  7e-15
 7: -1.1225e+01 -1.1245e+01  2e-02  3e-17  8e-15
 8: -1.1232e+01 -1.1235e+01  3e-03  3e-17  8e-15
 9: -1.1234e+01 -1.1234e+01  1e-04  3e-17  8e-15
10: -1.1234e+01 -1.1234e+01  2e-06  3e-17  8e-15
Optimal solution found.
[   0    1    2 ... 1996 1998 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:50,  9.08it/s]

Numbers of support vectors : 1636


Predicting values: 100%|██████████| 1000/1000 [01:51<00:00,  8.97it/s]


In [20]:
y_pred = (y_pred+1)/2 # to have y in 0 and 1
df = pd.DataFrame(y_pred, columns=['Bound'])
df.index.name = 'Id'
df.Bound = df.Bound.astype(int)
df.to_csv('submit_full.csv')