# Raw DNA strings

In this notebook we train SVM on string data (DNA strings).

In [1]:
from kernel import spectrum_kernel, build_spectrum_kernel_matrix
from kernel import *

from KernelSVM import KernelSVM

import numpy as np

from processing import train_test_split, load_y

import pandas as pd
from sklearn import svm

# Load and pre-process data

In [2]:
data = []

dataset_index = '1'

# load DNA strings 
f = open('data/train/Xtr{}.csv'.format(dataset_index), 'r')
data += [line.strip('\n') for line in f.readlines()]
f.close()

# load target vector 
y = (load_y("data/train/Ytr{}.csv".format(dataset_index)))
y = 2*y - 1 # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1) # 

print('First DNA string : ')
data[0]

First DNA string : 


'CGGGCCTCCTCCAGGCTCAGAATCGACCCCCCCCCATCCTGATAGACCACAACGAAAGCCGTAGCGACGGCCGCAGGAGCTGGCGCGACAGCCCAGAGCTC'

In [3]:
X_train, y_train, X_test, y_test = data[:1500], y[:1500], data[1500:], y[1500:]
#X_train, y_train, X_test, y_test = X_train[:75], y_train[:75], X_test[:25], y_test[:25]

# Train

In [4]:
# allow mismatch
kernel_parameters = {'k':7, 'm':1}

lambda_reg = 0.12
svm = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, 
                kernel_parameters = kernel_parameters, data_type='string', threshold=0)

# train svm
svm.fit(X_train, y_train)

y_pred = svm.pred(X_test)

print('Score : ')
score = sum(y_test == y_pred)/len(y_test)*100
print("lambda :", lambda_reg, " ", "score:",score)

Building kernel matrix:   2%|▏         | 23/1500 [00:02<02:49,  8.69it/s]


KeyboardInterrupt: 

In [5]:
#### TEST of the KITCHENS SINKs
alpha_size=4

k = 6
M = 4*2048
gamma = 10 #spectrum regime
""" 
X_lift_train = phi_sink(X_train, k, gamma, M)
print("Train set lifted")
X_lift_test = phi_sink(X_test, k, gamma, M)
print("Test set lifted")
"""
for gamma in [0.4,0.5,]:
    print("$\gamma$", " = ", gamma)
    W = gamma*np.random.randn(M,4*k)
    b = (2*np.pi)*np.random.rand(M)

    X_train_ = sequence_to_matrix(X_train)
    X_test_ = sequence_to_matrix(X_test)

    X_lift_train = compute_conv_features(X_train_, W, b, alpha_size = alpha_size)
    print("train set lifted")
    X_lift_test = compute_conv_features(X_test_, W, b, alpha_size = alpha_size)
    print("test set lifted")


    kernel_parameters = {'k':k}
    #K_true = build_spectrum_kernel_matrix(X_train, kernel_parameters)
    #K_approx = np.dot(X_lift_train, X_lift_train.T)

    #print(X_lift_train[12])
    #print(X_lift_train[25]-X_lift_train[15])

    for C in [0.005, 0.01, 0.03, 0.05]:
        clf = svm.SVC(C = C, kernel = 'linear')
        clf.fit(X_lift_train, y_train)
        print('    C :',clf.C)
        y_pred_train = clf.predict(X_lift_train)
        print('    Score training: ', sum(y_train == y_pred_train)/len(y_train)*100)
        y_pred_test = clf.predict(X_lift_test)
        print('    Score test : ', sum(y_test == y_pred_test)/len(y_test)*100)

""" 
for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    linear = lambda x,y : np.dot(x,y)
    svm = KernelSVM(lambda_reg = C, kernel = linear, 
                kernel_parameters = {}, data_type='vector', threshold=0)
    svm.fit(X_lift_train, y_train)
    print('lambda:', C)
    y_pred_train = svm.pred(X_lift_train)
    print('Score training: ', sum(y_train == y_pred_train)/len(y_train)*100)
    y_pred_test = svm.pred(X_lift_test)
    print('Score test : ', sum(y_test == y_pred_test)/len(y_test)*100)
"""    
    

$\gamma$  =  0.1
train set lifted
test set lifted
    C : 0.005
    Score training:  64.46666666666667
    Score test :  62.4
    C : 0.01
    Score training:  64.33333333333333
    Score test :  62.6
    C : 0.03
    Score training:  65.0
    Score test :  64.8
    C : 0.05
    Score training:  66.2
    Score test :  65.4
$\gamma$  =  0.5
train set lifted
test set lifted
    C : 0.005
    Score training:  83.73333333333333
    Score test :  80.60000000000001
    C : 0.01
    Score training:  86.2
    Score test :  81.6
    C : 0.03
    Score training:  89.26666666666667
    Score test :  84.2
    C : 0.05
    Score training:  91.0
    Score test :  84.0
$\gamma$  =  1
train set lifted
test set lifted
    C : 0.005
    Score training:  93.13333333333334
    Score test :  87.0
    C : 0.01
    Score training:  95.66666666666667
    Score test :  87.6
    C : 0.03
    Score training:  98.53333333333333
    Score test :  86.0
    C : 0.05
    Score training:  99.46666666666667
    Score t

" \nfor C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:\n    linear = lambda x,y : np.dot(x,y)\n    svm = KernelSVM(lambda_reg = C, kernel = linear, \n                kernel_parameters = {}, data_type='vector', threshold=0)\n    svm.fit(X_lift_train, y_train)\n    print('lambda:', C)\n    y_pred_train = svm.pred(X_lift_train)\n    print('Score training: ', sum(y_train == y_pred_train)/len(y_train)*100)\n    y_pred_test = svm.pred(X_lift_test)\n    print('Score test : ', sum(y_test == y_pred_test)/len(y_test)*100)\n"

In [18]:
# allow mismatch
kernel_parameters = {'k':6}
#kernel_parameters = {'k':5}
for lambda_reg in[0.12]:
    #lambda_reg = 0.12
    svm = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, 
                    kernel_parameters = kernel_parameters, data_type='string', threshold=0)

    # train svm
    svm.fit(X_train, y_train)

    y_pred = svm.pred(X_test)

    #print('Score : ')
    score = sum(y_test == y_pred)/len(y_test)
    print("lambda :", lambda_reg, " ", "score:",score)

Building kernel matrix: 100%|██████████| 2000/2000 [00:31<00:00, 63.29it/s] 


     pcost       dcost       gap    pres   dres
 0: -5.0459e+01 -4.5490e+01  5e+03  7e+01  1e-14
 1: -4.5703e+01 -3.4125e+01  6e+02  8e+00  1e-14
 2: -2.2657e+01 -1.6444e+01  2e+02  2e+00  6e-15
 3: -9.5445e+00 -1.1559e+01  3e+01  3e-01  2e-15
 4: -5.7943e+00 -9.2809e+00  4e+00  4e-03  9e-16
 5: -6.1193e+00 -6.7273e+00  6e-01  6e-04  6e-16
 6: -6.2542e+00 -6.3671e+00  1e-01  5e-05  6e-16
 7: -6.2928e+00 -6.3026e+00  1e-02  3e-06  6e-16
 8: -6.2967e+00 -6.2971e+00  3e-04  8e-08  7e-16
 9: -6.2969e+00 -6.2969e+00  1e-05  2e-09  6e-16
10: -6.2969e+00 -6.2969e+00  3e-07  2e-11  7e-16
Optimal solution found.
[   0    1    2 ... 1997 1998 1999]


Predicting values:   0%|          | 1/500 [00:00<00:53,  9.39it/s]

Numbers of support vectors : 2000


Predicting values: 100%|██████████| 500/500 [00:56<00:00,  8.81it/s]

lambda : 0.12   score: 0.9





In [14]:
K_good = svm.K
K_good

array([[105.,  10.,  14., ...,   5.,   8.,   8.],
       [ 10., 113.,  15., ...,   6.,   3.,  18.],
       [ 14.,  15., 115., ...,  16.,   5.,  20.],
       ...,
       [  5.,   6.,  16., ..., 111.,  11.,  18.],
       [  8.,   3.,   5., ...,  11., 113.,  15.],
       [  8.,  18.,  20., ...,  18.,  15., 103.]])

In [7]:
K_bad = svm.K
K_bad

array([[105.,  10.,  14., ...,   5.,   8.,   8.],
       [ 10., 113.,  15., ...,   6.,   3.,  18.],
       [ 14.,  15., 115., ...,  16.,   5.,  20.],
       ...,
       [  5.,   6.,  16., ..., 111.,  11.,  18.],
       [  8.,   3.,   5., ...,  11., 113.,  15.],
       [  8.,  18.,  20., ...,  18.,  15., 103.]])

In [5]:
%svm.fit(X_train, y_train)

ERROR:root:Line magic function `%svm.fit` not found.


In [None]:
#Grid search for parameters

kernel_parameters_list = [{'k':3, 'm':1},{'k':5, 'm':1},{'k':6, 'm':1}]
best_score = 0
best_param = None

for kernel_parameters in kernel_parameters_list:
    
    svm = KernelSVM(lambda_reg = 1, kernel = spectrum_kernel, 
                kernel_parameters = kernel_parameters, data_type='string')

    for lambda_reg in np.linspace(0.08,0.16, 3):
        
        svm.lambda_reg = lambda_reg
        
        # train svm
        svm.fit(X_train, y_train)

        y_pred = svm.pred(X_test)

        #print('Score : ')
        #sum(y_test == y_pred)/len(y_test)
        score = sum(y_test == y_pred)/len(y_test)
        print(kernel_parameters, lambda_reg, score)
        if score > best_score:
            best_score=score
            best_param = lambda_reg, kernel_parameters

Building kernel matrix:  67%|██████▋   | 671/1000 [02:01<00:59,  5.50it/s]

In [82]:
print(best_score)
print(best_param)

0.776
(0.12, {'k': 6})


# Submission

In [19]:
y_pred = np.zeros((3000,))

lambda_opt=[0.12,0.035,0.066]
kernel_parameters_opt=[{'k':6}, {'k':6}, {'k':4}]

for dataset_index in range(3):
    # load DNA strings 
    data = []
    f = open('data/train/Xtr{}.csv'.format(dataset_index), 'r')
    data += [line.strip('\n') for line in f.readlines()]
    f.close()

    # load target vector 
    y = (load_y("data/train/Ytr{}.csv".format(dataset_index)))
    y = 2*y - 1 # transform y to lie in {-1, 1} instead of {0, 1}
    y = np.reshape(y.astype(float), -1) # 

    X_train, y_train, X_test, y_test = data, y, data[1500:], y[1500:]
    
    lambda_reg=lambda_opt[dataset_index]
    kernel_parameters=kernel_parameters_opt[dataset_index]

    SVM = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, 
                    kernel_parameters = kernel_parameters, data_type='string')
    
    SVM.fit(X_train, y_train)
    #y_loc = SVM.pred(X_test)
    #print(sum(y_test == y_loc)/len(y_test))
    
    #SVM.load('SVM_opt{}.npy'.format(i), 'Xtrain_{}.npy'.format(i) )
    
    
    f_ = open('data/test/Xte{}.csv'.format(dataset_index), 'r')
    X = [line.strip('\n') for line in f_.readlines()]
    f_.close()

    y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(SVM.pred(X), (1000,))

Building kernel matrix: 100%|██████████| 2000/2000 [00:31<00:00, 62.81it/s] 


     pcost       dcost       gap    pres   dres
 0: -5.0459e+01 -4.5490e+01  5e+03  7e+01  1e-14
 1: -4.5703e+01 -3.4125e+01  6e+02  8e+00  1e-14
 2: -2.2657e+01 -1.6444e+01  2e+02  2e+00  6e-15
 3: -9.5445e+00 -1.1559e+01  3e+01  3e-01  2e-15
 4: -5.7943e+00 -9.2809e+00  4e+00  4e-03  9e-16
 5: -6.1193e+00 -6.7273e+00  6e-01  6e-04  6e-16
 6: -6.2542e+00 -6.3671e+00  1e-01  5e-05  6e-16
 7: -6.2928e+00 -6.3026e+00  1e-02  3e-06  6e-16
 8: -6.2967e+00 -6.2971e+00  3e-04  8e-08  7e-16
 9: -6.2969e+00 -6.2969e+00  1e-05  2e-09  6e-16
10: -6.2969e+00 -6.2969e+00  3e-07  2e-11  7e-16
Optimal solution found.
[   0    1    2 ... 1996 1997 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:48,  9.18it/s]

Numbers of support vectors : 1915


Predicting values: 100%|██████████| 1000/1000 [01:51<00:00,  9.01it/s]
Building kernel matrix: 100%|██████████| 2000/2000 [00:30<00:00, 65.91it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.8829e+01 -4.0671e+01  5e+03  7e+01  6e-15
 1: -2.7091e+01 -3.5225e+01  5e+02  6e+00  6e-15
 2: -1.5188e+01 -2.6530e+01  1e+02  1e+00  4e-15
 3: -9.3484e+00 -2.4162e+01  4e+01  3e-01  2e-15
 4: -8.1452e+00 -1.5392e+01  9e+00  2e-02  1e-15
 5: -8.7394e+00 -9.9694e+00  1e+00  4e-03  1e-15
 6: -9.0512e+00 -9.2459e+00  2e-01  4e-04  1e-15
 7: -9.1109e+00 -9.1304e+00  2e-02  3e-05  1e-15
 8: -9.1175e+00 -9.1191e+00  2e-03  1e-06  1e-15
 9: -9.1181e+00 -9.1181e+00  6e-05  3e-08  1e-15
10: -9.1181e+00 -9.1181e+00  2e-06  4e-10  1e-15
Optimal solution found.
[   0    2    5 ... 1997 1998 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:44,  9.59it/s]

Numbers of support vectors : 1207


Predicting values: 100%|██████████| 1000/1000 [01:48<00:00,  9.22it/s]
Building kernel matrix: 100%|██████████| 2000/2000 [00:40<00:00, 49.08it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.9430e+03 -3.5752e+01  4e+04  2e+02  2e-12
 1: -1.0559e+02 -3.3006e+01  1e+03  6e+00  2e-12
 2: -1.2367e+01 -2.9323e+01  6e+01  2e-01  1e-13
 3: -9.4820e+00 -1.7739e+01  8e+00  4e-17  8e-15
 4: -1.0591e+01 -1.2302e+01  2e+00  3e-17  7e-15
 5: -1.1076e+01 -1.1465e+01  4e-01  3e-17  7e-15
 6: -1.1192e+01 -1.1290e+01  1e-01  3e-17  7e-15
 7: -1.1225e+01 -1.1245e+01  2e-02  3e-17  8e-15
 8: -1.1232e+01 -1.1235e+01  3e-03  3e-17  8e-15
 9: -1.1234e+01 -1.1234e+01  1e-04  3e-17  8e-15
10: -1.1234e+01 -1.1234e+01  2e-06  3e-17  8e-15
Optimal solution found.
[   0    1    2 ... 1996 1998 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:50,  9.08it/s]

Numbers of support vectors : 1636


Predicting values: 100%|██████████| 1000/1000 [01:51<00:00,  8.97it/s]


In [20]:
y_pred = (y_pred+1)/2 # to have y in 0 and 1
df = pd.DataFrame(y_pred, columns=['Bound'])
df.index.name = 'Id'
df.Bound = df.Bound.astype(int)
df.to_csv('submit_full.csv')