# Raw DNA strings

In this notebook we train SVM on string data (DNA strings).

In [1]:
from kernel import spectrum_kernel, build_spectrum_kernel_matrix

from KernelSVM import KernelSVM

import numpy as np

from processing import train_test_split, load_y

import pandas as pd

# Load and pre-process data

In [2]:
data = []

dataset_index = '0'

# load DNA strings 
f = open('data/train/Xtr{}.csv'.format(dataset_index), 'r')
data += [line.strip('\n') for line in f.readlines()]
f.close()

# load target vector 
y = (load_y("data/train/Ytr{}.csv".format(dataset_index)))
y = 2*y - 1 # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1) # 

print('First DNA string : ')
data[0]

First DNA string : 


'TCCTCAACTTTTATTGGGCCGCTGTGGCACCAGAATCTACGAATGGCGCCCTCTAGAGTTGTGTAAAGAAGTGGCGTCACCTCATTATAAATAAAAGGTTG'

In [3]:
X_train, y_train, X_test, y_test = data[:100], y[:100], data[100:200], y[100:200]
#X_train, y_train, X_test, y_test = X_train[:75], y_train[:75], X_test[:25], y_test[:25]

# Train

In [5]:
# allow mismatch
kernel_parameters = {'k':5, 'm':1}

lambda_reg = 0.12
svm = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, 
                kernel_parameters = kernel_parameters, data_type='string', threshold=0)

# train svm
svm.fit(X_train, y_train)

y_pred = svm.pred(X_test)

print('Score : ')
score = sum(y_test == y_pred)/len(y_test)
print("lambda :", lambda_reg, " ", "score:",score)

Building kernel matrix: 100%|██████████| 100/100 [00:08<00:00, 11.21it/s]


(array([ 3.36922963e+04,  3.40331543e+03,  2.69886866e+03,  1.87424163e+03,
        1.28179657e+03,  1.14134353e+03,  1.12283431e+03,  1.06235988e+03,
        9.97391362e+02,  9.47823985e+02,  8.87249590e+02,  8.14851518e+02,
        7.39511446e+02,  7.03675541e+02,  6.77087944e+02,  6.30053355e+02,
        5.97531870e+02,  5.73425967e+02,  5.45092027e+02,  5.06285844e+02,
        4.85677323e+02, -3.34733315e+02,  4.51283473e+02,  4.12485205e+02,
        4.05545912e+02, -3.22925539e+02,  3.79803154e+02,  3.77850310e+02,
       -3.08883390e+02, -2.82099422e+02,  3.43073696e+02,  3.29683932e+02,
       -2.52832911e+02,  3.08468588e+02,  3.04296286e+02, -2.43750893e+02,
       -2.38403437e+02,  2.86470511e+02,  2.59808820e+02, -2.18158053e+02,
       -2.16011628e+02, -2.05838215e+02,  2.57188253e+02,  2.39219576e+02,
       -1.97539451e+02, -1.86139253e+02, -1.72301936e+02,  2.19249960e+02,
        2.11752792e+02,  2.09635674e+02,  1.93570881e+02, -1.65318460e+02,
        1.83712372e+02, 

ValueError: Rank(A) < p or Rank([P; A; G]) < n

In [18]:
# allow mismatch
kernel_parameters = {'k':6}
#kernel_parameters = {'k':5}
for lambda_reg in[0.12]:
    #lambda_reg = 0.12
    svm = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, 
                    kernel_parameters = kernel_parameters, data_type='string', threshold=0)

    # train svm
    svm.fit(X_train, y_train)

    y_pred = svm.pred(X_test)

    #print('Score : ')
    score = sum(y_test == y_pred)/len(y_test)
    print("lambda :", lambda_reg, " ", "score:",score)

Building kernel matrix: 100%|██████████| 2000/2000 [00:31<00:00, 63.29it/s] 


     pcost       dcost       gap    pres   dres
 0: -5.0459e+01 -4.5490e+01  5e+03  7e+01  1e-14
 1: -4.5703e+01 -3.4125e+01  6e+02  8e+00  1e-14
 2: -2.2657e+01 -1.6444e+01  2e+02  2e+00  6e-15
 3: -9.5445e+00 -1.1559e+01  3e+01  3e-01  2e-15
 4: -5.7943e+00 -9.2809e+00  4e+00  4e-03  9e-16
 5: -6.1193e+00 -6.7273e+00  6e-01  6e-04  6e-16
 6: -6.2542e+00 -6.3671e+00  1e-01  5e-05  6e-16
 7: -6.2928e+00 -6.3026e+00  1e-02  3e-06  6e-16
 8: -6.2967e+00 -6.2971e+00  3e-04  8e-08  7e-16
 9: -6.2969e+00 -6.2969e+00  1e-05  2e-09  6e-16
10: -6.2969e+00 -6.2969e+00  3e-07  2e-11  7e-16
Optimal solution found.
[   0    1    2 ... 1997 1998 1999]


Predicting values:   0%|          | 1/500 [00:00<00:53,  9.39it/s]

Numbers of support vectors : 2000


Predicting values: 100%|██████████| 500/500 [00:56<00:00,  8.81it/s]

lambda : 0.12   score: 0.9





In [14]:
K_good = svm.K
K_good

array([[105.,  10.,  14., ...,   5.,   8.,   8.],
       [ 10., 113.,  15., ...,   6.,   3.,  18.],
       [ 14.,  15., 115., ...,  16.,   5.,  20.],
       ...,
       [  5.,   6.,  16., ..., 111.,  11.,  18.],
       [  8.,   3.,   5., ...,  11., 113.,  15.],
       [  8.,  18.,  20., ...,  18.,  15., 103.]])

In [7]:
K_bad = svm.K
K_bad

array([[105.,  10.,  14., ...,   5.,   8.,   8.],
       [ 10., 113.,  15., ...,   6.,   3.,  18.],
       [ 14.,  15., 115., ...,  16.,   5.,  20.],
       ...,
       [  5.,   6.,  16., ..., 111.,  11.,  18.],
       [  8.,   3.,   5., ...,  11., 113.,  15.],
       [  8.,  18.,  20., ...,  18.,  15., 103.]])

In [5]:
%svm.fit(X_train, y_train)

ERROR:root:Line magic function `%svm.fit` not found.


In [81]:
#Grid search for parameters

kernel_parameters_list = [{'k':3},{'k':5},{'k':6}]
#kernel_parameters_list = [{'k':6}]
best_score = 0
best_param = None

for kernel_parameters in kernel_parameters_list:
    for lambda_reg in np.linspace(0.08,0.16,5):
        

        svm = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, 
                        kernel_parameters = kernel_parameters, data_type='string')

        # train svm
        svm.fit(X_train, y_train)

        y_pred = svm.pred(X_test)

        #print('Score : ')
        #sum(y_test == y_pred)/len(y_test)
        score = sum(y_test == y_pred)/len(y_test)
        if score > best_score:
            best_score=score
            best_param = lambda_reg, kernel_parameters

Building kernel matrix: 100%|██████████| 1500/1500 [00:19<00:00, 75.99it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5774e+01 -3.0306e+01  4e+03  6e+01  5e-15
 1: -2.5149e+01 -2.6832e+01  3e+02  5e+00  5e-15
 2: -1.6123e+01 -1.7304e+01  9e+01  1e+00  4e-15
 3: -8.4431e+00 -1.4228e+01  2e+01  1e-01  2e-15
 4: -7.5947e+00 -1.0041e+01  3e+00  1e-02  8e-16
 5: -7.8488e+00 -8.2831e+00  5e-01  2e-03  8e-16
 6: -7.9639e+00 -8.0170e+00  6e-02  1e-04  9e-16
 7: -7.9828e+00 -7.9870e+00  4e-03  8e-06  8e-16
 8: -7.9845e+00 -7.9846e+00  2e-04  2e-07  9e-16


Predicting values:   0%|          | 1/500 [00:00<00:51,  9.77it/s]

 9: -7.9845e+00 -7.9845e+00  7e-06  3e-09  9e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:40<00:00, 12.46it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:16<00:00, 91.95it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5773e+01 -2.8899e+01  4e+03  6e+01  5e-15
 1: -2.5142e+01 -2.5374e+01  3e+02  5e+00  5e-15
 2: -1.5927e+01 -1.5273e+01  9e+01  1e+00  3e-15
 3: -7.8542e+00 -1.2062e+01  2e+01  1e-01  2e-15
 4: -6.5868e+00 -8.9192e+00  3e+00  1e-02  9e-16
 5: -6.7690e+00 -7.1725e+00  5e-01  2e-03  7e-16
 6: -6.8753e+00 -6.9264e+00  5e-02  1e-04  7e-16
 7: -6.8937e+00 -6.8976e+00  4e-03  9e-06  7e-16
 8: -6.8953e+00 -6.8954e+00  1e-04  2e-07  7e-16


Predicting values:   0%|          | 2/500 [00:00<00:38, 13.06it/s]

 9: -6.8954e+00 -6.8954e+00  5e-06  3e-09  7e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:36<00:00, 13.58it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:15<00:00, 94.68it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5772e+01 -2.7961e+01  4e+03  6e+01  5e-15
 1: -2.5137e+01 -2.4398e+01  3e+02  5e+00  5e-15
 2: -1.5794e+01 -1.3893e+01  9e+01  1e+00  3e-15
 3: -7.4372e+00 -1.0514e+01  2e+01  2e-01  2e-15
 4: -5.8147e+00 -8.0503e+00  3e+00  1e-02  1e-15
 5: -5.9618e+00 -6.3205e+00  4e-01  1e-03  6e-16
 6: -6.0628e+00 -6.1127e+00  5e-02  1e-04  7e-16
 7: -6.0815e+00 -6.0854e+00  4e-03  8e-06  6e-16
 8: -6.0831e+00 -6.0833e+00  2e-04  4e-07  6e-16
 9: -6.0832e+00 -6.0832e+00  8e-06  1e-08  6e-16


Predicting values:   0%|          | 2/500 [00:00<00:39, 12.58it/s]

10: -6.0832e+00 -6.0832e+00  3e-07  1e-10  7e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:36<00:00, 13.60it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:16<00:00, 89.77it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5772e+01 -2.7290e+01  4e+03  6e+01  5e-15
 1: -2.5133e+01 -2.3700e+01  3e+02  5e+00  5e-15
 2: -1.5697e+01 -1.2895e+01  9e+01  1e+00  3e-15
 3: -7.1888e+00 -9.3637e+00  2e+01  2e-01  2e-15
 4: -5.2141e+00 -7.3556e+00  3e+00  1e-02  8e-16
 5: -5.3294e+00 -5.6772e+00  4e-01  1e-03  5e-16
 6: -5.4253e+00 -5.4824e+00  6e-02  1e-04  5e-16
 7: -5.4469e+00 -5.4500e+00  3e-03  5e-06  6e-16
 8: -5.4483e+00 -5.4484e+00  1e-04  1e-07  6e-16


Predicting values:   0%|          | 2/500 [00:00<00:39, 12.52it/s]

 9: -5.4483e+00 -5.4483e+00  4e-06  2e-09  6e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:37<00:00, 13.50it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:16<00:00, 92.14it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5772e+01 -2.6786e+01  4e+03  6e+01  5e-15
 1: -2.5130e+01 -2.3175e+01  3e+02  5e+00  5e-15
 2: -1.5624e+01 -1.2139e+01  9e+01  1e+00  3e-15
 3: -7.0061e+00 -8.4734e+00  2e+01  2e-01  2e-15
 4: -4.7324e+00 -6.7574e+00  3e+00  1e-02  9e-16
 5: -4.8227e+00 -5.1225e+00  3e-01  9e-04  5e-16
 6: -4.9137e+00 -4.9606e+00  5e-02  1e-04  5e-16
 7: -4.9320e+00 -4.9356e+00  4e-03  6e-06  5e-16
 8: -4.9336e+00 -4.9337e+00  1e-04  2e-07  5e-16


Predicting values:   0%|          | 2/500 [00:00<00:39, 12.63it/s]

 9: -4.9336e+00 -4.9336e+00  2e-06  2e-09  5e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:36<00:00, 13.56it/s]


In [82]:
print(best_score)
print(best_param)

0.776
(0.12, {'k': 6})


# Submission

In [19]:
y_pred = np.zeros((3000,))

lambda_opt=[0.12,0.035,0.066]
kernel_parameters_opt=[{'k':6}, {'k':6}, {'k':4}]

for dataset_index in range(3):
    # load DNA strings 
    data = []
    f = open('data/train/Xtr{}.csv'.format(dataset_index), 'r')
    data += [line.strip('\n') for line in f.readlines()]
    f.close()

    # load target vector 
    y = (load_y("data/train/Ytr{}.csv".format(dataset_index)))
    y = 2*y - 1 # transform y to lie in {-1, 1} instead of {0, 1}
    y = np.reshape(y.astype(float), -1) # 

    X_train, y_train, X_test, y_test = data, y, data[1500:], y[1500:]
    
    lambda_reg=lambda_opt[dataset_index]
    kernel_parameters=kernel_parameters_opt[dataset_index]

    SVM = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel, 
                    kernel_parameters = kernel_parameters, data_type='string')
    
    SVM.fit(X_train, y_train)
    #y_loc = SVM.pred(X_test)
    #print(sum(y_test == y_loc)/len(y_test))
    
    #SVM.load('SVM_opt{}.npy'.format(i), 'Xtrain_{}.npy'.format(i) )
    
    
    f_ = open('data/test/Xte{}.csv'.format(dataset_index), 'r')
    X = [line.strip('\n') for line in f_.readlines()]
    f_.close()

    y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(SVM.pred(X), (1000,))

Building kernel matrix: 100%|██████████| 2000/2000 [00:31<00:00, 62.81it/s] 


     pcost       dcost       gap    pres   dres
 0: -5.0459e+01 -4.5490e+01  5e+03  7e+01  1e-14
 1: -4.5703e+01 -3.4125e+01  6e+02  8e+00  1e-14
 2: -2.2657e+01 -1.6444e+01  2e+02  2e+00  6e-15
 3: -9.5445e+00 -1.1559e+01  3e+01  3e-01  2e-15
 4: -5.7943e+00 -9.2809e+00  4e+00  4e-03  9e-16
 5: -6.1193e+00 -6.7273e+00  6e-01  6e-04  6e-16
 6: -6.2542e+00 -6.3671e+00  1e-01  5e-05  6e-16
 7: -6.2928e+00 -6.3026e+00  1e-02  3e-06  6e-16
 8: -6.2967e+00 -6.2971e+00  3e-04  8e-08  7e-16
 9: -6.2969e+00 -6.2969e+00  1e-05  2e-09  6e-16
10: -6.2969e+00 -6.2969e+00  3e-07  2e-11  7e-16
Optimal solution found.
[   0    1    2 ... 1996 1997 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:48,  9.18it/s]

Numbers of support vectors : 1915


Predicting values: 100%|██████████| 1000/1000 [01:51<00:00,  9.01it/s]
Building kernel matrix: 100%|██████████| 2000/2000 [00:30<00:00, 65.91it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.8829e+01 -4.0671e+01  5e+03  7e+01  6e-15
 1: -2.7091e+01 -3.5225e+01  5e+02  6e+00  6e-15
 2: -1.5188e+01 -2.6530e+01  1e+02  1e+00  4e-15
 3: -9.3484e+00 -2.4162e+01  4e+01  3e-01  2e-15
 4: -8.1452e+00 -1.5392e+01  9e+00  2e-02  1e-15
 5: -8.7394e+00 -9.9694e+00  1e+00  4e-03  1e-15
 6: -9.0512e+00 -9.2459e+00  2e-01  4e-04  1e-15
 7: -9.1109e+00 -9.1304e+00  2e-02  3e-05  1e-15
 8: -9.1175e+00 -9.1191e+00  2e-03  1e-06  1e-15
 9: -9.1181e+00 -9.1181e+00  6e-05  3e-08  1e-15
10: -9.1181e+00 -9.1181e+00  2e-06  4e-10  1e-15
Optimal solution found.
[   0    2    5 ... 1997 1998 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:44,  9.59it/s]

Numbers of support vectors : 1207


Predicting values: 100%|██████████| 1000/1000 [01:48<00:00,  9.22it/s]
Building kernel matrix: 100%|██████████| 2000/2000 [00:40<00:00, 49.08it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.9430e+03 -3.5752e+01  4e+04  2e+02  2e-12
 1: -1.0559e+02 -3.3006e+01  1e+03  6e+00  2e-12
 2: -1.2367e+01 -2.9323e+01  6e+01  2e-01  1e-13
 3: -9.4820e+00 -1.7739e+01  8e+00  4e-17  8e-15
 4: -1.0591e+01 -1.2302e+01  2e+00  3e-17  7e-15
 5: -1.1076e+01 -1.1465e+01  4e-01  3e-17  7e-15
 6: -1.1192e+01 -1.1290e+01  1e-01  3e-17  7e-15
 7: -1.1225e+01 -1.1245e+01  2e-02  3e-17  8e-15
 8: -1.1232e+01 -1.1235e+01  3e-03  3e-17  8e-15
 9: -1.1234e+01 -1.1234e+01  1e-04  3e-17  8e-15
10: -1.1234e+01 -1.1234e+01  2e-06  3e-17  8e-15
Optimal solution found.
[   0    1    2 ... 1996 1998 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:50,  9.08it/s]

Numbers of support vectors : 1636


Predicting values: 100%|██████████| 1000/1000 [01:51<00:00,  8.97it/s]


In [20]:
y_pred = (y_pred+1)/2 # to have y in 0 and 1
df = pd.DataFrame(y_pred, columns=['Bound'])
df.index.name = 'Id'
df.Bound = df.Bound.astype(int)
df.to_csv('submit_full.csv')