# Raw DNA strings

In this notebook we train a kernel SVM (with a spectrum kernel) directly on raw DNA strings.

In [1]:
from kernel import spectrum_kernel

from KernelSVM import KernelSVM

import numpy as np

from processing import train_test_split, load_y

import pandas as pd

# Load and pre-process data

In [78]:
# Index of the dataset to load; it is substituted into the Xtr/Ytr file names.
dataset_index = '0'

# Load the DNA strings, one per line, stripping only newline characters.
# The context manager guarantees the file is closed even if reading raises.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

print('First DNA string : ')
data[0]

First DNA string : 


'TCCTCAACTTTTATTGGGCCGCTGTGGCACCAGAATCTACGAATGGCGCCCTCTAGAGTTGTGTAAAGAAGTGGCGTCACCTCATTATAAATAAAAGGTTG'

In [80]:
# Hold-out split: the first 1500 samples are used for training,
# everything after that for evaluation.
X_train, X_test = data[:1500], data[1500:]
y_train, y_test = y[:1500], y[1500:]
# Smaller subset for quick experiments (disabled):
# X_train, y_train, X_test, y_test = X_train[:300], y_train[:300], X_test[:100], y_test[:100]

# Train

In [81]:
# Grid search over the spectrum kernel parameter k and the regularisation
# strength lambda_reg. Every combination is fitted on the training split and
# scored by accuracy on the hold-out split; the best pair is kept.
kernel_parameters_list = [{'k': 3}, {'k': 5}, {'k': 6}]
lambda_reg_grid = np.linspace(0.08, 0.16, 5)  # loop-invariant, built once

best_score = 0
best_param = None  # becomes (lambda_reg, kernel_parameters) of the best run

for kernel_parameters in kernel_parameters_list:
    for lambda_reg in lambda_reg_grid:
        svm = KernelSVM(lambda_reg=lambda_reg, kernel=spectrum_kernel,
                        kernel_parameters=kernel_parameters, data_type='string')

        # Train on the training split, predict the hold-out split.
        svm.fit(X_train, y_train)
        y_pred = svm.pred(X_test)

        # Accuracy on the hold-out split. The predictions are flattened first
        # so that a column-vector result cannot broadcast against the 1-D
        # y_test into an (n, n) comparison.
        score = np.mean(np.ravel(y_pred) == y_test)

        # Strict '>' keeps the first combination reaching a given score.
        if score > best_score:
            best_score = score
            best_param = lambda_reg, kernel_parameters

Building kernel matrix: 100%|██████████| 1500/1500 [00:19<00:00, 75.99it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5774e+01 -3.0306e+01  4e+03  6e+01  5e-15
 1: -2.5149e+01 -2.6832e+01  3e+02  5e+00  5e-15
 2: -1.6123e+01 -1.7304e+01  9e+01  1e+00  4e-15
 3: -8.4431e+00 -1.4228e+01  2e+01  1e-01  2e-15
 4: -7.5947e+00 -1.0041e+01  3e+00  1e-02  8e-16
 5: -7.8488e+00 -8.2831e+00  5e-01  2e-03  8e-16
 6: -7.9639e+00 -8.0170e+00  6e-02  1e-04  9e-16
 7: -7.9828e+00 -7.9870e+00  4e-03  8e-06  8e-16
 8: -7.9845e+00 -7.9846e+00  2e-04  2e-07  9e-16


Predicting values:   0%|          | 1/500 [00:00<00:51,  9.77it/s]

 9: -7.9845e+00 -7.9845e+00  7e-06  3e-09  9e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:40<00:00, 12.46it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:16<00:00, 91.95it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5773e+01 -2.8899e+01  4e+03  6e+01  5e-15
 1: -2.5142e+01 -2.5374e+01  3e+02  5e+00  5e-15
 2: -1.5927e+01 -1.5273e+01  9e+01  1e+00  3e-15
 3: -7.8542e+00 -1.2062e+01  2e+01  1e-01  2e-15
 4: -6.5868e+00 -8.9192e+00  3e+00  1e-02  9e-16
 5: -6.7690e+00 -7.1725e+00  5e-01  2e-03  7e-16
 6: -6.8753e+00 -6.9264e+00  5e-02  1e-04  7e-16
 7: -6.8937e+00 -6.8976e+00  4e-03  9e-06  7e-16
 8: -6.8953e+00 -6.8954e+00  1e-04  2e-07  7e-16


Predicting values:   0%|          | 2/500 [00:00<00:38, 13.06it/s]

 9: -6.8954e+00 -6.8954e+00  5e-06  3e-09  7e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:36<00:00, 13.58it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:15<00:00, 94.68it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5772e+01 -2.7961e+01  4e+03  6e+01  5e-15
 1: -2.5137e+01 -2.4398e+01  3e+02  5e+00  5e-15
 2: -1.5794e+01 -1.3893e+01  9e+01  1e+00  3e-15
 3: -7.4372e+00 -1.0514e+01  2e+01  2e-01  2e-15
 4: -5.8147e+00 -8.0503e+00  3e+00  1e-02  1e-15
 5: -5.9618e+00 -6.3205e+00  4e-01  1e-03  6e-16
 6: -6.0628e+00 -6.1127e+00  5e-02  1e-04  7e-16
 7: -6.0815e+00 -6.0854e+00  4e-03  8e-06  6e-16
 8: -6.0831e+00 -6.0833e+00  2e-04  4e-07  6e-16
 9: -6.0832e+00 -6.0832e+00  8e-06  1e-08  6e-16


Predicting values:   0%|          | 2/500 [00:00<00:39, 12.58it/s]

10: -6.0832e+00 -6.0832e+00  3e-07  1e-10  7e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:36<00:00, 13.60it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:16<00:00, 89.77it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5772e+01 -2.7290e+01  4e+03  6e+01  5e-15
 1: -2.5133e+01 -2.3700e+01  3e+02  5e+00  5e-15
 2: -1.5697e+01 -1.2895e+01  9e+01  1e+00  3e-15
 3: -7.1888e+00 -9.3637e+00  2e+01  2e-01  2e-15
 4: -5.2141e+00 -7.3556e+00  3e+00  1e-02  8e-16
 5: -5.3294e+00 -5.6772e+00  4e-01  1e-03  5e-16
 6: -5.4253e+00 -5.4824e+00  6e-02  1e-04  5e-16
 7: -5.4469e+00 -5.4500e+00  3e-03  5e-06  6e-16
 8: -5.4483e+00 -5.4484e+00  1e-04  1e-07  6e-16


Predicting values:   0%|          | 2/500 [00:00<00:39, 12.52it/s]

 9: -5.4483e+00 -5.4483e+00  4e-06  2e-09  6e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:37<00:00, 13.50it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:16<00:00, 92.14it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5772e+01 -2.6786e+01  4e+03  6e+01  5e-15
 1: -2.5130e+01 -2.3175e+01  3e+02  5e+00  5e-15
 2: -1.5624e+01 -1.2139e+01  9e+01  1e+00  3e-15
 3: -7.0061e+00 -8.4734e+00  2e+01  2e-01  2e-15
 4: -4.7324e+00 -6.7574e+00  3e+00  1e-02  9e-16
 5: -4.8227e+00 -5.1225e+00  3e-01  9e-04  5e-16
 6: -4.9137e+00 -4.9606e+00  5e-02  1e-04  5e-16
 7: -4.9320e+00 -4.9356e+00  4e-03  6e-06  5e-16
 8: -4.9336e+00 -4.9337e+00  1e-04  2e-07  5e-16


Predicting values:   0%|          | 2/500 [00:00<00:39, 12.63it/s]

 9: -4.9336e+00 -4.9336e+00  2e-06  2e-09  5e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:36<00:00, 13.56it/s]


In [82]:
# Report the best hold-out accuracy, then the (lambda_reg, kernel_parameters)
# pair that achieved it, one per line.
print(best_score, best_param, sep='\n')


0.776
(0.12, {'k': 6})


# Submission

In [3]:
# Predictions for the three test sets (1000 sequences each), stored back to
# back in a single vector.
y_pred = np.zeros((3000,))

# Per-dataset hyper-parameters, indexed by dataset_index
# (presumably the winners of the grid search run on each dataset -- confirm).
lambda_opt = [0.12, 0.04, 0.066]
kernel_parameters_opt = [{'k': 6}, {'k': 6}, {'k': 4}]

for dataset_index in range(3):
    # Load the training DNA strings, one per line; the context manager
    # closes the file even if reading raises.
    with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
        data = [line.strip('\n') for line in f]

    # Load the target vector.
    y = load_y("data/train/Ytr{}.csv".format(dataset_index))
    y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
    y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

    # Same hold-out split as in the grid search: first 1500 samples train.
    X_train, y_train, X_test, y_test = data[:1500], y[:1500], data[1500:], y[1500:]

    lambda_reg = lambda_opt[dataset_index]
    kernel_parameters = kernel_parameters_opt[dataset_index]

    SVM = KernelSVM(lambda_reg=lambda_reg, kernel=spectrum_kernel,
                    kernel_parameters=kernel_parameters, data_type='string')

    # Fit on the training split and print the hold-out accuracy as a sanity
    # check. Predictions are flattened so the comparison stays element-wise.
    SVM.fit(X_train, y_train)
    y_loc = SVM.pred(X_test)
    print(np.mean(np.ravel(y_loc) == y_test))

    # Load the unlabeled test sequences for this dataset.
    with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
        X = [line.strip('\n') for line in f_]

    # Write this dataset's predictions into its 1000-long slot; the reshape
    # raises if the number of predictions is not exactly 1000.
    y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(SVM.pred(X), (1000,))

Building kernel matrix: 100%|██████████| 1500/1500 [00:16<00:00, 92.03it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.5772e+01 -2.7961e+01  4e+03  6e+01  5e-15
 1: -2.5137e+01 -2.4398e+01  3e+02  5e+00  5e-15
 2: -1.5794e+01 -1.3893e+01  9e+01  1e+00  3e-15
 3: -7.4372e+00 -1.0514e+01  2e+01  2e-01  2e-15
 4: -5.8147e+00 -8.0503e+00  3e+00  1e-02  1e-15
 5: -5.9618e+00 -6.3205e+00  4e-01  1e-03  6e-16
 6: -6.0628e+00 -6.1127e+00  5e-02  1e-04  7e-16
 7: -6.0815e+00 -6.0854e+00  4e-03  8e-06  6e-16
 8: -6.0831e+00 -6.0833e+00  2e-04  4e-07  6e-16
 9: -6.0832e+00 -6.0832e+00  8e-06  1e-08  6e-16


Predicting values:   0%|          | 2/500 [00:00<00:40, 12.26it/s]

10: -6.0832e+00 -6.0832e+00  3e-07  1e-10  7e-16
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:37<00:00, 13.25it/s]
Predicting values:   0%|          | 2/1000 [00:00<01:17, 12.95it/s]

0.776


Predicting values: 100%|██████████| 1000/1000 [01:14<00:00, 13.34it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:15<00:00, 93.92it/s] 


     pcost       dcost       gap    pres   dres
 0: -1.5045e+01 -2.7338e+01  3e+03  6e+01  3e-15
 1: -1.4808e+01 -2.5674e+01  2e+02  3e+00  3e-15
 2: -1.0547e+01 -2.1220e+01  6e+01  8e-01  2e-15
 3: -7.5712e+00 -1.9048e+01  3e+01  2e-01  1e-15
 4: -6.9652e+00 -1.0953e+01  4e+00  8e-03  1e-15
 5: -7.4578e+00 -8.0999e+00  7e-01  1e-03  1e-15
 6: -7.6146e+00 -7.7036e+00  9e-02  1e-04  1e-15
 7: -7.6402e+00 -7.6468e+00  7e-03  5e-06  1e-15
 8: -7.6424e+00 -7.6427e+00  3e-04  1e-07  1e-15
 9: -7.6425e+00 -7.6425e+00  1e-05  2e-09  1e-15


Predicting values:   0%|          | 2/500 [00:00<00:39, 12.61it/s]

10: -7.6425e+00 -7.6425e+00  6e-07  3e-11  1e-15
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:37<00:00, 13.28it/s]
Predicting values:   0%|          | 1/1000 [00:00<01:40,  9.90it/s]

0.876


Predicting values: 100%|██████████| 1000/1000 [01:14<00:00, 13.36it/s]
Building kernel matrix: 100%|██████████| 1500/1500 [00:20<00:00, 72.09it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.1606e+03 -3.6473e+01  3e+04  2e+02  2e-12
 1: -9.6788e+01 -3.2830e+01  1e+03  6e+00  1e-12
 2: -1.1958e+01 -2.9315e+01  5e+01  2e-01  8e-14
 3: -9.4483e+00 -1.7181e+01  8e+00  5e-17  8e-15
 4: -1.0561e+01 -1.1998e+01  1e+00  3e-17  7e-15
 5: -1.0934e+01 -1.1411e+01  5e-01  4e-17  6e-15
 6: -1.1077e+01 -1.1195e+01  1e-01  4e-17  7e-15
 7: -1.1118e+01 -1.1138e+01  2e-02  4e-17  7e-15
 8: -1.1126e+01 -1.1128e+01  2e-03  4e-17  8e-15
 9: -1.1127e+01 -1.1127e+01  6e-05  4e-17  8e-15


Predicting values:   0%|          | 2/500 [00:00<00:40, 12.20it/s]

10: -1.1127e+01 -1.1127e+01  1e-06  4e-17  8e-15
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [00:38<00:00, 13.03it/s]
Predicting values:   0%|          | 2/1000 [00:00<01:19, 12.62it/s]

0.656


Predicting values: 100%|██████████| 1000/1000 [01:14<00:00, 13.41it/s]


In [5]:
# Map the predicted labels to the {0, 1} encoding of the submission file.
# Thresholding at 0 yields the same integers as (y_pred + 1) / 2 followed by
# truncation for labels in {-1, 0, 1}, and it is idempotent: re-running this
# cell leaves y_pred in {0, 1} instead of rescaling it again.
y_pred = (y_pred > 0).astype(float)

# One row per test sequence: index column 'Id', integer column 'Bound'.
df = pd.DataFrame(y_pred, columns=['Bound'])
df.index.name = 'Id'
df.Bound = df.Bound.astype(int)
df.to_csv('submit_good_last_.csv')

1519.0
