# Raw DNA strings

In this notebook we train an SVM directly on raw string data (DNA sequences), using a spectrum kernel.

In [1]:
from kernel import spectrum_kernel

from KernelSVM import KernelSVM

import numpy as np

from processing import train_test_split, load_y

import pandas as pd

# Load and pre-process data

In [2]:
dataset_index = '0'

# Load the DNA strings, one sequence per line. The context manager
# guarantees the file is closed even if reading fails part-way through.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector and map the labels from {0, 1} to {-1, 1},
# then flatten it to a 1-D float array.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1
y = np.reshape(y.astype(float), -1)

print('First DNA string : ')
data[0]

First DNA string : 


'TCCTCAACTTTTATTGGGCCGCTGTGGCACCAGAATCTACGAATGGCGCCCTCTAGAGTTGTGTAAAGAAGTGGCGTCACCTCATTATAAATAAAAGGTTG'

In [3]:
# Hold-out split: the first `n_train` samples are used for training,
# everything after them for evaluation.
n_train = 1500
X_train, X_test = data[:n_train], data[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

# Train

In [4]:
# Keyword arguments forwarded to the kernel.
# NOTE(review): `k` is presumably the substring length of the spectrum
# kernel -- confirm against kernel.py.
kernel_parameters = dict(k=3)

In [5]:
# Kernel SVM configured to work on raw strings with the spectrum kernel.
svm = KernelSVM(
    lambda_reg=0.5,
    kernel=spectrum_kernel,
    kernel_parameters=kernel_parameters,
    data_type='string',
)

In [6]:
# Fit the SVM on the training split; the kernel-matrix progress bar and
# the solver's iteration log are printed as the cell output.
svm.fit(X_train, y_train)

Building kernel matrix: 100%|██████████| 1500/1500 [00:43<00:00, 34.16it/s]


     pcost       dcost       gap    pres   dres
 0: -2.3700e+03 -4.7555e+00  3e+04  2e+02  4e-12
 1: -4.2780e+01 -4.6287e+00  5e+02  3e+00  4e-12
 2: -2.2745e+00 -4.1627e+00  1e+01  7e-02  1e-13
 3: -1.3937e+00 -3.1420e+00  2e+00  2e-03  6e-15
 4: -1.5282e+00 -1.9041e+00  4e-01  3e-04  3e-15
 5: -1.6028e+00 -1.7611e+00  2e-01  9e-05  3e-15
 6: -1.6490e+00 -1.6818e+00  3e-02  1e-05  4e-15
 7: -1.6584e+00 -1.6679e+00  1e-02  4e-06  3e-15
 8: -1.6616e+00 -1.6633e+00  2e-03  1e-07  4e-15
 9: -1.6623e+00 -1.6626e+00  2e-04  2e-08  4e-15
10: -1.6624e+00 -1.6624e+00  2e-05  1e-09  4e-15
11: -1.6624e+00 -1.6624e+00  4e-07  3e-11  4e-15
Optimal solution found.


In [7]:
# Predict labels for the held-out strings (compared against y_test below).
y_pred = svm.pred(X_test)

Predicting values: 100%|██████████| 500/500 [01:06<00:00,  7.50it/s]


In [11]:
# Accuracy on the held-out split: fraction of predictions equal to the labels.
print('Score : ')
matches = (y_test == y_pred)
matches.sum() / len(y_test)

Score : 


0.65

In [9]:
# Find the best regularization parameter on the held-out split.
# Each candidate triggers a full refit followed by a prediction pass
# (see the per-value solver log and progress bar in the output), so this
# cell is slow.
lambda_scores = {}
for lambda_reg in np.linspace(0.001, 1, 20):
    print(lambda_reg)
    svm.lambda_reg = lambda_reg
    svm.fit(X_train, y_train)
    y_pred = svm.pred(X_test)
    # Normalise by the actual size of the test split instead of a
    # hard-coded 500, so the score stays correct if the split changes.
    score = sum(y_test == y_pred)/len(y_test)
    lambda_scores[lambda_reg] = score
    print('Score : {}'.format(score))

# Report the value that achieved the highest held-out accuracy.
best_lambda = max(lambda_scores, key=lambda_scores.get)
print('Best lambda_reg : {} (score {})'.format(best_lambda, lambda_scores[best_lambda]))

0.001
     pcost       dcost       gap    pres   dres
 0: -2.7640e+03 -2.1270e+03  3e+04  1e+01  5e-12
 1: -7.1825e+02 -1.9642e+03  3e+03  9e-01  7e-12
 2: -5.7891e+02 -1.2190e+03  8e+02  1e-01  2e-12
 3: -6.3156e+02 -7.4724e+02  1e+02  2e-02  1e-12
 4: -6.6612e+02 -6.9914e+02  4e+01  4e-03  1e-12
 5: -6.7582e+02 -6.8673e+02  1e+01  1e-03  1e-12
 6: -6.7852e+02 -6.8323e+02  5e+00  4e-04  1e-12
 7: -6.7982e+02 -6.8158e+02  2e+00  8e-05  1e-12
 8: -6.8031e+02 -6.8099e+02  7e-01  2e-05  1e-12
 9: -6.8057e+02 -6.8069e+02  1e-01  2e-06  2e-12
10: -6.8062e+02 -6.8063e+02  1e-02  3e-07  2e-12
11: -6.8063e+02 -6.8063e+02  1e-03  2e-08  2e-12

Predicting values:   0%|          | 0/500 [00:00<?, ?it/s]


12: -6.8063e+02 -6.8063e+02  1e-05  2e-10  2e-12
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [01:09<00:00,  7.22it/s]


Score : 0.626
0.05357894736842105
     pcost       dcost       gap    pres   dres
 0: -2.3766e+03 -3.8728e+01  3e+04  2e+02  6e-12
 1: -5.1525e+01 -3.8553e+01  5e+02  3e+00  4e-12
 2: -1.3985e+01 -3.5189e+01  5e+01  2e-01  3e-13
 3: -1.1428e+01 -2.1806e+01  1e+01  2e-02  3e-14
 4: -1.2629e+01 -1.4345e+01  2e+00  3e-03  2e-14
 5: -1.3207e+01 -1.3689e+01  5e-01  6e-04  3e-14
 6: -1.3360e+01 -1.3511e+01  2e-01  2e-04  3e-14
 7: -1.3408e+01 -1.3455e+01  5e-02  4e-05  3e-14
 8: -1.3423e+01 -1.3438e+01  2e-02  1e-05  3e-14
 9: -1.3428e+01 -1.3432e+01  3e-03  2e-06  3e-14
10: -1.3430e+01 -1.3430e+01  3e-04  6e-08  3e-14


Predicting values:   0%|          | 1/500 [00:00<01:15,  6.63it/s]

11: -1.3430e+01 -1.3430e+01  1e-05  2e-09  3e-14
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [01:00<00:00,  8.30it/s]


Score : 0.646
0.1061578947368421
     pcost       dcost       gap    pres   dres
 0: -2.3730e+03 -1.9876e+01  3e+04  2e+02  4e-12
 1: -4.6149e+01 -1.9741e+01  5e+02  3e+00  4e-12
 2: -7.4860e+00 -1.8547e+01  3e+01  1e-01  2e-13
 3: -5.8583e+00 -1.1565e+01  6e+00  7e-03  2e-14
 4: -6.5409e+00 -7.4992e+00  1e+00  1e-03  1e-14
 5: -6.8543e+00 -7.1172e+00  3e-01  2e-04  1e-14
 6: -6.9396e+00 -7.0179e+00  8e-02  5e-05  2e-14
 7: -6.9652e+00 -6.9881e+00  2e-02  1e-05  2e-14
 8: -6.9731e+00 -6.9790e+00  6e-03  3e-06  2e-14
 9: -6.9755e+00 -6.9763e+00  9e-04  3e-07  2e-14
10: -6.9759e+00 -6.9759e+00  4e-05  1e-08  2e-14


Predicting values:   0%|          | 1/500 [00:00<01:20,  6.19it/s]

11: -6.9759e+00 -6.9759e+00  7e-07  2e-10  2e-14
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [01:02<00:00,  8.02it/s]


Score : 0.65
0.15873684210526315
     pcost       dcost       gap    pres   dres
 0: -2.3717e+03 -1.3517e+01  3e+04  2e+02  6e-12
 1: -4.4732e+01 -1.3387e+01  5e+02  3e+00  4e-12
 2: -5.1870e+00 -1.2625e+01  2e+01  9e-02  1e-13
 3: -3.9704e+00 -7.8155e+00  4e+00  1e-03  1e-14
 4: -4.4884e+00 -5.1287e+00  6e-01  2e-04  9e-15
 5: -4.6765e+00 -4.8923e+00  2e-01  5e-05  9e-15
 6: -4.7491e+00 -4.8020e+00  5e-02  1e-05  1e-14
 7: -4.7650e+00 -4.7832e+00  2e-02  3e-06  1e-14
 8: -4.7711e+00 -4.7761e+00  5e-03  6e-07  1e-14
 9: -4.7730e+00 -4.7740e+00  1e-03  1e-07  1e-14
10: -4.7734e+00 -4.7735e+00  1e-04  9e-09  1e-14
11: -4.7734e+00 -4.7734e+00  9e-06  6e-10  1e-14


Predicting values:   0%|          | 1/500 [00:00<00:57,  8.71it/s]

12: -4.7734e+00 -4.7734e+00  2e-07  1e-11  1e-14
Optimal solution found.


Predicting values: 100%|██████████| 500/500 [01:00<00:00,  8.23it/s]


Score : 0.65
0.2113157894736842
     pcost       dcost       gap    pres   dres
 0: -2.3711e+03 -1.0322e+01  3e+04  2e+02  4e-12
 1: -4.4020e+01 -1.0195e+01  5e+02  3e+00  4e-12
 2: -4.0052e+00 -9.5819e+00  2e+01  7e-02  1e-13
 3: -3.0729e+00 -6.0891e+00  3e+00  3e-03  9e-15
 4: -3.4275e+00 -3.9477e+00  5e-01  4e-04  6e-15
 5: -3.5932e+00 -3.7341e+00  1e-01  9e-05  8e-15
 6: -3.6399e+00 -3.6779e+00  4e-02  2e-05  8e-15
 7: -3.6498e+00 -3.6661e+00  2e-02  5e-06  8e-15
 8: -3.6558e+00 -3.6590e+00  3e-03  7e-07  8e-15
 9: -3.6572e+00 -3.6575e+00  3e-04  4e-08  9e-15
10: -3.6573e+00 -3.6573e+00  8e-06  1e-09  9e-15


Predicting values:   0%|          | 1/500 [00:00<01:00,  8.22it/s]

11: -3.6573e+00 -3.6573e+00  1e-07  2e-11  9e-15
Optimal solution found.


Predicting values:  19%|█▉        | 97/500 [00:14<01:01,  6.58it/s]

KeyboardInterrupt: 

# Submission

In [8]:
# Train one SVM per dataset on its full training set and save each model.

for dataset_index in range(3):
    # Read the DNA strings, one per line; `with` closes the file even if
    # reading raises.
    with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
        data = [line.strip('\n') for line in f]

    # Labels: map {0, 1} to {-1, 1} and flatten to a 1-D float array.
    y = load_y("data/train/Ytr{}.csv".format(dataset_index))
    y = 2*y - 1
    y = np.reshape(y.astype(float), -1)

    svm = KernelSVM(lambda_reg = 0.5, kernel = spectrum_kernel,
                    kernel_parameters = kernel_parameters, data_type='string')

    svm.fit(data, y)

    # Persist the fitted model and its training strings under
    # per-dataset names so the prediction step can reload them.
    svm.save('SVM_{}'.format(dataset_index), 'Xtrain_{}'.format(dataset_index))

Building kernel matrix:  79%|███████▉  | 4732/6000 [08:12<02:12,  9.60it/s]

KeyboardInterrupt: 

In [None]:
# Predict the three test sets, each with the model trained on the
# matching dataset, and store the results back to back in `y_pred`.
n_datasets = 3
n_test_per_dataset = 1000
y_pred = np.zeros((n_datasets * n_test_per_dataset,))

for i in range(n_datasets):
    # Load the saved model for dataset i into a fresh instance, so the
    # predictions never come from a leftover `svm` trained on another
    # dataset.
    # NOTE(review): assumes KernelSVM.load restores, on the instance, the
    # state written by KernelSVM.save -- confirm against KernelSVM.
    svm = KernelSVM(lambda_reg = 0.5, kernel = spectrum_kernel,
                    kernel_parameters = kernel_parameters, data_type='string')
    svm.load('SVM_{}.npy'.format(i), 'Xtrain_{}.npy'.format(i))

    # Read the test strings, one per line; `with` closes the file even
    # if reading raises.
    with open('data/test/Xte{}.csv'.format(i), 'r') as f:
        X = [line.strip('\n') for line in f]

    start = n_test_per_dataset * i
    stop = n_test_per_dataset * (i + 1)
    y_pred[start:stop] = np.reshape(svm.pred(X), (n_test_per_dataset,))

In [None]:
# Map the predicted labels from {-1, 1} back to {0, 1}.
# Caution: re-running this cell applies the mapping a second time.
y_pred = 0.5 * (y_pred + 1)

In [None]:
# Build the submission table: an integer `Bound` column indexed by `Id`,
# written to test.csv.
df = (
    pd.DataFrame(y_pred, columns=['Bound'])
    .astype({'Bound': int})
    .rename_axis('Id')
)
df.to_csv('test.csv')