# Raw DNA strings

In this notebook we train an SVM directly on string data (raw DNA sequences).

In [1]:
from kernel import *

from KernelSVM import KernelSVM

import numpy as np

from processing import train_test_split, load_y

import pandas as pd

# Load and pre-process data

In [2]:
# Index of the dataset used for the experiments below.
# Kept as an int, like in the submission cells; the formatted paths are unchanged.
dataset_index = 2

# Load the DNA strings, one sequence per line.
# The context manager closes the file even if reading fails.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

print('First DNA string : ')
data[0]

First DNA string : 


'GAAAAAACGTTAGCAAACAAGGAACAAAGACAAAGCTGTCAACGGTCCATGGAATCTTGAAATTTAAATAATTGTTACACATTTTGTTTTGTTCTAACTGT'

In [3]:
# Hold-out split: the first 1500 samples are used for training,
# everything after them for evaluation.
X_train, X_test = data[:1500], data[1500:]
y_train, y_test = y[:1500], y[1500:]
# Smaller subset for quick experiments (disabled):
#X_train, y_train, X_test, y_test = X_train[:75], y_train[:75], X_test[:25], y_test[:25]

# Train

### Kitchen sink

In [4]:
from sklearn.svm import LinearSVC

In [25]:
#### Grid search for the kitchen sink features
# For every (k, gamma) pair a random projection (W, b) is drawn, the sequences
# are lifted with compute_conv_features, and a linear KernelSVM is fitted for a
# range of regularisation strengths. The best configuration (by accuracy on the
# held-out set) is kept, together with the projection that produced it.

alpha_size = 4  # presumably the alphabet size (A, C, G, T) -- TODO confirm

M = 6*2048  # number of rows of W, i.e. number of random features

# Dedicated, seeded generator so that the search can be reproduced.
rng = np.random.RandomState(0)

best_score = 0
# Defined up front so the final print cannot fail with a NameError
# when no configuration scores above 0.
best_k = best_gamma = best_lambda = best_W = best_b = None

kernel_parameters = {} # no parameters for linear kernel

# The matrix encoding depends on neither k nor gamma: compute it once.
# NOTE(review): assumes sequence_to_matrix is deterministic and that
# compute_conv_features does not modify its input -- confirm in kernel.py.
X_train_ = sequence_to_matrix(X_train)
X_test_ = sequence_to_matrix(X_test)

for k in [7, 8, 9]:
    for gamma in np.linspace(0.5, 1.2, 8):
        # Gaussian weights scaled by gamma, phases uniform in [0, 2*pi).
        W = gamma*rng.randn(M, 4*k)
        b = (2*np.pi)*rng.rand(M)

        X_lift_train = compute_conv_features(X_train_, W, b, alpha_size = alpha_size)
        print("Train set lifted")
        X_lift_test = compute_conv_features(X_test_, W, b, alpha_size = alpha_size)
        print("Test set lifted")

        # One classifier per projection; only lambda_reg changes in the inner loop.
        clf = KernelSVM(lambda_reg = 0, loss="squared_hinge", kernel = linear,
                    kernel_parameters = kernel_parameters, data_type='vector', threshold=0, verbose=False)

        for lambda_reg in np.linspace(0.0001, 0.01, 20):

            clf.lambda_reg = lambda_reg

            clf.fit(X_lift_train, y_train)

            y_pred_test = clf.predict(X_lift_test)

            # Accuracy on the held-out set, in percent.
            score =  sum(y_test == y_pred_test)/len(y_test)*100

            # k is logged as well, otherwise runs with different k are indistinguishable.
            print('k : {}, Gamma : {}, Lambda : {}, Score: {}'.format(k, gamma, lambda_reg, score))
            if score > best_score:
                best_score = score
                best_gamma = gamma
                best_lambda = lambda_reg
                best_k = k
                best_W = W
                best_b = b

print('Best k : {}, best gamma : {}, best lambda : {}, best score: {}'.format(best_k, best_gamma, best_lambda, best_score))

Train set lifted
Test set lifted
Gamma : 0.5, Lambda : 0.0001, Score: 63.0
Gamma : 0.5, Lambda : 0.0006210526315789474, Score: 63.6
Gamma : 0.5, Lambda : 0.0011421052631578948, Score: 66.0
Gamma : 0.5, Lambda : 0.0016631578947368423, Score: 67.2
Gamma : 0.5, Lambda : 0.0021842105263157894, Score: 67.0
Gamma : 0.5, Lambda : 0.0027052631578947366, Score: 67.0
Gamma : 0.5, Lambda : 0.0032263157894736843, Score: 67.0
Gamma : 0.5, Lambda : 0.0037473684210526316, Score: 67.0
Gamma : 0.5, Lambda : 0.004268421052631579, Score: 67.0
Gamma : 0.5, Lambda : 0.004789473684210527, Score: 67.2
Gamma : 0.5, Lambda : 0.005310526315789474, Score: 67.80000000000001
Gamma : 0.5, Lambda : 0.005831578947368421, Score: 68.2
Gamma : 0.5, Lambda : 0.006352631578947369, Score: 68.2
Gamma : 0.5, Lambda : 0.0068736842105263166, Score: 68.4
Gamma : 0.5, Lambda : 0.007394736842105264, Score: 68.60000000000001
Gamma : 0.5, Lambda : 0.00791578947368421, Score: 68.8
Gamma : 0.5, Lambda : 0.008436842105263158, Score: 6

KeyboardInterrupt: 

In [6]:
#np.save('W_0.npy', best_W)
#np.save('b_0.npy', best_b)


# k = 7, M =10
#Gamma : 1.0, Lambda : 0.06210526315789474, Score: 70.19999999999999
#Gamma : 1.0, Lambda : 0.11421052631578947, Score: 70.8
#Gamma : 1.0, Lambda : 0.16631578947368422, Score: 71.2
#Gamma : 1.0, Lambda : 0.21842105263157896, Score: 71.0

In [14]:
# Reload a projection (W, b) stored on disk instead of re-running the search.
# NOTE(review): presumably the files were written with np.save for dataset 2 -- confirm.
best_W, best_b = np.load("W_2.npy"), np.load("b_2.npy")

### k-spectrum kernel

In [6]:
# Grid search over the spectrum kernel parameter k and the regularisation
# strength lambda_reg; the best (lambda_reg, kernel_parameters) pair is kept.

kernel_parameters_list = [{'k':6}, {'k':7}, {'k':8}]
best_score, best_param = 0, None

for kernel_parameters in kernel_parameters_list:

    # One classifier per kernel setting; lambda_reg is overwritten below.
    svm = KernelSVM(
        lambda_reg = 1,
        loss='hinge',
        kernel = spectrum_kernel,
        kernel_parameters = kernel_parameters,
        data_type='string',
    )

    for lambda_reg in np.linspace(0.01,1, 10):
        svm.lambda_reg = lambda_reg

        # Fit on the training split, then predict the held-out split.
        svm.fit(X_train, y_train)
        y_pred = svm.pred(X_test)

        # Fraction of correctly classified held-out samples.
        score = sum(y_test == y_pred)/len(y_test)
        print(kernel_parameters, lambda_reg, score)

        if score <= best_score:
            continue
        best_score = score
        best_param = lambda_reg, kernel_parameters

print(best_score)
print(best_param)


Building kernel matrix:   0%|          | 0/1500 [00:00<?, ?it/s][A
Building kernel matrix:   5%|▍         | 74/1500 [00:00<00:02, 652.37it/s][A
Building kernel matrix:   7%|▋         | 98/1500 [00:00<00:02, 471.92it/s][A
Building kernel matrix:   8%|▊         | 127/1500 [00:00<00:03, 417.50it/s][A
Building kernel matrix:  10%|█         | 156/1500 [00:00<00:03, 385.52it/s][A
Building kernel matrix:  12%|█▏        | 182/1500 [00:00<00:03, 330.16it/s][A
Building kernel matrix:  14%|█▎        | 206/1500 [00:00<00:04, 271.63it/s][A
Building kernel matrix:  15%|█▌        | 226/1500 [00:00<00:05, 243.44it/s][A
Building kernel matrix:  16%|█▋        | 244/1500 [00:01<00:05, 230.73it/s][A
Building kernel matrix:  17%|█▋        | 261/1500 [00:01<00:05, 222.52it/s][A
Building kernel matrix:  18%|█▊        | 277/1500 [00:01<00:06, 199.98it/s][A
Building kernel matrix:  19%|█▉        | 291/1500 [00:01<00:06, 195.29it/s][A
Building kernel matrix:  20%|██        | 305/1500 [00:01<00:06, 

Numbers of support vectors : 1098


Predicting values: 100%|██████████| 500/500 [01:01<00:00,  8.17it/s]


{'k': 6} 0.01 0.728


Predicting values:   0%|          | 1/500 [00:00<01:14,  6.72it/s]

Numbers of support vectors : 1442


Predicting values: 100%|██████████| 500/500 [00:55<00:00,  9.01it/s]


{'k': 6} 0.12 0.776


Predicting values:   0%|          | 1/500 [00:00<00:58,  8.46it/s]

Numbers of support vectors : 1493


Predicting values: 100%|██████████| 500/500 [00:56<00:00,  8.83it/s]


{'k': 6} 0.23 0.746


KeyboardInterrupt: 

# Submission

### Mixed

In [5]:
# Buffer for the predictions of the three test sets (3 x 1000 samples),
# filled slice by slice in the following cells.
y_pred = np.zeros(3000)

In [6]:
# Dataset 0: spectrum kernel (k = 6) trained on the full training set.
dataset_index = 0

# Load the DNA strings, one sequence per line.
# The context manager closes the file even if reading fails.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

X_train, y_train = data, y

kernel_parameters = {'k':6}

clf =  KernelSVM(lambda_reg = 0.1, loss='hinge', kernel = spectrum_kernel,
                kernel_parameters = kernel_parameters, data_type='string')

clf.fit(X_train, y_train)

# Load the test sequences.
with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
    X_test = [line.strip('\n') for line in f_]

# Store the 1000 predictions in the slice of y_pred reserved for this dataset.
y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(clf.pred(X_test), (1000,))

Building kernel matrix: 100%|██████████| 2000/2000 [01:01<00:00, 32.68it/s]
Predicting values:   0%|          | 0/1000 [00:00<?, ?it/s]

Numbers of support vectors : 1872


Predicting values: 100%|██████████| 1000/1000 [02:41<00:00,  6.20it/s]


In [7]:
# Dataset 1: linear SVM on features lifted with the projection stored
# in W_1.npy / b_1.npy.
dataset_index = 1

alpha_size = 4  # presumably the alphabet size (A, C, G, T) -- TODO confirm

kernel_parameters = {}  # the linear kernel takes no parameters

# Load the DNA strings, one sequence per line.
# The context manager closes the file even if reading fails.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

X_train, y_train = data, y

# Projection saved on disk; the same (W, b) must be used for train and test.
W = np.load('W_1.npy')
b = np.load('b_1.npy')

X_train_ = sequence_to_matrix(X_train)

X_lift_train = compute_conv_features(X_train_, W, b, alpha_size = alpha_size)

clf = KernelSVM(lambda_reg = 0.054210526315789466, loss="squared_hinge", kernel = linear,
            kernel_parameters = kernel_parameters, data_type='vector', threshold=0, verbose=False)

clf.fit(X_lift_train, y_train)

# Load and lift the test sequences.
with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
    X_test = [line.strip('\n') for line in f_]

X_test_ = sequence_to_matrix(X_test)

X_lift_test = compute_conv_features(X_test_, W, b, alpha_size = alpha_size)

# Store the 1000 predictions in the slice of y_pred reserved for this dataset.
y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(clf.predict(X_lift_test), (1000,))

In [8]:
# Dataset 2: linear SVM on features lifted with the projection stored
# in W_2.npy / b_2.npy.
dataset_index = 2

alpha_size = 4  # presumably the alphabet size (A, C, G, T) -- TODO confirm

# Defined here explicitly: the cell previously relied on whatever value
# kernel_parameters had been left with by the last executed cell.
kernel_parameters = {}  # the linear kernel takes no parameters

# Load the DNA strings, one sequence per line.
# The context manager closes the file even if reading fails.
with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
    data = [line.strip('\n') for line in f]

# Load the target vector.
y = load_y("data/train/Ytr{}.csv".format(dataset_index))
y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

X_train, y_train = data, y

# Projection saved on disk; the same (W, b) must be used for train and test.
W = np.load('W_2.npy')
b = np.load('b_2.npy')

X_train_ = sequence_to_matrix(X_train)

X_lift_train = compute_conv_features(X_train_, W, b, alpha_size = alpha_size)

clf = KernelSVM(lambda_reg = 0.16631578947368422, loss="squared_hinge", kernel = linear,
            kernel_parameters = kernel_parameters, data_type='vector', threshold=0, verbose=False)

clf.fit(X_lift_train, y_train)

# Load and lift the test sequences.
with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
    X_test = [line.strip('\n') for line in f_]

X_test_ = sequence_to_matrix(X_test)

X_lift_test = compute_conv_features(X_test_, W, b, alpha_size = alpha_size)

# Store the 1000 predictions in the slice of y_pred reserved for this dataset.
y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(clf.predict(X_lift_test), (1000,))

In [9]:
# Write the "mixed" submission file.
# The labels are rescaled into a new array rather than overwriting y_pred, so
# re-running this cell does not apply the rescaling a second time.
bound = ((y_pred + 1) / 2).astype(int)  # to have y in 0 and 1
df = pd.DataFrame(bound, columns=['Bound'])
df.index.name = 'Id'
df.to_csv('mixed.csv')

### Kitchen sink

In [9]:
# Kitchen sink submission: for each of the three datasets, draw a random
# projection, lift train and test sequences, fit a linear SVM and predict.
y_pred = np.zeros((3000,))

alpha_size = 4  # presumably the alphabet size (A, C, G, T) -- TODO confirm
# NOTE(review): the grid search cell uses M = 6*2048; confirm 4*2048 is intended here.
M = 4*2048  # number of rows of W, i.e. number of random features

kernel_parameters = {}  # the linear kernel takes no parameters

# Per-dataset hyper-parameters (index = dataset_index).
lambda_opt = [0.0441400304414003, 0.052631578947368425, 0.06210526315789474]
gamma_opt = [0.9842105263157894, 0.9666666666666667, 0.831578947368421]
k_opt = [8, 8, 7]

# Dedicated, seeded generator: without it every run draws a different
# projection and the submission file cannot be reproduced.
rng = np.random.RandomState(0)

for dataset_index in range(3):
    print(dataset_index)

    # Load the DNA strings, one sequence per line.
    # The context manager closes the file even if reading fails.
    with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
        data = [line.strip('\n') for line in f]

    # Load the target vector.
    y = load_y("data/train/Ytr{}.csv".format(dataset_index))
    y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
    y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

    X_train, y_train = data, y

    # Gaussian weights scaled by gamma, phases uniform in [0, 2*pi).
    W = gamma_opt[dataset_index]*rng.randn(M, 4*k_opt[dataset_index])
    b = (2*np.pi)*rng.rand(M)

    X_train_ = sequence_to_matrix(X_train)

    X_lift_train = compute_conv_features(X_train_, W, b, alpha_size = alpha_size)

    clf = KernelSVM(lambda_reg = lambda_opt[dataset_index], loss="squared_hinge", kernel = linear,
                kernel_parameters = kernel_parameters, data_type='vector', threshold=0, verbose=False)

    clf.fit(X_lift_train, y_train)

    # Load and lift the test sequences with the same (W, b).
    with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
        X_test = [line.strip('\n') for line in f_]

    X_test_ = sequence_to_matrix(X_test)

    X_lift_test = compute_conv_features(X_test_, W, b, alpha_size = alpha_size)

    # Store the 1000 predictions in the slice of y_pred reserved for this dataset.
    y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(clf.predict(X_lift_test), (1000,))

0
1
2


In [10]:
# Write the kitchen sink submission file.
# The labels are rescaled into a new array rather than overwriting y_pred, so
# re-running this cell does not apply the rescaling a second time.
bound = ((y_pred + 1) / 2).astype(int)  # to have y in 0 and 1
df = pd.DataFrame(bound, columns=['Bound'])
df.index.name = 'Id'
df.to_csv('submit_full.csv')

### k-spectrum

In [19]:
# k-spectrum submission: for each of the three datasets, fit a spectrum-kernel
# SVM on the full training set and predict the corresponding test set.
y_pred = np.zeros((3000,))

# Per-dataset hyper-parameters (index = dataset_index).
# These were previously commented out, so the cell either failed with a
# NameError or silently reused the lambda_opt left behind by the kitchen sink cell.
# NOTE(review): values restored from the commented-out lines -- confirm they
# are the ones used for the recorded run.
lambda_opt=[0.12,0.035,0.066]
kernel_parameters_opt=[{'k':6}, {'k':6}, {'k':4}]


for dataset_index in range(3):
    # Load the DNA strings, one sequence per line.
    # The context manager closes the file even if reading fails.
    with open('data/train/Xtr{}.csv'.format(dataset_index), 'r') as f:
        data = [line.strip('\n') for line in f]

    # Load the target vector.
    y = load_y("data/train/Ytr{}.csv".format(dataset_index))
    y = 2*y - 1  # transform y to lie in {-1, 1} instead of {0, 1}
    y = np.reshape(y.astype(float), -1)  # flatten to a 1-D float array

    # Train on the whole training set (no held-out split for the submission).
    X_train, y_train = data, y

    lambda_reg=lambda_opt[dataset_index]
    kernel_parameters=kernel_parameters_opt[dataset_index]

    # No loss is passed here, so KernelSVM's default loss is used.
    SVM = KernelSVM(lambda_reg = lambda_reg, kernel = spectrum_kernel,
                    kernel_parameters = kernel_parameters, data_type='string')

    SVM.fit(X_train, y_train)

    # Load the test sequences.
    with open('data/test/Xte{}.csv'.format(dataset_index), 'r') as f_:
        X = [line.strip('\n') for line in f_]

    # Store the 1000 predictions in the slice of y_pred reserved for this dataset.
    y_pred[1000*dataset_index:1000*(dataset_index+1)] = np.reshape(SVM.pred(X), (1000,))

Building kernel matrix: 100%|██████████| 2000/2000 [00:31<00:00, 62.81it/s] 


     pcost       dcost       gap    pres   dres
 0: -5.0459e+01 -4.5490e+01  5e+03  7e+01  1e-14
 1: -4.5703e+01 -3.4125e+01  6e+02  8e+00  1e-14
 2: -2.2657e+01 -1.6444e+01  2e+02  2e+00  6e-15
 3: -9.5445e+00 -1.1559e+01  3e+01  3e-01  2e-15
 4: -5.7943e+00 -9.2809e+00  4e+00  4e-03  9e-16
 5: -6.1193e+00 -6.7273e+00  6e-01  6e-04  6e-16
 6: -6.2542e+00 -6.3671e+00  1e-01  5e-05  6e-16
 7: -6.2928e+00 -6.3026e+00  1e-02  3e-06  6e-16
 8: -6.2967e+00 -6.2971e+00  3e-04  8e-08  7e-16
 9: -6.2969e+00 -6.2969e+00  1e-05  2e-09  6e-16
10: -6.2969e+00 -6.2969e+00  3e-07  2e-11  7e-16
Optimal solution found.
[   0    1    2 ... 1996 1997 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:48,  9.18it/s]

Numbers of support vectors : 1915


Predicting values: 100%|██████████| 1000/1000 [01:51<00:00,  9.01it/s]
Building kernel matrix: 100%|██████████| 2000/2000 [00:30<00:00, 65.91it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.8829e+01 -4.0671e+01  5e+03  7e+01  6e-15
 1: -2.7091e+01 -3.5225e+01  5e+02  6e+00  6e-15
 2: -1.5188e+01 -2.6530e+01  1e+02  1e+00  4e-15
 3: -9.3484e+00 -2.4162e+01  4e+01  3e-01  2e-15
 4: -8.1452e+00 -1.5392e+01  9e+00  2e-02  1e-15
 5: -8.7394e+00 -9.9694e+00  1e+00  4e-03  1e-15
 6: -9.0512e+00 -9.2459e+00  2e-01  4e-04  1e-15
 7: -9.1109e+00 -9.1304e+00  2e-02  3e-05  1e-15
 8: -9.1175e+00 -9.1191e+00  2e-03  1e-06  1e-15
 9: -9.1181e+00 -9.1181e+00  6e-05  3e-08  1e-15
10: -9.1181e+00 -9.1181e+00  2e-06  4e-10  1e-15
Optimal solution found.
[   0    2    5 ... 1997 1998 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:44,  9.59it/s]

Numbers of support vectors : 1207


Predicting values: 100%|██████████| 1000/1000 [01:48<00:00,  9.22it/s]
Building kernel matrix: 100%|██████████| 2000/2000 [00:40<00:00, 49.08it/s] 


     pcost       dcost       gap    pres   dres
 0: -2.9430e+03 -3.5752e+01  4e+04  2e+02  2e-12
 1: -1.0559e+02 -3.3006e+01  1e+03  6e+00  2e-12
 2: -1.2367e+01 -2.9323e+01  6e+01  2e-01  1e-13
 3: -9.4820e+00 -1.7739e+01  8e+00  4e-17  8e-15
 4: -1.0591e+01 -1.2302e+01  2e+00  3e-17  7e-15
 5: -1.1076e+01 -1.1465e+01  4e-01  3e-17  7e-15
 6: -1.1192e+01 -1.1290e+01  1e-01  3e-17  7e-15
 7: -1.1225e+01 -1.1245e+01  2e-02  3e-17  8e-15
 8: -1.1232e+01 -1.1235e+01  3e-03  3e-17  8e-15
 9: -1.1234e+01 -1.1234e+01  1e-04  3e-17  8e-15
10: -1.1234e+01 -1.1234e+01  2e-06  3e-17  8e-15
Optimal solution found.
[   0    1    2 ... 1996 1998 1999]


Predicting values:   0%|          | 1/1000 [00:00<01:50,  9.08it/s]

Numbers of support vectors : 1636


Predicting values: 100%|██████████| 1000/1000 [01:51<00:00,  8.97it/s]
