In [None]:
%reset

In [4]:
import os
os.chdir('../')

from data_handler import *
from kernel_methods import *
from metrics import *

import numpy as np
import pandas as pd
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

# Alphabet: the four letters from which the vocabulary words are built (passed to create_vocab)
alphabet = ['A', 'C', 'G', 'T']

In [2]:
## Loading training data (one table per dataset index 0, 1, 2)
## NOTE(review): load_data comes from a project star-import; its return type is not visible here.
## Later cells read the 'Sequence' and 'Bound' columns of the training tables.
tr0 = load_data(0, 'tr')
tr1 = load_data(1, 'tr')
tr2 = load_data(2, 'tr')

## Loading test data (same dataset indices; later cells only read the 'Sequence' column)
te0 = load_data(0, 'te')
te1 = load_data(1, 'te')
te2 = load_data(2, 'te')

# Create the mismatch kernel functions

In [3]:
def create_vocab(alphabet, substring_length):
    '''
    Enumerate every word of length substring_length over the alphabet and give
    each one an integer index. The vocabulary holds |alphabet|^substring_length
    words, numbered in the order itertools.product generates them.

    Input:
        alphabet: letters available in the alphabet
        substring_length: length of the words

    Output:
        vocab2index: dictionary mapping each word of the vocabulary to its index (integer)
        index2vocab: dictionary mapping each index back to its word
    '''
    words = map(''.join, itertools.product(alphabet, repeat = substring_length))

    # Number the words first, then invert the mapping to get the word -> index lookup
    index2vocab = dict(enumerate(words))
    vocab2index = {word: position for position, word in index2vocab.items()}

    return vocab2index, index2vocab


def is_neighbour(alpha, beta, mismatch):
    '''
    Tell whether word beta lies in the mismatch neighbourhood of word alpha, as
    defined by Leslie et al.
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.86.7384&rep=rep1&type=pdf

    Positions are compared pairwise; if the words differ in length, only the
    leading positions they share are compared.

    Input:
        alpha: first word
        beta: second word
        mismatch: maximum number of differing positions tolerated
    Output:
        Boolean: True if beta is in the mismatch-neighbourhood of alpha
    '''
    differences = 0
    for letter_a, letter_b in zip(alpha, beta):
        if letter_a != letter_b:
            differences += 1

    return bool(differences <= mismatch)
    
def compute_neighbours(vocab2index, mismatch):
    '''
    Precompute, once and for all, the mismatch neighbourhood of every word in
    the vocabulary. Every pair of words is tested with is_neighbour.

    Input:
        vocab2index: vocabulary (only its keys, the words, are used)
        mismatch: tolerance of mismatch
    Output:
        Dictionary mapping each word to the list of words in its neighbourhood.
    '''
    words = vocab2index.keys()

    return {
        centre: [candidate for candidate in words if is_neighbour(centre, candidate, mismatch)]
        for centre in words
    }


def create_mismatch_feature(sequence, substring_length, vocab2index, neighbours, normalize = False):
    '''
    Mismatch kernel feature as described by Leslie et al.
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.86.7384&rep=rep1&type=pdf

    Every window of substring_length letters in the sequence adds one count to
    each word of its neighbourhood.

    Input:
        sequence: DNA sequence to process
        substring_length: length of vocabulary words
        vocab2index: mapping of vocabulary words to their index
        neighbours: neighbourhood (list of words) of each word in the vocabulary
        normalize: if True, scale the embedding to unit Euclidean norm
    Output:
        Numpy array: sequence embedding, one entry per vocabulary word. Integer
        counts when normalize is False, floats when normalize is True.
    Raises:
        KeyError: if a window of the sequence is not a key of neighbours.
    '''
    embedding = np.zeros(len(vocab2index), dtype = 'int')

    for start in range(len(sequence) - substring_length + 1):
        substring = sequence[start:start + substring_length]
        for neighbour in neighbours[substring]:
            embedding[vocab2index[neighbour]] += 1

    if normalize:
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding = embedding / norm
        else:
            # No window was counted (e.g. the sequence is shorter than substring_length):
            # return a float zero vector instead of dividing by zero and producing NaN.
            embedding = embedding.astype(float)

    return embedding


def mismatch_kernel(sequenceA, sequenceB, substring_length, vocab2index, neighbours, normalize):
    '''
    Mismatch kernel. Optional normalization as described in Leslie et al.

    Input:
        sequenceA: first sequence
        sequenceB: second sequence
        substring_length: length of vocabulary words
        vocab2index: mapping of vocabulary words to their index
        neighbours: neighbourhood of each word in the vocabulary
        normalize: whether each embedding is normalized before the product
    Output:
        Dot product of the mismatch embeddings of the two sequences.
    '''
    # The embeddings were previously stored under misspelled names ("embeding..."),
    # so the dot product below raised NameError on every call.
    embeddingA = create_mismatch_feature(sequenceA, substring_length, vocab2index, neighbours, normalize)
    embeddingB = create_mismatch_feature(sequenceB, substring_length, vocab2index, neighbours, normalize)

    return np.dot(embeddingA, embeddingB)

# Test these functions

In [None]:
# Alphabet (same four letters as in the import cell, redefined here)
alphabet = ['A', 'C', 'G', 'T']

# Small configuration for a quick check: 3-letter words, one mismatch tolerated
substring_length = 3
mismatch_tol = 1

# Build the vocabulary (the index -> word mapping is discarded) and precompute every neighbourhood
vocab2index, _ = create_vocab(alphabet, substring_length)
neighbours = compute_neighbours(vocab2index, mismatch_tol)


# Example: embed one training sequence (left commented out)
# create_mismatch_feature(tr0['Sequence'][10], substring_length, vocab2index, neighbours)

# Grid Search With Mismatch Kernel Features + Logistic Regression

In [12]:
def gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize):
    '''
    Grid search over mismatch-kernel features and logistic-regression settings.

    For each (substring length, mismatch tolerance) pair the embedding of every
    sequence is computed once; each (penalty, C) combination is then scored with
    5-fold cross-validation. A line is printed whenever the best mean score improves.
    The vocabulary is built from the notebook-level `alphabet`.

    Input:
        data: table with a 'Sequence' column and a 'Bound' label column
        sub_lengths: substring lengths to try
        tols: mismatch tolerances to try
        regularizations: values tried for the C parameter of LogisticRegression
        penalties: penalty names to try (e.g. 'l1', 'l2')
        normalize: whether the embeddings are normalized
    Output:
        (max_score, max_model): best mean validation score and the matching
        LogisticRegression instance. The instance is NOT fitted, because
        cross_val_score fits clones of it. max_model is None when no
        configuration scores above 0.
    '''
    max_score = 0
    max_model = None

    # Series.as_matrix() was removed in pandas 1.0; .values is available on every version
    Y = data['Bound'].values

    for substring_length in sub_lengths:
        vocab2index, _ = create_vocab(alphabet, substring_length)
        for mismatch_tol in tols:
            print('--Normalize: {0} - Substring Length: {1} - Mismatch Tolerance: {2}'.format(normalize, substring_length, mismatch_tol))
            neighbours = compute_neighbours(vocab2index, mismatch_tol)

            # The features depend only on (substring_length, mismatch_tol): build them once per pair
            X = np.zeros((len(data), len(vocab2index)))
            for idx, seq in enumerate(data['Sequence']):
                X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours, normalize)

            for penal in penalties:
                for regu in regularizations:
                    # liblinear supports both 'l1' and 'l2'; the current default solver (lbfgs) rejects 'l1'
                    clf = LogisticRegression(C = regu, penalty = penal, solver = 'liblinear')
                    # SIMON: the project's own kfold helper fails on this input (possibly because
                    # of its sign_label handling), so scikit-learn's cross_val_score is used instead.
                    score = np.mean(cross_val_score(clf, X, Y, cv = 5))

                    if score > max_score:
                        max_score = score
                        max_model = clf
                        print('----Increase in Score. Penal: {0} - Regu: {1}. Mean Val Score: {2:.2f}'.format(penal, regu, 100*score))

    return max_score, max_model

**Set 0**

In [5]:
# Logistic-regression grid search on the training table of dataset 0
data = tr0

# Search grid: substring lengths, mismatch tolerances, C values and penalties
sub_lengths = [3,4,5,6]
tols = [1, 2]
regularizations = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
penalties = ['l1', 'l2']

# Two independent searches (each call starts from a best score of 0):
# normalized embeddings first, then raw counts
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= True)
print('\n')
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= False)

print('\nFinished !')

--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.001. Mean Val Score: 50.00
----Increase in Score. Penal: l1 - Regu: 0.5. Mean Val Score: 61.60
----Increase in Score. Penal: l1 - Regu: 1. Mean Val Score: 64.35
----Increase in Score. Penal: l1 - Regu: 5. Mean Val Score: 68.10
----Increase in Score. Penal: l2 - Regu: 10. Mean Val Score: 68.25
--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 4 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 5. Mean Val Score: 71.35
----Increase in Score. Penal: l1 - Regu: 10. Mean Val Score: 71.80
--Normalize: True - Substring Length: 4 - Mismatch Tolerance: 2
----Increase in Score. Penal: l1 - Regu: 10. Mean Val Score: 72.30
--Normalize: True - Substring Length: 5 - Mismatch Tolerance: 1
----Increase in Score. Penal: l2 - Regu: 1. Mean Val Score: 72.75
----Increase in Score. Penal: l2 - Regu: 5. Mean Val Score: 72.95
--Normalize

**Set 1**

In [6]:
# Logistic-regression grid search on the training table of dataset 1
data = tr1

# Search grid; the C values here go down to 1e-4 and stop at 1
sub_lengths = [3,4,5,6]
tols = [1, 2]
regularizations = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
penalties = ['l1', 'l2']

# Two independent searches (each call starts from a best score of 0):
# normalized embeddings first, then raw counts
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= True)
print('\n')
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= False)

print('\nFinished !')

--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.0001. Mean Val Score: 50.00
----Increase in Score. Penal: l1 - Regu: 0.1. Mean Val Score: 62.65
----Increase in Score. Penal: l1 - Regu: 0.5. Mean Val Score: 67.00
----Increase in Score. Penal: l1 - Regu: 1. Mean Val Score: 71.85
--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 4 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.5. Mean Val Score: 74.55
----Increase in Score. Penal: l1 - Regu: 1. Mean Val Score: 80.10
--Normalize: True - Substring Length: 4 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 5 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 1. Mean Val Score: 81.65
----Increase in Score. Penal: l2 - Regu: 1. Mean Val Score: 82.50
--Normalize: True - Substring Length: 5 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 6 - Mismatch Tolerance: 1
----Increase 

**Set 2**

In [7]:
# Logistic-regression grid search on the training table of dataset 2
data = tr2

# Search grid; the C values here go down to 1e-4 and stop at 1
sub_lengths = [3,4,5,6]
tols = [1, 2]
regularizations = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
penalties = ['l1', 'l2']

# Two independent searches (each call starts from a best score of 0):
# normalized embeddings first, then raw counts
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= True)
print('\n')
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= False)

print('\nFinished !')

--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.0001. Mean Val Score: 50.00
----Increase in Score. Penal: l1 - Regu: 0.5. Mean Val Score: 53.55
----Increase in Score. Penal: l1 - Regu: 1. Mean Val Score: 57.05
----Increase in Score. Penal: l2 - Regu: 1. Mean Val Score: 58.65
--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 4 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 1. Mean Val Score: 60.00
----Increase in Score. Penal: l2 - Regu: 1. Mean Val Score: 61.10
--Normalize: True - Substring Length: 4 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 5 - Mismatch Tolerance: 1
----Increase in Score. Penal: l2 - Regu: 0.5. Mean Val Score: 62.80
----Increase in Score. Penal: l2 - Regu: 1. Mean Val Score: 63.85
--Normalize: True - Substring Length: 5 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 6 - Mismatch Tolerance: 1
----Increase in

# Predictions with Mismatch Kernel Features + Tuned Logistic Regression

In [12]:
# Training and test tables of dataset 0
tr0 = load_data(0, 'tr')
te0 = load_data(0, 'te')

# Mismatch-kernel settings used for this dataset
substring_length = 6
mismatch_tol = 2

vocab2index, _ = create_vocab(alphabet, substring_length)
neighbours = compute_neighbours(vocab2index, mismatch_tol)

#Transform raw data: one embedding row per training sequence
X = np.zeros((len(tr0), len(vocab2index)))
for idx, seq in enumerate(tr0['Sequence']):
    X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours, normalize = False)
# Series.as_matrix() was removed in pandas 1.0; .values is available on every version
Y = tr0['Bound'].values

# Same transformation for the test sequences
X_test = np.zeros((len(te0), len(vocab2index)))
for idx, seq in enumerate(te0['Sequence']):
    X_test[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours, normalize = False)

#Train classifier
clf = LogisticRegression(C = 0.001)
clf.fit(X, Y)

#Predict
mismatch_te0_raw = clf.predict(X_test).astype(int)

In [13]:
# Training and test tables of dataset 1
tr1 = load_data(1, 'tr')
te1 = load_data(1, 'te')

# Mismatch-kernel settings used for this dataset
substring_length = 6
mismatch_tol = 1

vocab2index, _ = create_vocab(alphabet, substring_length)
neighbours = compute_neighbours(vocab2index, mismatch_tol)

#Transform raw data: one embedding row per training sequence
X = np.zeros((len(tr1), len(vocab2index)))
for idx, seq in enumerate(tr1['Sequence']):
    X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours, normalize = False)
# Series.as_matrix() was removed in pandas 1.0; .values is available on every version
Y = tr1['Bound'].values

# Same transformation for the test sequences
X_test = np.zeros((len(te1), len(vocab2index)))
for idx, seq in enumerate(te1['Sequence']):
    X_test[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours, normalize = False)

#Train classifier
clf = LogisticRegression(C = 0.001)
clf.fit(X, Y)

#Predict
mismatch_te1_raw = clf.predict(X_test).astype(int)

In [14]:
# Training and test tables of dataset 2
tr2 = load_data(2, 'tr')
te2 = load_data(2, 'te')

# Mismatch-kernel settings used for this dataset
substring_length = 6
mismatch_tol = 1

vocab2index, _ = create_vocab(alphabet, substring_length)
neighbours = compute_neighbours(vocab2index, mismatch_tol)

#Transform raw data: one normalized embedding row per training sequence
X = np.zeros((len(tr2), len(vocab2index)))
for idx, seq in enumerate(tr2['Sequence']):
    X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours, normalize = True)
# Series.as_matrix() was removed in pandas 1.0; .values is available on every version
Y = tr2['Bound'].values

# Same transformation for the test sequences
X_test = np.zeros((len(te2), len(vocab2index)))
for idx, seq in enumerate(te2['Sequence']):
    X_test[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours, normalize = True)

#Train classifier
clf = LogisticRegression(C = 1)
clf.fit(X, Y)

#Predict
mismatch_te2_raw = clf.predict(X_test).astype(int)

In [15]:
# Wrap each prediction vector in a one-column ('Bound') DataFrame.
# NOTE(review): each *_raw name is rebound from the prediction array to a DataFrame, so
# re-running this cell passes DataFrames to format_preds — re-run the prediction cells first.
# NOTE(review): format_preds is a project helper not visible here; presumably it converts the
# labels to the submission format — confirm.
mismatch_te0_raw = pd.DataFrame(
    data = format_preds(mismatch_te0_raw),
    columns = ['Bound'])

mismatch_te1_raw = pd.DataFrame(
    data = format_preds(mismatch_te1_raw),
    columns = ['Bound'])
# Shift the row labels by 1000 — assumes the first test set has exactly 1000 rows, TODO confirm
mismatch_te1_raw.index = mismatch_te1_raw.index + 1000

mismatch_te2_raw = pd.DataFrame(
    data = format_preds(mismatch_te2_raw),
    columns = ['Bound'])
# Shift the row labels by 2000 — same assumption for the first two test sets, TODO confirm
mismatch_te2_raw.index = mismatch_te2_raw.index + 2000

# Stack the three frames and name the index 'Id'
frames = [mismatch_te0_raw, mismatch_te1_raw, mismatch_te2_raw]
mismatch_te = pd.concat(frames)
mismatch_te.index = mismatch_te.index.set_names(['Id'])

# Path is relative to the working directory set by os.chdir('../') in the import cell
mismatch_te.to_csv('predictions/mismatch_log_regtruc.csv')

# Grid Search with MLP

In [7]:
def gd_mlp(data, sub_lengths, tols, architectures, alphas, normalize, random_state = None):
    '''
    Grid search over mismatch-kernel features and MLP settings.

    For each (substring length, mismatch tolerance) pair the embedding of every
    sequence is computed once; each (architecture, alpha) combination is then
    scored with 5-fold cross-validation. A line is printed whenever the best mean
    score improves. The vocabulary is built from the notebook-level `alphabet`.

    Input:
        data: table with a 'Sequence' column and a 'Bound' label column
        sub_lengths: substring lengths to try
        tols: mismatch tolerances to try
        architectures: values tried for hidden_layer_sizes of MLPClassifier
        alphas: values tried for the alpha parameter of MLPClassifier
        normalize: whether the embeddings are normalized
        random_state: forwarded to MLPClassifier; set it to make the scores
            reproducible (default None keeps the unseeded behaviour)
    Output:
        (max_score, max_model): best mean validation score and the matching
        MLPClassifier instance. The instance is NOT fitted, because
        cross_val_score fits clones of it. max_model is None when no
        configuration scores above 0.
    '''
    max_score = 0
    max_model = None

    # Series.as_matrix() was removed in pandas 1.0; .values is available on every version
    Y = data['Bound'].values

    for substring_length in sub_lengths:
        vocab2index, _ = create_vocab(alphabet, substring_length)
        for mismatch_tol in tols:
            print('--Normalize: {0} - Substring Length: {1} - Mismatch Tolerance: {2}'.format(normalize, substring_length, mismatch_tol))
            neighbours = compute_neighbours(vocab2index, mismatch_tol)

            # The features depend only on (substring_length, mismatch_tol): build them once per pair
            X = np.zeros((len(data), len(vocab2index)))
            for idx, seq in enumerate(data['Sequence']):
                X[idx, :] = create_mismatch_feature(seq, substring_length, vocab2index, neighbours, normalize)

            for archi in architectures:
                for alpha in alphas:
                    clf = MLPClassifier(hidden_layer_sizes = archi, alpha = alpha,  max_iter = 1000,
                                        random_state = random_state)
                    score = np.mean(cross_val_score(clf, X, Y, cv = 5))

                    if score > max_score:
                        max_score = score
                        max_model = clf
                        print('----Increase in Score. Architecture: {0} - Alpha: {1}. Mean Val Score: {2:.2f}'.format(archi, alpha, 100*score))

    return max_score, max_model

In [8]:
# MLP grid search on the training table of dataset 0
data = tr0

# Search grid: substring lengths, mismatch tolerances, hidden-layer sizes and alpha values
sub_lengths = [3,4,5,6]
tols = [1, 2]
# Note: (10) is the plain integer 10, not a one-element tuple — the parentheses have no effect
architectures = [(10), (20), (25), (50), (75), (100)]
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

# Two independent searches (each call starts from a best score of 0):
# normalized embeddings first, then raw counts
gd_mlp(data, sub_lengths, tols, architectures, alphas, normalize= True)
print('\n')
gd_mlp(data, sub_lengths, tols, architectures, alphas, normalize= False)

print('\nFinished !')

--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 1
----Increase in Score. Architecture: 10 - Alpha: 0.001. Mean Val Score: 55.10
----Increase in Score. Architecture: 10 - Alpha: 0.005. Mean Val Score: 65.50
----Increase in Score. Architecture: 10 - Alpha: 0.05. Mean Val Score: 65.80
----Increase in Score. Architecture: 20 - Alpha: 0.001. Mean Val Score: 66.50
----Increase in Score. Architecture: 20 - Alpha: 0.05. Mean Val Score: 67.10
----Increase in Score. Architecture: 50 - Alpha: 0.001. Mean Val Score: 67.40
--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 4 - Mismatch Tolerance: 1
----Increase in Score. Architecture: 10 - Alpha: 0.001. Mean Val Score: 69.95
----Increase in Score. Architecture: 10 - Alpha: 0.01. Mean Val Score: 70.35
----Increase in Score. Architecture: 10 - Alpha: 0.05. Mean Val Score: 70.80
----Increase in Score. Architecture: 25 - Alpha: 0.001. Mean Val Score: 71.20
----Increase in Score. Architect

In [9]:
# MLP grid search on the training table of dataset 1
data = tr1

# Search grid: substring lengths, mismatch tolerances, hidden-layer sizes and alpha values
sub_lengths = [3,4,5,6]
tols = [1, 2]
# Note: (10) is the plain integer 10, not a one-element tuple — the parentheses have no effect
architectures = [(10), (20), (25), (50), (75), (100)]
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

# Two independent searches (each call starts from a best score of 0):
# normalized embeddings first, then raw counts
gd_mlp(data, sub_lengths, tols, architectures, alphas, normalize= True)
print('\n')
gd_mlp(data, sub_lengths, tols, architectures, alphas, normalize= False)

print('\nFinished !')

--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 1
----Increase in Score. Architecture: 10 - Alpha: 0.001. Mean Val Score: 73.30
----Increase in Score. Architecture: 10 - Alpha: 0.01. Mean Val Score: 73.30
----Increase in Score. Architecture: 20 - Alpha: 0.005. Mean Val Score: 73.50
----Increase in Score. Architecture: 25 - Alpha: 0.001. Mean Val Score: 74.05
----Increase in Score. Architecture: 50 - Alpha: 0.01. Mean Val Score: 74.15
----Increase in Score. Architecture: 75 - Alpha: 0.005. Mean Val Score: 74.30
----Increase in Score. Architecture: 75 - Alpha: 0.01. Mean Val Score: 74.55
--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 4 - Mismatch Tolerance: 1
----Increase in Score. Architecture: 10 - Alpha: 0.001. Mean Val Score: 82.70
----Increase in Score. Architecture: 10 - Alpha: 0.005. Mean Val Score: 82.85
----Increase in Score. Architecture: 20 - Alpha: 0.005. Mean Val Score: 83.10
--Normalize: True - Substring L

In [10]:
# MLP grid search on the training table of dataset 2
data = tr2

# Search grid: substring lengths, mismatch tolerances, hidden-layer sizes and alpha values
sub_lengths = [3,4,5,6]
tols = [1, 2]
# Note: (10) is the plain integer 10, not a one-element tuple — the parentheses have no effect
architectures = [(10), (20), (25), (50), (75), (100)]
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

# Two independent searches (each call starts from a best score of 0):
# normalized embeddings first, then raw counts
gd_mlp(data, sub_lengths, tols, architectures, alphas, normalize= True)
print('\n')
gd_mlp(data, sub_lengths, tols, architectures, alphas, normalize= False)

print('\nFinished !')

--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 1
----Increase in Score. Architecture: 10 - Alpha: 0.001. Mean Val Score: 53.85
----Increase in Score. Architecture: 10 - Alpha: 0.005. Mean Val Score: 54.20
----Increase in Score. Architecture: 10 - Alpha: 0.1. Mean Val Score: 54.85
----Increase in Score. Architecture: 20 - Alpha: 0.005. Mean Val Score: 56.65
----Increase in Score. Architecture: 25 - Alpha: 0.005. Mean Val Score: 56.75
----Increase in Score. Architecture: 50 - Alpha: 0.001. Mean Val Score: 57.65
----Increase in Score. Architecture: 50 - Alpha: 0.01. Mean Val Score: 58.60
----Increase in Score. Architecture: 75 - Alpha: 0.05. Mean Val Score: 58.90
----Increase in Score. Architecture: 100 - Alpha: 0.005. Mean Val Score: 59.10
--Normalize: True - Substring Length: 3 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 4 - Mismatch Tolerance: 1
----Increase in Score. Architecture: 10 - Alpha: 0.001. Mean Val Score: 62.15
----Increase in Score. Architec

# Grid Search with Longer Substrings (Length 7) for the Mismatch Kernel

In [14]:
# Logistic-regression grid search on dataset 0, restricted to substrings of length 7
data = tr0

# Search grid: a single substring length, with mismatch tolerances up to 3
sub_lengths = [7]
tols = [1, 2, 3]
regularizations = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
penalties = ['l1', 'l2']

# Two independent searches (each call starts from a best score of 0):
# normalized embeddings first, then raw counts
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= True)
print('\n')
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= False)

print('\nFinished !')

--Normalize: True - Substring Length: 7 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.001. Mean Val Score: 50.00
----Increase in Score. Penal: l1 - Regu: 1. Mean Val Score: 63.35
----Increase in Score. Penal: l1 - Regu: 5. Mean Val Score: 72.90
----Increase in Score. Penal: l1 - Regu: 10. Mean Val Score: 72.95
----Increase in Score. Penal: l2 - Regu: 0.05. Mean Val Score: 73.25
----Increase in Score. Penal: l2 - Regu: 0.1. Mean Val Score: 73.85
----Increase in Score. Penal: l2 - Regu: 0.5. Mean Val Score: 74.30
----Increase in Score. Penal: l2 - Regu: 1. Mean Val Score: 74.90
----Increase in Score. Penal: l2 - Regu: 5. Mean Val Score: 76.00
--Normalize: True - Substring Length: 7 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 7 - Mismatch Tolerance: 3


--Normalize: False - Substring Length: 7 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.001. Mean Val Score: 50.00
----Increase in Score. Penal: l1 - Regu: 0.05. Mean Val Score: 69

In [15]:
# Logistic-regression grid search on dataset 1, restricted to substrings of length 7
data = tr1

# Search grid: a single substring length, with mismatch tolerances up to 3
sub_lengths = [7]
tols = [1, 2, 3]
regularizations = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
penalties = ['l1', 'l2']

# Two independent searches (each call starts from a best score of 0):
# normalized embeddings first, then raw counts
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= True)
print('\n')
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= False)

print('\nFinished !')

--Normalize: True - Substring Length: 7 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.001. Mean Val Score: 50.00
----Increase in Score. Penal: l1 - Regu: 0.5. Mean Val Score: 79.40
----Increase in Score. Penal: l1 - Regu: 1. Mean Val Score: 83.55
----Increase in Score. Penal: l1 - Regu: 5. Mean Val Score: 87.85
----Increase in Score. Penal: l2 - Regu: 5. Mean Val Score: 88.35
----Increase in Score. Penal: l2 - Regu: 10. Mean Val Score: 88.65
--Normalize: True - Substring Length: 7 - Mismatch Tolerance: 2
--Normalize: True - Substring Length: 7 - Mismatch Tolerance: 3


--Normalize: False - Substring Length: 7 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.001. Mean Val Score: 50.00
----Increase in Score. Penal: l1 - Regu: 0.005. Mean Val Score: 77.05
----Increase in Score. Penal: l1 - Regu: 0.01. Mean Val Score: 79.30
----Increase in Score. Penal: l1 - Regu: 0.05. Mean Val Score: 86.65
----Increase in Score. Penal: l1 - Regu: 0.1. Mean Val Score

In [16]:
# Logistic-regression grid search on dataset 2, restricted to substrings of length 7
data = tr2

# Search grid: a single substring length, with mismatch tolerances up to 3
sub_lengths = [7]
tols = [1, 2, 3]
regularizations = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
penalties = ['l1', 'l2']

# Two independent searches (each call starts from a best score of 0):
# normalized embeddings first, then raw counts
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= True)
print('\n')
gd_log_reg(data, sub_lengths, tols, regularizations, penalties, normalize= False)

print('\nFinished !')

--Normalize: True - Substring Length: 7 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.001. Mean Val Score: 50.00
----Increase in Score. Penal: l1 - Regu: 1. Mean Val Score: 54.95
----Increase in Score. Penal: l1 - Regu: 5. Mean Val Score: 62.70
----Increase in Score. Penal: l2 - Regu: 0.5. Mean Val Score: 64.80
----Increase in Score. Penal: l2 - Regu: 1. Mean Val Score: 65.45
----Increase in Score. Penal: l2 - Regu: 5. Mean Val Score: 66.15
--Normalize: True - Substring Length: 7 - Mismatch Tolerance: 2
----Increase in Score. Penal: l2 - Regu: 5. Mean Val Score: 66.50
--Normalize: True - Substring Length: 7 - Mismatch Tolerance: 3


--Normalize: False - Substring Length: 7 - Mismatch Tolerance: 1
----Increase in Score. Penal: l1 - Regu: 0.001. Mean Val Score: 50.00
----Increase in Score. Penal: l1 - Regu: 0.01. Mean Val Score: 50.25
----Increase in Score. Penal: l1 - Regu: 0.05. Mean Val Score: 59.40
----Increase in Score. Penal: l1 - Regu: 0.1. Mean Val Score: 62.