## EARLIER EXPERIMENTS

In [1]:
import pandas as pd
import numpy as np
#import networkx as nx
import sys

# for the bag of word features 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# stable numerical implementation of sigmoid
from scipy.special import expit

In [2]:
import cvxopt
from cvxopt import matrix, spmatrix, solvers

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
#cosine_similarity(X, Y)

In [4]:
import time

In [5]:
def load_sequence(kind='X', root='tr', number=3):
    """Load and stack the CSV files ``./data/<kind><root><d>.csv`` for d in 0..number-1.

    Parameters
    ----------
    kind : str
        File-name prefix. ``'X'`` selects the sequence files; any other value is
        treated as a label file (this only affects the columns of the empty
        fallback frame returned when ``number`` is 0).
    root : str
        Split tag used in the file name (the notebook passes ``'tr'`` and ``'te'``).
    number : int
        How many files to read.

    Returns
    -------
    pandas.DataFrame
        The rows of all files in file order, re-indexed from 0.
    """
    frames = [pd.read_csv('./data/%s%s%d.csv' % (kind, root, d)) for d in range(number)]

    if not frames:
        # Nothing was read: still hand back a frame with the expected columns.
        columns = ['Id', 'seq'] if kind == 'X' else ['Id', 'Bound']
        return pd.DataFrame(columns=columns)

    # DataFrame.append no longer exists in pandas >= 2.0; one concat also avoids
    # re-copying the accumulated frame on every iteration and keeps the CSV dtypes.
    return pd.concat(frames, ignore_index=True)

def load_features(root='tr', number=3):
    """Load and vertically stack ``./data/X<root><d>_mat100.csv`` for d in 0..number-1.

    Parameters
    ----------
    root : str
        Split tag used in the file name.
    number : int
        How many feature files to read.

    Returns
    -------
    numpy.ndarray
        The matrices read by ``np.loadtxt`` stacked row-wise in file order.
    """
    kind = 'X'

    feats = [np.loadtxt('./data/%s%s%d_mat100.csv' % (kind, root, d)) for d in range(number)]

    # np.vstack needs a real sequence; handing it a generator is rejected by
    # current NumPy releases.
    return np.vstack(feats)


def getKmers(sequence, size=5):
    """Return every overlapping window of length ``size`` from ``sequence``, lower-cased.

    A sequence shorter than ``size`` yields an empty list.
    """
    n_windows = len(sequence) - size + 1
    kmers = []
    for start in range(n_windows):
        window = sequence[start:start + size]
        kmers.append(window.lower())
    return kmers


def get_features(dF, size=5, normed=False, rang=(4,4)):
    """Bag-of-k-mers feature matrix for the sequences in ``dF['seq']``.

    Each sequence is cut into overlapping k-mers of length ``size``; the k-mers
    are joined with spaces into one "sentence" per sequence and vectorised.

    Parameters
    ----------
    dF : pandas.DataFrame
        Must contain a ``'seq'`` column of strings. It is not modified.
    size : int
        k-mer length passed to ``getKmers``.
    normed : bool
        If true use ``TfidfVectorizer``, otherwise ``CountVectorizer``.
    rang : tuple
        Forwarded as ``ngram_range`` to the vectoriser.

    Returns
    -------
    The matrix returned by the vectoriser's ``fit_transform`` (one row per
    sequence, in the row order of ``dF``).
    """
    # Address the sequence column by name: the previous positional lookup
    # (column 1 after dropping 'seq') silently read the wrong column whenever
    # the frame was not laid out exactly as ['Id', 'seq'].
    texts = [' '.join(getKmers(seq, size=size)) for seq in dF['seq']]

    vectorizer_cls = TfidfVectorizer if normed else CountVectorizer
    cv = vectorizer_cls(ngram_range=rang)
    return cv.fit_transform(texts)

In [6]:
# Read the three training sequence files, the three test sequence files and
# the three training label files, each stacked into a single frame.
sequences_train = load_sequence(number=3)
sequences_test = load_sequence(number=3, root='te')
labels_train = load_sequence(kind='Y' ,root='tr', number=3)

In [7]:
sequences_test.shape, labels_train.shape, sequences_train.shape

((3000, 2), (6000, 2), (6000, 2))

In [8]:
# 'Bound' column as an integer array; build_training slices it per dataset.
all_labels = labels_train.Bound.values.astype(int)

In [10]:
def build_training(k, sizes=[4,5], normed=False, rang=(4,4), n_train=2000, n_test=1000):
    """Build linear k-mer Gram matrices for dataset ``k``.

    Reads the module-level ``sequences_train``, ``sequences_test`` and
    ``all_labels``. Rows ``n_train*k .. n_train*(k+1)`` of the training frame and
    rows ``n_test*k .. n_test*(k+1)`` of the test frame are vectorised together
    (so both share one vocabulary), once per k-mer size in ``sizes``.

    Parameters
    ----------
    k : int
        Dataset index.
    sizes : list of int
        k-mer lengths; one kernel is produced per entry. Not modified.
    normed, rang
        Forwarded to ``get_features``.
    n_train, n_test : int
        Number of training / test rows per dataset (defaults match the
        previously hard-coded 2000 / 1000).

    Returns
    -------
    K_xx : ndarray, shape (len(sizes), n_train, n_train)
        Train-vs-train inner products of the feature rows.
    K_yx : ndarray, shape (len(sizes), n_test, n_train)
        Test-vs-train inner products.
    y_train : ndarray
        Slice of ``all_labels`` for dataset ``k``.
    """
    train_rows = slice(n_train * k, n_train * (k + 1))
    test_rows = slice(n_test * k, n_test * (k + 1))

    # DataFrame.append was removed in pandas 2.0; concat keeps the same row order.
    dataset = pd.concat([sequences_train.iloc[train_rows], sequences_test.iloc[test_rows]])

    print('performing counts..........')
    K_xx, K_yx = [], []

    for size in sizes:
        counts = get_features(dataset, size=size, normed=normed, rang=rang)

        counts_train = counts[:n_train]
        counts_test = counts[n_train:]
        # Use the sparse matrix's own .dot: np.dot does not understand scipy
        # sparse operands and only worked by accident of operator overloading.
        K_xx.append(counts_train.dot(counts_train.T).toarray())
        K_yx.append(counts_test.dot(counts_train.T).toarray())

    y_train = all_labels[train_rows]

    return np.array(K_xx), np.array(K_yx), y_train

## THE PYTHON CLASS FOR MULTIPLE KERNEL LEARNING (KLR, SVM AND KRR)

In [11]:
class MultiKerOpt():
    """Multiple-kernel learner with a polynomial combination of base kernels.

    The combined kernel is ``(sum_m coef[m] * Kernels[m]) ** degree`` (element-wise
    power). ``fit`` alternates between (1) solving for the dual vector ``delta``
    of a kernel machine on the combined kernel and (2) a projected gradient step
    on the kernel weights ``coef``.

    Parameters
    ----------
    alpha : float
        Regularisation strength.
    tol : float
        ``fit`` stops once the largest change in ``coef`` falls below this.
    degree : int
        Element-wise power applied to the weighted kernel sum.
    method : str
        ``'klr'`` (kernel logistic regression), ``'svm'`` (cvxopt QP), anything
        else is treated as kernel ridge regression.
    hide : bool
        If true, cvxopt's progress log is silenced during SVM solves.

    Attributes set by ``fit``
    -------------------------
    coef : ndarray, shape (n_kernels,)
    delta : ndarray, shape (n_samples, 1)
    """

    def __init__(self, alpha=0.01, tol=1e-07, degree=2, method='klr', hide=False):
        self.alpha = alpha
        self.tol = tol
        self.degree = degree
        self.method = method
        self.hide  = hide

    def _combine(self, Kernels, coef, power):
        """Weighted sum of the base kernels raised element-wise to ``power``."""
        return np.sum(Kernels * coef[:, None, None], axis=0) ** power

    def scale(self, u, norm):
        """Divide ``u`` by its l1 or l2 norm.

        Raises ``ValueError`` for any other ``norm`` value.
        """
        if norm=='l1':
            # The l1 norm is the sum of absolute values; dividing by the signed
            # sum blew up (or flipped sign) for mixed-sign vectors.
            return u / np.sum(np.abs(u))
        elif norm=='l2':
            return u / np.sqrt(np.sum(u**2))
        else:
            raise ValueError('l1 and l2 are the only available norms')

    def bound(self, u, u_0, gamma, norm):
        """Rescale ``u - u_0`` to norm ``gamma``, take absolute values, and add ``u_0`` back."""
        u__ = u - u_0
        u__ = np.abs(self.scale(u__, norm) * gamma)
        return u__ + u_0

    def KrrIterate(self, Kernels, y, coef, weights = None):
        """Solve (weighted) kernel ridge regression on the combined kernel.

        Unweighted: ``c = (K + alpha*I)^-1 y``.
        Weighted:   ``c = W^1/2 (W^1/2 K W^1/2 + alpha*I)^-1 W^1/2 y``.

        Returns ``c`` with shape (n_samples, 1).
        """
        K_w = self._combine(Kernels, coef, self.degree)
        N, D = K_w.shape
        ridge = self.alpha * np.eye(N, D)
        if weights is None:
            # Solve against the regularised kernel directly. The previous code
            # inverted it first, so solve() returned (K + alpha*I) @ y instead
            # of the ridge solution.
            c = np.linalg.solve(K_w + ridge, y[:, np.newaxis])
        else:
            W_r = np.diag(np.sqrt(weights))
            A = W_r.dot(K_w).dot(W_r) + ridge
            Y = np.dot(W_r, y[:, np.newaxis])
            x_sol = np.linalg.solve(A, Y)
            c = np.dot(W_r, x_sol)
        return c

    def KlrIterate(self, Kernels, y, coef, tol=1e-07, max_iters=5):
        """Kernel logistic regression by repeated weighted ridge solves.

        Starts from the unweighted ridge solution, then for at most ``max_iters``
        rounds builds per-sample weights and a working response from the current
        margins and re-solves with ``KrrIterate``. Stops early when successive
        solutions differ by less than ``tol`` in Euclidean norm.

        ``y`` is expected in {0, 1}; it is mapped to {-1, +1} internally.
        Returns an array of shape (n_samples, 1).
        """
        c_old = self.KrrIterate(Kernels, y, coef)
        K_w = self._combine(Kernels, coef, self.degree)
        y_enc = 2*y-1

        for i in range(max_iters):
            m_t = np.dot(K_w, c_old)
            p_t = -expit(-y_enc[:, np.newaxis]*m_t)
            w_t = expit(m_t)*expit(-m_t)
            # 1e-05 keeps the division finite when the weights saturate to 0.
            z_t = m_t - (p_t * y_enc[:, np.newaxis]) /(w_t+ 1e-05)
            c_new = self.KrrIterate(Kernels, z_t.flatten(), coef, weights=w_t.flatten())
            if np.linalg.norm(c_new - c_old)<tol:
                break
            else:
                c_old = c_new
        return c_old

    def SvmIterate(self, Kernels, y, coef):
        """Solve the SVM dual on the combined kernel with cvxopt.

        Minimises ``1/2 c^T K c - y^T c`` subject to ``0 <= y_i * c_i <= C`` with
        ``C = 1 / (2 * alpha * n_samples)`` and ``y`` mapped from {0, 1} to
        {-1, +1}. Returns ``c`` with shape (n_samples, 1).
        """
        nb_samples = y.shape[0]
        C = 1 / ( 2 * self.alpha * nb_samples)

        r = np.arange(nb_samples)
        o = np.ones(nb_samples)
        z = np.zeros(nb_samples)

        K_w  = self._combine(Kernels, coef, self.degree)

        y_enc = 2*y-1

        P = matrix(K_w.astype(float), tc='d')
        q = matrix(-y_enc, tc='d')
        # Rows 0..n-1 encode y_i*c_i <= C, rows n..2n-1 encode -y_i*c_i <= 0.
        G = spmatrix(np.r_[y_enc, -y_enc], np.r_[r, r + nb_samples], np.r_[r, r], tc='d')
        h = matrix(np.r_[o * C, z], tc='d')

        # Set the flag both ways: it is global to cvxopt, so only ever switching
        # it off left every later hide=False model silent as well.
        solvers.options['show_progress'] = not self.hide
        sol = solvers.qp(P, q, G, h)
        c = np.ravel(sol['x'])[:,np.newaxis]

        return c

    def gradUpdate(self, Kernels, coef, delta):
        """Gradient with respect to ``coef``, evaluated at the dual vector ``delta``.

        Component m is ``-degree * delta^T (K_sum**(degree-1) * Kernels[m]) delta``
        where ``K_sum`` is the weighted kernel sum and ``*`` is element-wise.
        Returns a 1-D float array of length ``len(Kernels)``.
        """
        K_t = self._combine(Kernels, coef, self.degree - 1)
        # Work with a flat vector so each quadratic form is a true scalar
        # (assigning a (1, 1) array into a float slot is deprecated in NumPy).
        d = np.ravel(delta)
        grad = np.array([d.dot(K_t * K_m).dot(d) for K_m in Kernels], dtype=float)

        return - self.degree * grad

    def fit(self, Kernels, y, u_0=0, gamma=1, norm='l2', n_iter=5, step=1, weights=None):
        """Alternate dual solves and projected gradient steps on the kernel weights.

        Parameters
        ----------
        Kernels : ndarray, shape (n_kernels, n_samples, n_samples)
        y : ndarray, shape (n_samples,)
            Labels in {0, 1} for 'klr'/'svm'; regression targets otherwise.
        u_0, gamma, norm
            Projection parameters forwarded to ``bound``.
        n_iter : int
            Maximum number of alternations.
        step : float
            Initial gradient step; shrunk by 0.9 whenever the update grows.
        weights : array or None
            Sample weights, used only by the ridge branch.

        Returns
        -------
        ndarray
            The final kernel weights (also stored as ``self.coef``).
        """
        coef = np.random.normal(0, 1, len(Kernels)) / len(Kernels)
        coef = self.bound(coef, u_0, gamma, norm)
        delta = None

        score_prev = np.inf

        for i in range(n_iter):
            if self.method=='klr':
                delta = self.KlrIterate(Kernels, y, coef, tol=1e-07, max_iters=5)
            elif self.method=='svm':
                delta = self.SvmIterate(Kernels, y, coef)
            else:
                delta = self.KrrIterate(Kernels, y, coef, weights = weights)

            grad = self.gradUpdate(Kernels, coef, delta)

            new_coef = self.bound(coef - step * grad, u_0, gamma, norm)

            score = np.linalg.norm(new_coef - coef, np.inf)

            if score>score_prev:
                step *= 0.9

            coef = new_coef
            score_prev = score

            if score<self.tol:
                # Converged: leave the loop instead of returning, so the fitted
                # state below is always stored and predict() stays usable.
                break

        # NOTE(review): delta was solved for the weights of the previous step,
        # not for the final coef; this matches the original behaviour.
        self.coef, self.delta = coef, delta
        return coef

    def predict(self, Kernels):
        """Predict from kernels of shape (n_kernels, n_eval, n_train).

        Returns the sign of the decision values, mapped from {-1, +1} to {0, 1}
        unless ``method == 'krr'``.
        """
        K_w = self._combine(Kernels, self.coef, self.degree)
        y__ = np.sign(K_w.dot(self.delta)).flatten()
        if self.method != 'krr':
            y__ = 0.5 * (y__ + 1)
        return y__

    def score(self, Kernels, y):
        """Accuracy in percent, or mean squared error when ``method == 'krr'``."""
        y__ = self.predict(Kernels)
        if self.method!='krr':
            score = 100*(y__==y).mean()
        else:
            score = np.mean((y__- y)**2)
        return score
                

In [12]:
def CvSearch(K_xx, K_yx, y, method='svm', degrees=[4], alphas=[0.01], cv=5, random_state=None):
    """Grid-search ``degree`` x ``alpha`` for ``MultiKerOpt`` with k-fold cross-validation.

    Parameters
    ----------
    K_xx : ndarray, shape (n_kernels, n_samples, n_samples)
        Training Gram matrices.
    K_yx : ndarray
        Unused; kept so existing calls keep working.
    y : ndarray, shape (n_samples,)
        Training labels.
    method : str
        Forwarded to ``MultiKerOpt``.
    degrees, alphas : sequences
        Grid values. Not modified.
    cv : int
        Number of folds.
    random_state : int or None
        Seed for the fold shuffle; ``None`` uses NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        One row per (degree, alpha) with columns ``degree``, ``alpha``,
        ``train`` and ``val`` holding the fold-averaged ``MultiKerOpt.score``.
    """
    tt = time.time()

    n_iters = cv * len(degrees) * len(alphas)
    n_samples = y.shape[0]

    # Draw the folds once, outside the grid loops, so every (degree, alpha)
    # pair is scored on exactly the same splits and the rows are comparable.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    folds = np.array_split(rng.permutation(n_samples), cv)
    all_idx = np.arange(n_samples)

    records = []
    i = 0

    for degree in degrees:
        for alpha in alphas:
            perfs_train = []
            perfs_val = []

            for val in folds:
                i += 1
                sys.stderr.write('\rIteration %d/%d ' %(i, n_iters))
                sys.stderr.flush()

                train = np.setdiff1d(all_idx, val)

                clf = MultiKerOpt(alpha=alpha, tol=1e-07, degree=degree, method=method, hide=True)

                # (rows, cols) = (train, train) block of every kernel.
                K_train = K_xx[:, train.reshape(-1, 1), train]
                clf.fit(K_train, y[train])

                perfs_train.append(clf.score(K_train, y[train]))
                # Validation rows against training columns.
                perfs_val.append(clf.score(K_xx[:, val.reshape(-1, 1), train], y[val]))

            records.append({'degree': degree,
                            'alpha': alpha,
                            'train': np.mean(np.array(perfs_train)),
                            'val': np.mean(np.array(perfs_val))})

    df = pd.DataFrame(records, columns=['degree', 'alpha', 'train', 'val'])

    tt = time.time() - tt
    # Elapsed time is reported in minutes.
    print('Done in %.3f'%(tt/60))

    return df
#

In [13]:
def get_best(df):
    """Return ``(degree, alpha, val)`` of the row with the highest ``val``.

    ``df`` must have ``degree``, ``alpha`` and ``val`` columns. Ties resolve to
    the first maximal row.
    """
    vals = df['val'].to_numpy()
    idx = int(np.argmax(vals))
    best = vals[idx]

    # argmax is a row position, so look the row up positionally; label lookup
    # only happened to work while the frame kept its default 0..n-1 index.
    best_degree = df['degree'].iloc[idx]
    best_alpha = df['alpha'].iloc[idx]
    return best_degree, best_alpha, best

In [14]:
# Regularisation strengths for the grid search: powers of ten from 10**-3 to 10**1.
REG_PARAMS_SPAN = [10**i for i in range(-3, 2)] #+ [10**i/2 for i in range(-10, 10)]
print(REG_PARAMS_SPAN)

[0.001, 0.01, 0.1, 1, 10]


## DATASET 0

In [15]:
# Dataset 0: one linear k-mer kernel per size in 4..7.
# NOTE: X_yx_0 is the second output of build_training, i.e. the test-vs-train kernels.
K_xx_0, X_yx_0, y_train_0 = build_training(0, sizes=[4,5,6,7], normed=False, rang=(4,4))

performing counts..........


In [16]:
# Set RUN = True to redo the cross-validated grid search and overwrite the
# cached CSV; with RUN = False the cached results are loaded instead.
RUN = False

if RUN:
    df  = CvSearch(K_xx_0, X_yx_0, y_train_0, method='svm', degrees=[1, 2, 3, 4], 
               alphas=REG_PARAMS_SPAN, cv=5)
    df.to_csv('./CV/X0_cv5_linear_kmers.csv', index=False)
else:
    df = pd.read_csv('./CV/X0_cv5_linear_kmers.csv')

In [17]:
df

Unnamed: 0,degree,alpha,train,val
0,1,0.01,100.0,60.05
1,1,0.1,96.125,61.8
2,1,1.0,91.95,58.75
3,1,10.0,92.2375,59.4
4,2,0.01,100.0,59.5
5,2,0.1,100.0,59.2
6,2,1.0,100.0,59.65
7,2,10.0,99.775,59.9
8,3,0.01,100.0,59.25
9,3,0.1,100.0,58.15


In [18]:
# Hyper-parameters of the grid row with the highest validation score for dataset 0.
best_degree_0, best_alpha_0, best_0 = get_best(df)
print(best_degree_0, best_alpha_0, best_0)

1 0.1 61.8


In [21]:
# Refit on all of dataset 0 with the selected hyper-parameters and predict the
# test block. With hide=False each of the n_iter SVM solves prints its solver log.
clf0 = MultiKerOpt(alpha=best_alpha_0, tol=1e-07, degree=best_degree_0, method='svm', hide=False)
clf0.fit(K_xx_0, y_train_0, n_iter=5)
y_pred_0 = clf0.predict(X_yx_0)

     pcost       dcost       gap    pres   dres
 0: -6.9844e+00 -1.2092e+01  5e+03  7e+01  2e-15
 1: -6.9617e+00 -1.1738e+01  3e+02  4e+00  2e-15
 2: -6.0312e+00 -9.5629e+00  4e+01  6e-01  1e-15
 3: -3.6275e+00 -7.3749e+00  6e+00  3e-02  2e-15
 4: -3.5655e+00 -4.0089e+00  5e-01  1e-03  1e-15
 5: -3.7170e+00 -3.7908e+00  8e-02  2e-04  1e-15
 6: -3.7476e+00 -3.7540e+00  7e-03  1e-05  1e-15
 7: -3.7505e+00 -3.7507e+00  3e-04  3e-07  1e-15
 8: -3.7506e+00 -3.7506e+00  9e-06  8e-09  1e-15
 9: -3.7506e+00 -3.7506e+00  3e-07  1e-10  1e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -5.8753e+00 -1.1004e+01  4e+03  7e+01  2e-15
 1: -5.8610e+00 -1.0751e+01  2e+02  3e+00  2e-15
 2: -5.1712e+00 -9.1158e+00  4e+01  5e-01  1e-15
 3: -3.3804e+00 -7.1352e+00  6e+00  3e-02  3e-15
 4: -3.3474e+00 -3.8042e+00  5e-01  1e-03  1e-15
 5: -3.5061e+00 -3.5776e+00  7e-02  2e-04  1e-15
 6: -3.5362e+00 -3.5449e+00  9e-03  2e-05  1e-15
 7: -3.5403e+00 -3.5408e+00  4e-04  6e-07  1e-1

In [22]:
# Sanity check: mean training label vs. mean predicted test label.
print(y_train_0.mean(), y_pred_0.mean())

0.4885 0.567


## DATASET 1

In [24]:
# Dataset 1: one linear k-mer kernel per size in 4..6 (no size-7 kernel here).
# NOTE: X_yx_1 is the second output of build_training, i.e. the test-vs-train kernels.
K_xx_1, X_yx_1, y_train_1 = build_training(1, sizes=[4,5,6], normed=False, rang=(4,4))

performing counts..........


In [25]:
# Set RUN = True to redo the cross-validated grid search and overwrite the
# cached CSV; with RUN = False the cached results are loaded instead.
RUN = False

if RUN:
    df1  = CvSearch(K_xx_1, X_yx_1, y_train_1, method='svm', degrees=[1, 2, 3, 4], 
               alphas=REG_PARAMS_SPAN, cv=5)
    df1.to_csv('./CV/X1_cv5_linear_kmers.csv', index=False)
else:
    df1 = pd.read_csv('./CV/X1_cv5_linear_kmers.csv')

In [26]:
df1

Unnamed: 0,degree,alpha,train,val
0,1,0.01,100.0,74.55
1,1,0.1,98.4875,74.95
2,1,1.0,97.7875,73.3
3,1,10.0,97.2375,73.9
4,2,0.01,100.0,75.75
5,2,0.1,100.0,77.05
6,2,1.0,100.0,75.85
7,2,10.0,100.0,75.4
8,3,0.01,100.0,75.05
9,3,0.1,100.0,75.55


In [28]:
# Hyper-parameters of the grid row with the highest validation score for dataset 1.
best_degree_1, best_alpha_1, best_1 = get_best(df1)
print(best_degree_1, best_alpha_1, best_1)

2 0.1 77.05


In [29]:
# Refit on all of dataset 1 with the selected hyper-parameters and predict the
# test block. hide defaults to False, so each SVM solve prints its solver log.
clf1 = MultiKerOpt(alpha=best_alpha_1, tol=1e-07, degree=best_degree_1, method='svm')
clf1.fit(K_xx_1, y_train_1, n_iter=5)
y_pred_1 = clf1.predict(X_yx_1)

     pcost       dcost       gap    pres   dres
 0: -4.1068e-02 -5.0415e+00  4e+03  6e+01  2e-15
 1: -4.1066e-02 -5.0176e+00  4e+01  6e-01  1e-15
 2: -3.6185e-02 -3.3088e+00  4e+00  6e-03  1e-15
 3: -3.4843e-02 -1.2588e-01  9e-02  1e-04  1e-15
 4: -4.0990e-02 -4.4988e-02  4e-03  1e-06  1e-15
 5: -4.1067e-02 -4.1348e-02  3e-04  7e-08  1e-15
 6: -4.1068e-02 -4.1075e-02  6e-06  1e-09  1e-15
 7: -4.1068e-02 -4.1068e-02  1e-07  3e-11  1e-15
 8: -4.1068e-02 -4.1068e-02  3e-09  3e-13  1e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -3.8044e-02 -5.0384e+00  4e+03  6e+01  2e-15
 1: -3.8043e-02 -5.0145e+00  4e+01  6e-01  1e-15
 2: -3.3488e-02 -3.3013e+00  4e+00  6e-03  1e-15
 3: -3.1796e-02 -1.2355e-01  9e-02  1e-04  1e-15
 4: -3.7959e-02 -4.2008e-02  4e-03  1e-06  1e-15
 5: -3.8044e-02 -3.8322e-02  3e-04  7e-08  1e-15
 6: -3.8044e-02 -3.8050e-02  6e-06  1e-09  1e-15
 7: -3.8044e-02 -3.8045e-02  1e-07  2e-11  1e-15
 8: -3.8044e-02 -3.8044e-02  3e-09  3e-13  9e-1

In [30]:
# Sanity check: mean training label vs. mean predicted test label.
print(y_train_1.mean(), y_pred_1.mean())

0.499 0.387


## DATASET 2

In [31]:
# Dataset 2: one linear k-mer kernel per size in 4..7.
# NOTE: X_yx_2 is the second output of build_training, i.e. the test-vs-train kernels.
K_xx_2, X_yx_2, y_train_2 = build_training(2, sizes=[4,5,6,7], normed=False, rang=(4,4))

performing counts..........


In [32]:
# Set RUN = True to redo the cross-validated grid search and overwrite the
# cached CSV; with RUN = False the cached results are loaded instead.
RUN = False
if RUN:
    df2  = CvSearch(K_xx_2, X_yx_2, y_train_2, method='svm', degrees=[1, 2, 3, 4], 
               alphas=REG_PARAMS_SPAN, cv=5)
    df2.to_csv('./CV/X2_cv5_linear_kmers.csv', index=False)
else:
    df2 = pd.read_csv('./CV/X2_cv5_linear_kmers.csv')

In [33]:
df2

Unnamed: 0,degree,alpha,train,val
0,1,0.01,100.0,65.65
1,1,0.1,97.975,65.8
2,1,1.0,93.525,62.3
3,1,10.0,89.5875,61.2
4,2,0.01,100.0,66.1
5,2,0.1,100.0,64.8
6,2,1.0,100.0,66.35
7,2,10.0,99.8875,67.15
8,3,0.01,100.0,64.8
9,3,0.1,100.0,64.05


In [34]:
# Hyper-parameters of the grid row with the highest validation score for dataset 2.
best_degree_2, best_alpha_2, best_2 = get_best(df2)
print(best_degree_2, best_alpha_2, best_2)

2 10.0 67.15


In [35]:
# Refit on all of dataset 2 with the selected hyper-parameters and predict the
# test block. hide defaults to False, so each SVM solve prints its solver log.
clf2 = MultiKerOpt(alpha=best_alpha_2, tol=1e-07, degree=best_degree_2, method='svm')
clf2.fit(K_xx_2, y_train_2, n_iter=5)
y_pred_2 = clf2.predict(X_yx_2)

     pcost       dcost       gap    pres   dres
 0: -4.2268e-02 -9.2269e-02  4e+03  6e+01  2e-15
 1: -4.2268e-02 -9.2259e-02  4e+01  6e-01  1e-15
 2: -4.2260e-02 -9.1325e-02  8e-01  1e-02  1e-15
 3: -3.7738e-02 -7.4698e-02  1e-01  2e-03  1e-15
 4: -3.0578e-02 -5.6923e-02  4e-02  3e-04  2e-15
 5: -3.1298e-02 -3.4284e-02  3e-03  6e-06  1e-15
 6: -3.2716e-02 -3.3140e-02  4e-04  7e-07  1e-15
 7: -3.2923e-02 -3.2985e-02  6e-05  8e-08  9e-16
 8: -3.2953e-02 -3.2961e-02  8e-06  8e-09  1e-15
 9: -3.2957e-02 -3.2958e-02  9e-07  8e-10  1e-15
10: -3.2958e-02 -3.2958e-02  3e-08  1e-11  1e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -3.9123e-02 -8.9125e-02  4e+03  6e+01  2e-15
 1: -3.9123e-02 -8.9117e-02  4e+01  6e-01  1e-15
 2: -3.9117e-02 -8.8333e-02  8e-01  1e-02  1e-15
 3: -3.5429e-02 -7.3293e-02  1e-01  2e-03  1e-15
 4: -2.9481e-02 -5.5504e-02  4e-02  3e-04  2e-15
 5: -3.0214e-02 -3.3167e-02  3e-03  5e-06  2e-15
 6: -3.1626e-02 -3.2026e-02  4e-04  5e-07  1e-1

In [36]:
# Sanity check: mean training label vs. mean predicted test label.
print(y_train_2.mean(), y_pred_2.mean())

0.4995 0.424


## PREDICTIONS FILES

In [37]:
# Unweighted average of the three best cross-validation scores (one per dataset).
exp_score = (1/3)*(best_0 + best_1 + best_2)
print(exp_score)

68.66666666666666


In [38]:
# Concatenate the per-dataset predictions in dataset order and cast them to int.
y_pred = np.hstack((y_pred_0, y_pred_1, y_pred_2)).astype(int)

In [39]:
y_pred

array([1, 0, 1, ..., 1, 0, 0])

In [40]:
# Row identifiers 0..2999 for the submission file.
Ids = np.arange(3000)

In [41]:
# Two-column submission table: row id and predicted label.
predictions = pd.DataFrame({'Id':Ids, 'Bound':y_pred.flatten()})

In [42]:
predictions.head()

Unnamed: 0,Id,Bound
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0


In [None]:
labels_train.head()

In [43]:
# Write the submission file without the DataFrame index column.
predictions.to_csv('predictions9.csv', index=False)