In [166]:
  # importation
import pandas as pd
!pip install easy_dna
import numpy as np
from collections import Counter



In [185]:
Xtr0 = pd.read_csv("./data/Xtr0.csv", sep=",", index_col=0)
Xtr1 = pd.read_csv("./data/Xtr1.csv", sep=",", index_col=0).values
Xtr2 = pd.read_csv("./data/Xtr2.csv", sep=",", index_col=0).values

# shape (2000,100): float
Xtr0_mat100 = pd.read_csv("./data/Xtr0_mat100.csv", sep=" ", header=None).values
Xtr1_mat100 = pd.read_csv("./data/Xtr1_mat100.csv", sep=" ", header=None).values
Xtr2_mat100 = pd.read_csv("./data/Xtr2_mat100.csv", sep=" ", header=None).values

# shape (2000,1): string
Xte0 = pd.read_csv("./data/Xte0.csv", sep=",", index_col=0).values
Xte1 = pd.read_csv("./data/Xte1.csv", sep=",", index_col=0).values
Xte2 = pd.read_csv("./data/Xte2.csv", sep=",", index_col=0).values

# shape (2000,100): float
Xte0_mat100 = pd.read_csv("./data/Xte0_mat100.csv", sep=" ", header=None).values
Xte1_mat100 = pd.read_csv("./data/Xte1_mat100.csv", sep=" ", header=None).values
Xte2_mat100 = pd.read_csv("./data/Xte2_mat100.csv", sep=" ", header=None).values

# shape (2000,1): 0 or 1
Ytr0 = pd.read_csv("./data/Ytr0.csv", sep=",", index_col=0).values
Ytr1 = pd.read_csv("./data/Ytr1.csv", sep=",", index_col=0).values
Ytr2 = pd.read_csv("./data/Ytr2.csv", sep=",", index_col=0).values

#let us print the sequence 929

#print(f" the sequence 929 has a length of  {length} " )

#Rescaling labels
Ytr0 = np.where(Ytr0 == 0, -1, 1)
Ytr1 = np.where(Ytr1 == 0, -1, 1)
Ytr2 = np.where(Ytr2 == 0, -1, 1)


In [186]:
Xtr0.head()

Unnamed: 0_level_0,seq
Id,Unnamed: 1_level_1
0,TCCTGTGCACATCTGCACCCCTGTTGTGGCCACAAAATGATCCGGC...
1,TTAAGTGTATATCTAATAATTTTTTTGCCTACATTCCTGTGTTACC...
2,GTGCTCAATTAGTTGCCTACAAATAGTAGCCTGGCACAGTGTAAGC...
3,CACCTGGAAAATACAAACAGGCGCAAGAAGAGTTAACCCACAGATC...
4,AAATCACTGCCTATCCTTGGGCCAAAAGGTTTCTACAGGAAGCTGC...


In [187]:
Xtr0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   seq     2000 non-null   object
dtypes: object(1)
memory usage: 31.2+ KB


# some Kernels implementation -here we will implement some kernels like linear kernels, gaussian kernels

In [188]:
class Kernel():
    def __init__(self):
        pass

    def similarity(self, x, y, **kwargs):
        """ Similarity between 2 feature vectors (depends on the type of kernel)"""
        return -1

    def gram_matrix(self, X1, X2,**kwargs):
        """ Compute the gram matrix of a data vector X where the (i,j) entry is defined as <Xi,Xj>\\
        X1: data vector (n_samples_1 x n_features)
        X2: data vector (n_samples_2 x n_features), if None compute the gram matrix for (X1,X1)
        """
        if X2 is None: 
            X2=X1
        n_samples_1 = X1.shape[0]
        n_samples_2 = X2.shape[0]
        K = np.zeros((n_samples_1, n_samples_2))
        for i in range(n_samples_1):
            for j in range(n_samples_2):
                K[i,j] = self.similarity(X1[i],X2[j],**kwargs)
        return K

    


In [189]:
class LinearKernels(Kernel):
    def __init():
        super().__init__()
    
    def similarity(self,X1,X2,**kwargs):
        
        return X1.T@X2
    
    
    
 

In [190]:
class GaussianKernel(Kernel):

    def __init__(self):
        super().__init__()
        

    def similarity(self, x, y,**kwargs):
        """ gaussian kernel : k(x,y) = 1/ sqrt(2 pi sigma2)^n * exp( - ||x-y||^2 / 2 sigma^2 )\\
        x, y: array (n_features,)
        """
        sigma=kwargs.get('sigma')
        normalize=kwargs.get('normalize')
        
        if normalize:
            norm_fact = (np.sqrt(2 * np.pi) * sigma) ** len(x)
            return np.exp(-np.linalg.norm(x-y)**2 / (2 *sigma**2)) / norm_fact
        else:
            return np.exp(-np.linalg.norm(x-y)**2 / (2 * sigma**2))


In [191]:
class PolynomialKernel(Kernel):

    def __init__(self):
        super().__init__()
        

    def similarity(self, x, y, **kwargs):
        gamma=kwargs.get('gamma')
        degree=kwargs.get('degree')
        coef=kwargs.get('coef')
        coef0=kwargs.get('coef0')
        """ polynomial kernel : k(x,y) = (gamma <x,y> + r)^d \\
        x, y: array (n_features,)
        """
        return (gamma * np.dot(x,y) + coef0)**degree 


    

In [192]:
# polynomial and gaussian


class PolynomialKernelGaussian(Kernel):

    def __init__(self):
        print("entre dans polyGaussian")
        super().__init__()
        

    def similarity(self, x, y, **kwargs):
        
        gamma=kwargs.get('gamma')
        degree=kwargs.get('degree')
        coef=kwargs.get('coef')
        coef0=kwargs.get('coef0')
        """ polynomial kernel : k(x,y) = (gamma <x,y> + r)^d \\
        x, y: array (n_features,)
        """
        return  0.5*((gamma * np.dot(x,y) + coef0)**degree) + 0.8*(np.exp(-np.linalg.norm(x-y)**2 / (2 * sigma**2)))+2*(x.T@y)


In [193]:

class SpectrumKernel(Kernel):

    def __init__(self):
        super().__init__()
       

    def similarity(self, x, y,**kwargs):
        k= kwargs.get('k')
        x=x[0]
        y=y[0]
        #print(k)
        #print(x)
        #print(y)
        x_substrings = [x[index:index+k] for index in range(len(x)-k+1)]
        y_substrings = [y[index:index+k] for index in range(len(y)-k+1)]
        #print(f'shape of x_substrings is {x_substrings}')
        #print(f'shape of y_substrings is {y_substrings}')
        x_counts = Counter(x_substrings)
        y_counts = Counter(y_substrings)
        substrings = set(x_counts.keys()).intersection(set(y_counts.keys()))
        K = 0
        for string in substrings:
            K += x_counts[string] * y_counts[string]
        return K

# HERE WE MAKE A CLASSIFICATION USING SVM KERNALIZATION

In [194]:
class KernelMethodBase(object):
    '''
    Base class for kernel methods models

    Methods
    ----
    fit
    predict
    fit_K
    predict_K
    '''
    kernels_ = {
        'linear': LinearKernels(),
        'rbf': GaussianKernel(),
        'poly':PolynomialKernel(),
        'poly_gaussian':PolynomialKernelGaussian(),
        'spectrum': SpectrumKernel(),
    }
    
        # 'mismatch': mismatch_kernel,
    
    def __init__(self, kernel='linear', **kwargs):
        print(kernel)
        self.kernel_name = kernel
        self.kernel_function_ = self.kernels_[kernel].gram_matrix
        self.kernel_parameters = self.get_kernel_parameters(**kwargs)
        self.fit_intercept_ = False

    def get_kernel_parameters(self, **kwargs):
        #'linear' 'rbf', 'poly', 'spectrum', 'mismatch' or 'sum' 
        gamma = 10.0
        sigma=np.sqrt(0.5/gamma)
        C = 5.0 #Parameter C for SVM
         #Parameter gamma for SVM (only for 'rbf' or 'poly')
        coef0 = 1.0 #Parameter coef0 for SVM (only for 'poly')
        degree = 3 
        params = {}
        if self.kernel_name == 'rbf':
            params['sigma'] = kwargs.get('sigma', sigma)
            params['normalize']=kwargs.get('normalize', True)
        if self.kernel_name == 'poly':
            params['degree'] = kwargs.get('degree', 10)
            params['coef'] = kwargs.get('coef', 5)
            params['coef0'] = kwargs.get('coef0', 1)
            params['gamma'] = kwargs.get('gamma', 10)
        if self.kernel_name == 'poly_gaussian':
            params['sigma'] = kwargs.get('sigma', sigma)
            params['degree'] = kwargs.get('degree', 3)
            params['coef'] = kwargs.get('coef', 3)
            params['coef0'] = kwargs.get('coef0', 1)
            params['gamma'] = kwargs.get('gamma', 10)
        if self.kernel_name == 'spectrum':
            params['k'] = kwargs.get('k', 3)
            
        return params

    def fit_K(self, K, y, **kwargs):
        """Finds optimal parameters from K(X_train, X_train) and y."""
        pass
        


    def decision_function_K(self, K):
        """Returns decision values from K(X, X_train)."""
        pass

    def fit(self, X, y, fit_intercept=False, **kwargs):

        if fit_intercept:
            X = add_column_ones(X)
            self.fit_intercept_ = True
        self.X_train = X
        self.y_train = y

        K = self.kernel_function_(self.X_train, self.X_train, **self.kernel_parameters)
        print(f'shape of k{K.shape}')
    

        return self.fit_K(K, y, **kwargs)

    def decision_function(self, X):

        if self.fit_intercept_:
            X = add_column_ones(X)

        K_x = self.kernel_function_(X, self.X_train, **self.kernel_parameters)
        print(f'shape of kx{K_x.shape}')
       
        return self.decision_function_K(K_x)

    def predict(self, X):
        """Predicts the labels (e.g. +/- 1 for classification) using the decision function."""
        pass
    
    def predict_K(self, K):
        """Predicts the labels (e.g. +/- 1 for classification) using the decision function."""
        pass

entre dans polyGaussian


In [195]:
!pip install cvxopt
import cvxopt

def cvxopt_qp(P, q, G, h, A, b):
    P = .5 * (P + P.T)
    cvx_matrices = [
        cvxopt.matrix(M.astype(np.double)) if M is not None else None for M in [P, q, G, h, A, b]
    ]
    #cvxopt.solvers.options['show_progress'] = False
    solution = cvxopt.solvers.qp(*cvx_matrices, options={'show_progress': False})
    return np.array(solution['x']).flatten()

solve_qp=cvxopt_qp




In [196]:
def svm_dual_soft_to_qp_kernel(K, y, C=1):
    n = K.shape[0]
    assert (len(y) == n)
    
    # Dual formulation, soft margin
    #print(y)
    print( (np.diag(y.squeeze())))
    P =(np.diag(y.squeeze()))@K@(np.diag(y.squeeze()))

    # As a regularization, we add epsilon * identity to P
    eps = 1e-12
    P += eps * np.eye(n)
    q = - np.ones(n)
    G = np.vstack([-np.eye(n), np.eye(n)])
    h = np.hstack([np.zeros(n), C * np.ones(n)])
    A = y.squeeze()[np.newaxis, :]
    b = np.array([0.])
  

    
    print(f"shape of P{P.shape}")
    print(f"shape of q{q.shape}")
    print(f"shape of G{G.shape}")
    print(f"shape of h{h.shape}")
    print(f"shape of A{A.shape}")
    print(f"shape of b{b.shape}")
    
    return P, q, G, h, A, b


In [201]:
class KernelSVM(KernelMethodBase):
    '''
    Kernel SVM Classification
    
    Methods
    ----
    fit
    predict
    '''
    def __init__(self, C=0.1, **kwargs):
        self.C = C
        super().__init__(**kwargs)
    
    def fit_K(self, K, y, tol=1e-3):
        # Solve dual problem
        self.alpha = solve_qp(*svm_dual_soft_to_qp_kernel(K, y, C=self.C))
        
        
        print(f'shape of alpha{self.alpha.shape}')
        # Compute support vectors and bias b
        sv = np.logical_and((self.alpha > tol), (self.C - self.alpha > tol))
        self.bias = y[sv] - K[sv].dot(self.alpha * y)
        self.bias = self.bias.mean()

        self.support_vector_indices = np.nonzero(sv)[0]
        
        return self
        
    def decision_function_K(self, K_x):
        print(f'alphay {(self.alpha*self.y_train).shape}')
        return K_x.dot(self.alpha.reshape(-1,1)*self.y_train.reshape(-1,1)) + self.bias

    def predict(self, X):
         return np.sign(self.decision_function(X))
        #return np.sign(self.decision_function(X) + 1)//2
        
    

# compute the error

In [202]:
# Prediction error
def error(ypred, ytrue):
    e = (ypred != ytrue).mean()
    return e

def add_column_ones(X):
    n = X.shape[0]
    return np.hstack([X, np.ones((n, 1))])

In [214]:
import tqdm as tqdm
import random
random.seed(42)
kernel = 'spectrum'

sigma = 0.8
gamma=9
degree =2
C =0.1
k=10
#tol = 1e-3
tol = 1e-3
#model0 = KernelSVM(C=C, kernel=kernel)

model0 = KernelSVM(C=C, kernel=kernel,degree=degree,sigma=np.sqrt(0.5/gamma),gamma=gamma,k=k)


print(Xtr0.shape)

n,d=Xtr0.shape
idx=np.arange(n)

idx=np.random.permutation(idx)
print(idx)
x=Xtr1[idx]
Ytr1=Ytr1[idx]
xtrain=x[:int(0.8*n)]

print(x.shape)
y=Ytr1[:int(0.8*n)]
#y=Ytr0[:int(0.8*n)]
print(y.shape)
model0.fit(xtrain,y)
ypred0=model0.predict(x[int(0.8*n):])
print(len(ypred0))
#ypred0=y=np.where(ypred0==-1,0,1)
#plot_decision_function(model, Xtr1, Ytr1,
                      # title='SVM {} Kernel'.format(kernel))
print('Test error: {:.2%}'.format(error(ypred0, Ytr1[int(0.8*n):])))


print(ypred0.shape)

spectrum
(2000, 1)
[ 249 1353 1630 ...  906  782 1031]
(2000, 1)
(1600, 1)
shape of k(1600, 1600)
[[-1  0  0 ...  0  0  0]
 [ 0  1  0 ...  0  0  0]
 [ 0  0  1 ...  0  0  0]
 ...
 [ 0  0  0 ...  1  0  0]
 [ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0  1]]
shape of P(1600, 1600)
shape of q(1600,)
shape of G(3200, 1600)
shape of h(3200,)
shape of A(1, 1600)
shape of b(1,)
shape of alpha(1600,)
shape of kx(400, 1600)
alphay (1600, 1600)
400
Test error: 52.25%
(400, 1)


In [163]:
from sklearn import svm
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split


#define a seed for reproducibility
seed = 1

# Splitting data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(Xtr0, Ytr0.squeeze(), test_size = 0.25, random_state = seed)

model=svm.SVC(kernel = 'linear')
kfold = KFold(n_splits = 10,shuffle=True, random_state = 1)

cv_results = cross_val_score(model, X_train,y_train, cv = kfold, scoring = 'accuracy')


ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/svm/_base.py", line 192, in fit
    X, y = self._validate_data(
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/base.py", line 565, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/pandas/core/generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'GCTGCAATGGAAGTTTATGCACAGGAAAAAATGTGAAAGCTATCCAGCAGGTGGTGCCAAAGTAATGTCTCTTGAAGAGTTCTAATCTTTTTTTCTTTTCT'

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/svm/_base.py", line 192, in fit
    X, y = self._validate_data(
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/base.py", line 565, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/armandine/anaconda3/lib/python3.10/site-packages/pandas/core/generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'ACATGGTGAAACCCCATCTCTACTAAAAATACAAAAACTATCCGGGCATGGTGGTGCATGCCTGTAATTCCAGCTACTAGGGAGGCTGAGGCAGGAAATTG'


In [None]:
msg = "{0}: {1} ({2})".format("linear", cv_results.mean(), cv_results.std())
print(msg)