In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import minimize
from joblib import Parallel, delayed
from tqdm import tqdm
from collections import Counter
from itertools import product
from numba import prange


# Kernel Code

## Substring Kernel

In [2]:
def compute_B_matrix(x, x_prime, k, lambda_decay):
    """Computes the B_k matrix with dynamic programming"""
    n, m = len(x), len(x_prime)
    B = np.zeros((k + 1, n + 1, m + 1))
    B[0, :, :] = 1  # Base case

    for l in range(1, k + 1):
        for i in range(1, n + 1):
            for j in range(1, m + 1):
                if x[i - 1] == x_prime[j - 1]: 
                    B[l, i, j] = lambda_decay * (B[l, i - 1, j] + B[l, i, j - 1] - lambda_decay * B[l, i - 1, j - 1])
                    B[l, i, j] += (lambda_decay ** 2) * B[l - 1, i - 1, j - 1]
                else:
                    B[l, i, j] = lambda_decay * (B[l, i - 1, j] + B[l, i, j - 1] - lambda_decay * B[l, i - 1, j - 1])
    return B

def substring_kernel(x, x_prime, k, lambda_decay):
    """Computes the substring kernel K_k with dynamic programming"""
    n, m = len(x), len(x_prime)
    
    # Compute B_k
    B = compute_B_matrix(x, x_prime, k, lambda_decay)

    # Base cases
    if k==0:
        K = np.ones((n + 1, m + 1))
    else:
        K = np.zeros((n + 1, m + 1))

    # Dynamic Programming
    for i in range(1, n + 1):
        K[i, :] = K[i - 1, :]

        for j in range(1, m + 1):
            res = 0
            for j_prime in range(1,m+1):
                if x_prime[j_prime-1] == x[i - 1]:
                    res += B[k-1, i - 1, j_prime-1]
            K[i, j] += (lambda_decay**2) * res

    # Return the final kernel value
    return K[n, m]  


In [3]:
# Example Usage
x = "CAT"
x_prime = "BAT"
k = 2  # Substring length
lambda_decay = 0.5

kernel_value = substring_kernel(x, x_prime, k, lambda_decay)
print("Substring Kernel Value:", kernel_value)

Substring Kernel Value: 0.0625


## Spectrum Kernel

In [10]:
def get_kmer_counts(seq, k):
    """Extracts k-mer counts from a DNA sequence."""
    return Counter([seq[i:i+k] for i in range(len(seq) - k + 1)])

def spectrum_kernel(x, x_prime, k):
    # Compute k-mer counts for both sequences
    kmer_counts_x = get_kmer_counts(x, k)
    kmer_counts_x_prime = get_kmer_counts(x_prime, k)

    # Compute dot product of k-mer count vectors
    return sum(kmer_counts_x[u] * kmer_counts_x_prime[u] for u in set(kmer_counts_x) & set(kmer_counts_x_prime))


In [11]:
x = 'CARCARD'
x_prime = 'BARARDAR'

k = 2  # Example k-mer length

kernel_value = spectrum_kernel(x, x_prime, k)
print("Spectrum Kernel Value:", kernel_value)


Spectrum Kernel Value: 7


## Mismatch Kernel

In [20]:
def hamming_distance(s1, s2):
    """Computes Hamming distance between two equal-length strings."""
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

def generate_mismatch_kmers(kmer, m):
    """Generates all possible k-mers within `m` mismatches of a given k-mer."""
    bases = ['A', 'C', 'G', 'T']
    mismatch_kmers = set([kmer])

    for positions in product(range(len(kmer)), repeat=m):
        for replacements in product(bases, repeat=m):
            kmer_list = list(kmer)
            for pos, replacement in zip(positions, replacements):
                kmer_list[pos] = replacement
            mismatch_kmers.add("".join(kmer_list))

    return mismatch_kmers

def count_kmers_with_mismatches(sequence, k, m):
    """Counts k-mers in `sequence`, including up to `m` mismatches."""
    kmer_counts = {}

    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i+k]
        mismatch_kmers = generate_mismatch_kmers(kmer, m)

        for mismatch_kmer in mismatch_kmers:
            if mismatch_kmer in kmer_counts:
                kmer_counts[mismatch_kmer] += 1
            else:
                kmer_counts[mismatch_kmer] = 1

    return kmer_counts

def mismatch_kernel(x, x_prime, k, m):
    # Count k-mers with mismatches
    x_counts = count_kmers_with_mismatches(x, k, m)
    x_prime_counts = count_kmers_with_mismatches(x_prime, k, m)

    # Compute kernel similarity using dot product of k-mer counts
    similarity = sum(x_counts[kmer] * x_prime_counts.get(kmer, 0) for kmer in x_counts)

    return similarity


In [21]:
x = "ACGTACGT"
x_prime = "ACGGACGT"
k = 3   # 3-mers
m = 1   # Allow 1 mismatch

similarity = mismatch_kernel(x, x_prime, k, m)
print(f"Mismatch Kernel Similarity: {similarity}")


Mismatch Kernel Similarity: 86


# Compute the Kernel Matrix efficiently

In [4]:
def normalize_matrix(K):
    ''' We normalize the matrix for numerical stability'''
    diag_K=np.diag(K)
    root_K=np.sqrt(diag_K)
    K_inv=np.diag(1/root_K)
    return K_inv @ K @ K_inv

def compute_kernel_matrix_parallel(X, k, m=None, lambda_decay = None, n_jobs=-1):
    """
    Computes the Kernel matrix in parallel using joblib.
    """
    n = len(X)
    K = np.zeros((n, n))

    # Generate index pairs for upper triangle computation
    indices = [(i, j) for i in range(n) for j in range(i, n)]

    if lambda_decay is not None:
        # Compute kernel entries in parallel with tqdm
        results = Parallel(n_jobs=n_jobs)(
            delayed(substring_kernel)(X[i], X[j], k, lambda_decay) for i, j in tqdm(indices, desc="Computing Substring Kernel Matrix", unit=" entry")
        )

    elif m is not None:
        # Compute kernel entries in parallel with tqdm
        results = Parallel(n_jobs=n_jobs)(
            delayed(mismatch_kernel)(X[i], X[j], k, m) for i, j in tqdm(indices, desc="Computing Mismatch Kernel Matrix", unit=" entry")
        )

    else:
        # Compute kernel entries in parallel with tqdm
        results = Parallel(n_jobs=n_jobs)(
            delayed(spectrum_kernel)(X[i], X[j], k) for i, j in tqdm(indices, desc="Computing Spectrum Kernel Matrix", unit=" entry")
        )
        
    # Fill the kernel matrix
    index = 0
    for i, j in indices:
        K[i, j] = results[index]
        index += 1

    # Mirror the upper triangle to the lower triangle
    K = K + K.T - np.diag(K.diagonal())  # Ensure symmetry

    return normalize_matrix(K)

We will test algorithm and compute different Kernel Matrices

### Substring Kernel Matrix

In [None]:
# Parameters
k = 5
lambda_decay = 0.5

# Load training sequences
X_train_0 = df_Xtr0.iloc[:, 1].tolist()

K_train_0 = compute_kernel_matrix_parallel(X_train_0, k, lambda_decay = lambda_decay)


### Spectrum Kernel Matrix

In [None]:
k = 8

X_train_0 = df_Xtr0.iloc[:, 1].tolist()

# Compute the kernel matrix in parallel
K_train_0 = compute_kernel_matrix_parallel(X_train_0, k)

Computing Spectrum Kernel Matrix: 100%|██████████| 2001000/2001000 [00:12<00:00, 165380.64 entry/s]


### Mismatch Kernel Matrix

In [None]:
k = 8  # Choose k-mer length
m= 1

X_train = df_Xtr0.iloc[:, 1].tolist()

K_train_mismatch_0 = compute_kernel_matrix_parallel(X_train, k,m = m)

# Logistic Regression

$\hat{f}=\argmin_{f\in \mathcal{H}} \frac{1}{n}\displaystyle\sum_{i=1}^n \ln(1+e^{-y_if(x_i)}) + \frac{\lambda}{2}\Vert f\Vert^2_{\mathcal{H}}$

By the representer theorem, $\hat{f}(x)=\displaystyle\sum_{i=1}^n \alpha_i K(x_i,x)$

We therefore solve $\min_{\alpha\in\mathbb{R}^n}\displaystyle\sum_{i=1}^n \ln(1+e^{-y_i[K\alpha]_i})+ \frac{\lambda}{2}\alpha^TK\alpha$

In [28]:
def kernel_logistic_loss(alpha, K, y, reg_lambda):
    """
    Computes the objective function for kernel logistic regression.
    """
    n = len(y)
    K_alpha = K @ alpha  
    
    loss = np.sum(np.log(1 + np.exp(-y * K_alpha))) / n  
    reg = (reg_lambda / 2) * (alpha @ K @ alpha)  
    return loss + reg

def kernel_logistic_gradient(alpha, K, y, reg_lambda):
    """
    Computes the gradient of the kernel logistic loss.
    """
    n = len(y)
    K_alpha = K @ alpha
    probs = 1 / (1 + np.exp(y * K_alpha))  

    grad = - (K @ (y * probs)) / n  
    grad += reg_lambda * (K @ alpha)  
    return grad


In [29]:
def train_kernel_logistic_regression(K, y, reg_lambda):
    """
    Solves for α using gradient-based optimization.
    """
    n = len(y)
    alpha0 = np.zeros(n)  # Initialize α to zeros

    # Use L-BFGS optimizer
    result = minimize(kernel_logistic_loss, alpha0, 
                      args=(K, y, reg_lambda), 
                      jac=kernel_logistic_gradient,
                      method='L-BFGS-B',
                      options={'maxiter': 1000})

    return result.x  # Optimized α values


In [30]:
def predict_kernel_logistic_binary(alpha, X_train, X_test, k, m = None, lambda_decay=None, n_jobs=-1):
    """
    Predicts binary labels using kernel logistic regression with parallel processing (for faster computation).
    """
    n_test, n_train = len(X_test), len(X_train)
    K_test = np.zeros((n_test, n_train))

    indices = [(i, j) for i in range(n_test) for j in range(n_train)]

    if lambda_decay is not None:
        results = Parallel(n_jobs=n_jobs)(
            delayed(substring_kernel)(X_test[i], X_train[j], k, lambda_decay) 
            for i, j in tqdm(indices, desc="Computing Test-Train Substring Kernel", unit=" entry")
        )

    elif m is not None:
            results = Parallel(n_jobs=n_jobs)(
                delayed(mismatch_kernel)(X_test[i], X_train[j], k, m) 
                for i, j in tqdm(indices, desc="Computing Test-Train Mismatch Kernel", unit=" entry")
            )
    else:
        results = Parallel(n_jobs=n_jobs)(
            delayed(spectrum_kernel)(X_test[i], X_train[j], k) 
            for i, j in tqdm(indices, desc="Computing Test-Train Spectrum Kernel", unit=" entry")
        )

    for index, (i, j) in enumerate(indices):
        K_test[i, j] = results[index]

    scores = K_test @ alpha  

    return np.sign(scores)  # Convert to {-1,1} labels


# TF - 0

## Data

In [5]:
# load the data
df_Xtr0 = pd.read_csv('Xtr0.csv')
df_Ytr0 = pd.read_csv('Ytr0.csv')

df_Xte0 = pd.read_csv('Xte0.csv')

In [None]:
# Convert labels to {-1,1} if they are in {0,1}
df_Ytr0.iloc[:, 1] = 2 * df_Ytr0.iloc[:, 1] - 1 if df_Ytr0.iloc[:, 1].min() == 0 else df_Ytr0.iloc[:, 1]

# Get the number of samples
n = len(df_Xtr0)
split_index = int(0.8 * n)  # 80% training, 20% validation

# Shuffle indices
indices = np.arange(n)
np.random.seed(40)  # For reproducibility
np.random.shuffle(indices)

# Apply shuffle to DataFrames
df_Xtr0_shuffled = df_Xtr0.iloc[indices].reset_index(drop=True)
df_Ytr0_shuffled = df_Ytr0.iloc[indices].reset_index(drop=True)

# Split into train and validation DataFrames
df_X_train_split = df_Xtr0_shuffled.iloc[:split_index]  # Training features
df_Y_train_split = df_Ytr0_shuffled.iloc[:split_index]  # Training labels
df_X_val = df_Xtr0_shuffled.iloc[split_index:]  # Validation features
df_Y_val = df_Ytr0_shuffled.iloc[split_index:]  # Validation labels

print(f"Training set size: {len(df_X_train_split)}, Validation set size: {len(df_X_val)}")


In [None]:
X_train = df_X_train_split.iloc[:, 1].tolist()
y_train = df_Y_train_split.iloc[:, 1].values

X_val = df_X_val.iloc[:, 1].tolist()
y_val = df_Y_val.iloc[:, 1].values

## Train Model

## Train/Validation Split

### Compute Spectrum Kernel Matrix

In [None]:
k = 8  # Choose length of substring

# Compute Spectrum Kernel Matrix
K_train_spectrum_0 = compute_kernel_matrix_parallel(X_train, k)

### Compute Substring Kernel Matrix

In [None]:
# Define parameters
k = 4
lambda_decay = 0.5
n_jobs = -1  # Use all available CPU cores

# Load training sequences
X_train_0 = df_Xtr0.iloc[:, 1].tolist()

K_train_0 = compute_kernel_matrix_parallel(X_train_0, k, lambda_decay, n_jobs=n_jobs)

### Compute Mismatch Kernel Matrix

In [None]:
k = 8  # Choose length of substring
m=1

# Compute Spectrum Kernel Matrix
K_train_mismatch_0 = compute_kernel_matrix_parallel(X_train, k, m)

### Train!

In [None]:
K = K_train_mismatch_0

reg_lambda = 1e-5 # Regularization parameter

# Train model
alpha = train_kernel_logistic_regression(K, y_train, reg_lambda)

# Compute train accuracy
y_pred = np.sign(K @ alpha)  # Predict labels
train_accuracy = np.mean(y_pred == y_train)  
print(f'Train accuracy: {train_accuracy:.4f}')

In [None]:
# Compute validation kernel matrix (K_val)
y_pred = predict_kernel_logistic_binary(alpha, X_train, X_val, k, m = m)

val_accuracy = np.mean(y_pred == y_val) 
print(f'Validation accuracy: {val_accuracy:.4f}')

## Train with Full Data

In [None]:
X_train = df_Xtr0.iloc[:, 1].tolist()
y_train = df_Ytr0.iloc[:, 1].values

print(f"Training set size: {len(X_train)}")

### Comptute Spectrum Kernel Matrix

In [None]:
k = 8  # Choose length of substring

# Compute Spectrum Kernel Matrix
K_train_spectrum_0 = compute_kernel_matrix(X_train, k)

np.save("spectrum_kernel_matrix_0.npy", K_train_spectrum_0)
print("Spectrum Kernel Matrix computed and saved successfully!")

In [None]:
K = K_train_spectrum_0

reg_lambda = 1e-5 # Regularization parameter

# Train model
alpha = train_kernel_logistic_regression(K, y_train, reg_lambda)

# Compute train accuracy
y_pred = np.sign(K @ alpha)  # Predict labels
train_accuracy = np.mean(y_pred == y_train)  
print(f'Train accuracy: {train_accuracy:.4f}')

In [None]:
# Load test data
X_test = df_Xte0.iloc[:, 1].tolist()  # Test sequences

# Predict probabilities for test set
y_test_probs = predict_kernel_logistic_binary(alpha, X_train, X_test, k)

In [None]:
y_test_probs = ((y_test_probs + 1) / 2).astype(int)  # Convert {-1,1} to {0,1}

#save as csv file

df_Yte0 = pd.DataFrame(data = y_test_probs, columns = ['Bound'])
df_Yte0.index.name = 'Id'
df_Yte0.to_csv('Yte0.csv')

### Compute Mismatch Kernel Matrix

In [None]:
k = 8  # Choose length of substring
m=1

# Compute Spectrum Kernel Matrix
K_train_mismatch_0 = compute_kernel_matrix_parallel(X_train, k, m)

np.save("mismatch_kernel_matrix_0.npy", K_train_mismatch_0)
print("Spectrum Kernel Matrix computed and saved successfully!")

In [None]:
K = K_train_mismatch_0

reg_lambda = 1e-5 # Regularization parameter

# Train model
alpha = train_kernel_logistic_regression(K, y_train, reg_lambda)

# Compute train accuracy
y_pred = np.sign(K @ alpha)  # Predict labels
train_accuracy = np.mean(y_pred == y_train)  # Compute accuracy
print(f'Train accuracy: {train_accuracy:.4f}')

In [None]:
# Load test data
X_test = df_Xte0.iloc[:, 1].tolist()  # Test sequences

# Predict probabilities for test set
y_test_probs = predict_kernel_logistic_binary(alpha, X_train, X_test, k)
y_test_probs = ((y_test_probs + 1) / 2).astype(int)  # Convert {-1,1} to {0,1}

In [None]:
#save as csv file

df_Yte0 = pd.DataFrame(data = y_test_probs, columns = ['Bound'])
df_Yte0.index.name = 'Id'
df_Yte0.to_csv('Yte0.csv')

# TF - 1

## Data

In [None]:
## Data
# load the data
df_Xtr1 = pd.read_csv('Xtr1.csv')
df_Ytr1 = pd.read_csv('Ytr1.csv')

df_Xte1 = pd.read_csv('Xte1.csv')

In [17]:
# Convert labels to {-1,1} if they are in {0,1}
df_Ytr1.iloc[:, 1] = 2 * df_Ytr1.iloc[:, 1] - 1 if df_Ytr1.iloc[:, 1].min() == 0 else df_Ytr1.iloc[:, 1]

# Get the number of samples
n = len(df_Xtr1)
split_index = int(0.8 * n)  # 80% training, 20% validation

# Shuffle indices
indices = np.arange(n)
np.random.seed(42)  # For reproducibility
np.random.shuffle(indices)

# Apply shuffle to DataFrames
df_Xtr1_shuffled = df_Xtr1.iloc[indices].reset_index(drop=True)
df_Ytr1_shuffled = df_Ytr1.iloc[indices].reset_index(drop=True)

# Split into train and validation DataFrames
df_X_train_split = df_Xtr1_shuffled.iloc[:split_index]  # Training features
df_Y_train_split = df_Ytr1_shuffled.iloc[:split_index]  # Training labels
df_X_val = df_Xtr1_shuffled.iloc[split_index:]  # Validation features
df_Y_val = df_Ytr1_shuffled.iloc[split_index:]  # Validation labels

print(f"Training set size: {len(df_X_train_split)}, Validation set size: {len(df_X_val)}")


Training set size: 1600, Validation set size: 400


In [18]:
X_train = df_X_train_split.iloc[:, 1].tolist()
y_train = df_Y_train_split.iloc[:, 1].values

X_val = df_X_val.iloc[:, 1].tolist()
y_val = df_Y_val.iloc[:, 1].values

## Train Model

## Train/Validation Split

### Compute Spectrum Kernel Matrix

In [None]:
k = 8  # Choose length of substring

# Compute Spectrum Kernel Matrix
K_train_spectrum_1 = compute_kernel_matrix_parallel(X_train, k)

Computing Spectrum Kernel Matrix: 100%|██████████| 1280800/1280800 [00:07<00:00, 179645.63 entry/s]


Spectrum Kernel Matrix computed and saved successfully!


### Compute Mismatch Kernel Matrix

In [None]:
k = 8  # Choose length of substring
m=1

# Compute Spectrum Kernel Matrix
K_train_mismatch_1 = compute_kernel_matrix_parallel(X_train, k, m)

Computing Mismatch Kernel Matrix: 100%|██████████| 1280800/1280800 [10:39<00:00, 2001.97 entry/s]


### Train!

In [31]:
K = K_train_mismatch_1

reg_lambda = 1e-5 # Regularization parameter

# Train model
alpha = train_kernel_logistic_regression(K, y_train, reg_lambda)

# Compute train accuracy

y_pred = np.sign(K @ alpha)  # Predict labels
train_accuracy = np.mean(y_pred == y_train)  # Compute accuracy
print(f'Train accuracy: {train_accuracy:.4f}')

Train accuracy: 1.0000


In [32]:
# Compute validation kernel matrix (K_val)
y_pred = predict_kernel_logistic_binary(alpha, X_train, X_val, k, lambda_decay = None)

val_accuracy = np.mean(y_pred == y_val)  # Compute accuracy
print(f'Validation accuracy: {val_accuracy:.4f}')

Computing Mismatch Kernel Matrix:   1%|▏         | 18415/1280800 [12:44<14:33:56, 24.07 entry/s]y/s]
Computing Test-Train Spectrum Kernel: 100%|██████████| 640000/640000 [00:03<00:00, 181353.07 entry/s]


Validation accuracy: 0.7000


## Train with full Data

In [None]:
X_train = df_Xtr1.iloc[:, 1].tolist()
y_train = df_Ytr1.iloc[:, 1].values

print(f"Training set size: {len(X_train)}")

### Compute Substring Kernel Matrix

In [None]:
#Comptute Spectrum Kernel Matrix
k = 8  # Choose length of substring

# Compute Spectrum Kernel Matrix
K_train_spectrum_1 = compute_kernel_matrix_parallel(X_train, k)

np.save("spectrum_kernel_matrix_1.npy", K_train_spectrum_1)
print("Spectrum Kernel Matrix computed and saved successfully!")

In [None]:
K = K_train_spectrum_1

reg_lambda = 1e-5 # Regularization parameter

# Train model
alpha = train_kernel_logistic_regression(K, y_train, reg_lambda)

# Compute train accuracy
y_pred = np.sign(K @ alpha)  # Predict labels
train_accuracy = np.mean(y_pred == y_train)  # Compute accuracy
print(f'Train accuracy: {train_accuracy:.4f}')

In [None]:
# Load test data
X_test = df_Xte1.iloc[:, 1].tolist()  # Test sequences

# Predict probabilities for test set
y_test_probs = predict_kernel_logistic_binary(alpha, X_train, X_test, k)
y_test_probs = ((y_test_probs + 1) / 2).astype(int)  # Convert {-1,1} to {0,1}

In [None]:
#save as csv file

df_Yte1 = pd.DataFrame(data = y_test_probs, columns = ['Bound'])
df_Yte1.index.name = 'Id'
df_Yte1.to_csv('Yte1.csv')

### Compute Mismatch Kernel Matrix

In [None]:
k = 8  # Choose length of substring
m=1

# Compute Spectrum Kernel Matrix
K_train_mismatch_1 = compute_kernel_matrix_parallel(X_train, k, m)

np.save("mismatch_kernel_matrix_1.npy", K_train_mismatch_1)
print("Spectrum Kernel Matrix computed and saved successfully!")

In [None]:
K = K_train_mismatch_1

reg_lambda = 1e-5 # Regularization parameter

# Train model
alpha = train_kernel_logistic_regression(K, y_train, reg_lambda)

# Compute train accuracy
y_pred = np.sign(K @ alpha)  # Predict labels
train_accuracy = np.mean(y_pred == y_train)  # Compute accuracy
print(f'Train accuracy: {train_accuracy:.4f}')

In [None]:
# Load test data
X_test = df_Xte1.iloc[:, 1].tolist()  # Test sequences

# Predict probabilities for test set
y_test_probs = predict_kernel_logistic_binary(alpha, X_train, X_test, k)
y_test_probs = ((y_test_probs + 1) / 2).astype(int)  # Convert {-1,1} to {0,1}

In [None]:
#save as csv file

df_Yte1 = pd.DataFrame(data = y_test_probs, columns = ['Bound'])
df_Yte1.index.name = 'Id'
df_Yte1.to_csv('Yte0.csv')

# TF - 2

## Data

In [None]:
# load the data
df_Xtr2 = pd.read_csv('Xtr2.csv')
df_Ytr2 = pd.read_csv('Ytr2.csv')

df_Xte2 = pd.read_csv('Xte2.csv')

In [None]:
# Convert labels to {-1,1} if they are in {0,1}
df_Ytr2.iloc[:, 1] = 2 * df_Ytr2.iloc[:, 1] - 1 if df_Ytr2.iloc[:, 1].min() == 0 else df_Ytr2.iloc[:, 1]

# Get the number of samples
n = len(df_Xtr2)
split_index = int(0.8 * n)  # 80% training, 20% validation

# Shuffle indices
indices = np.arange(n)
np.random.seed(42)  # For reproducibility
np.random.shuffle(indices)

# Apply shuffle to DataFrames
df_Xtr2_shuffled = df_Xtr2.iloc[indices].reset_index(drop=True)
df_Ytr2_shuffled = df_Ytr2.iloc[indices].reset_index(drop=True)


# Split into train and validation DataFrames
df_X_train_split = df_Xtr2_shuffled.iloc[:split_index]  # Training features
df_Y_train_split = df_Ytr2_shuffled.iloc[:split_index]  # Training labels
df_X_val = df_Xtr2_shuffled.iloc[split_index:]  # Validation features
df_Y_val = df_Ytr2_shuffled.iloc[split_index:]  # Validation labels

print(f"Training set size: {len(df_X_train_split)}, Validation set size: {len(df_X_val)}")

In [None]:
X_train = df_X_train_split.iloc[:, 1].tolist()
y_train = df_Y_train_split.iloc[:, 1].values

X_val = df_X_val.iloc[:, 1].tolist()
y_val = df_Y_val.iloc[:, 1].values

## Train Model

## Train/Validatin Split

### Compute Spectrum Kernel Matrix

In [None]:
k = 8  # Choose length of substring

# Compute Spectrum Kernel Matrix
K_train_spectrum_2 = compute_kernel_matrix(X_train, k)

### Compute Mismatch Kernel Matrix

In [None]:
k = 8  # Choose length of substring
m=1

# Compute Spectrum Kernel Matrix
K_train_mismatch_2 = compute_kernel_matrix_parallel(X_train, k, m)

### Train!

In [None]:
K = K_train_mismatch_2

reg_lambda = 1e-5 # Regularization parameter

# Train model
alpha = train_kernel_logistic_regression(K, y_train, reg_lambda)

# Compute train accuracy
y_pred = np.sign(K @ alpha)  # Predict labels
train_accuracy = np.mean(y_pred == y_train)  # Compute accuracy
print(f'Train accuracy: {train_accuracy:.4f}')

In [None]:
# Compute validation kernel matrix (K_val)
y_pred = predict_kernel_logistic_binary(alpha, X_train, X_val, k, m = m)

val_accuracy = np.mean(y_pred == y_val)  # Compute accuracy
print(f'Validation accuracy: {val_accuracy:.4f}')

## Train on Full Data

In [None]:
X_train = df_Xtr2.iloc[:, 1].tolist()
y_train = df_Ytr2.iloc[:, 1].values

print(f"Training set size: {len(X_train)}")

### Compute Spectrum Kernel Matrix

In [None]:
#Comptute Spectrum Kernel Matrix
k = 7  # Choose length of substring

# Compute Spectrum Kernel Matrix
K_train_spectrum_2 = compute_kernel_matrix(X_train, k)

np.save("spectrum_kernel_matrix_2.npy", K_train_spectrum_2)
print("Spectrum Kernel Matrix computed and saved successfully!")

In [None]:
K = K_train_spectrum_2

reg_lambda = 1e-6 # Regularization parameter

# Train model
alpha = train_kernel_logistic_regression(K, y_train, reg_lambda)

# Compute train accuracy
y_pred = np.sign(K @ alpha)  # Predict labels
train_accuracy = np.mean(y_pred == y_train)  # Compute accuracy
print(f'Train accuracy: {train_accuracy:.4f}')

In [None]:
# Load test data
X_test = df_Xte2.iloc[:, 1].tolist()  # Test sequences

# Predict probabilities for test set
y_test_probs = predict_kernel_logistic_binary(alpha, X_train, X_test, k)
y_test_probs = ((y_test_probs + 1) / 2).astype(int)  # Convert {-1,1} to {0,1}

In [None]:
df_Yte2 = pd.DataFrame(data = y_test_probs, columns = ['Bound'])
df_Yte2.index.name = 'Id'
df_Yte2.to_csv('Yte2.csv')

### Compute Mismatch Kernel Matrix

In [None]:
k = 8  # Choose length of substring
m=1

# Compute Spectrum Kernel Matrix
K_train_mismatch_2 = compute_kernel_matrix_parallel(X_train, k, m)

np.save("mismatch_kernel_matrix_2.npy", K_train_mismatch_2)
print("Spectrum Kernel Matrix computed and saved successfully!")

In [None]:
K = K_train_mismatch_2

reg_lambda = 1e-5 # Regularization parameter

# Train model
alpha = train_kernel_logistic_regression(K, y_train, reg_lambda)

# Compute train accuracy
y_pred = np.sign(K @ alpha)  # Predict labels
train_accuracy = np.mean(y_pred == y_train)  # Compute accuracy
print(f'Train accuracy: {train_accuracy:.4f}')

In [None]:
# Load test data
X_test = df_Xte2.iloc[:, 1].tolist()  # Test sequences

# Predict probabilities for test set
y_test_probs = predict_kernel_logistic_binary(alpha, X_train, X_test, k)
y_test_probs = ((y_test_probs + 1) / 2).astype(int)  # Convert {-1,1} to {0,1}

In [None]:
#save as csv file

df_Yte2 = pd.DataFrame(data = y_test_probs, columns = ['Bound'])
df_Yte2.index.name = 'Id'
df_Yte2.to_csv('Yte2.csv')

# Creating a submission file

In [None]:
#load the data

df_Yte0 = pd.read_csv('Yte0.csv')
df_Yte1 = pd.read_csv('Yte1.csv')
df_Yte2 = pd.read_csv('Yte2.csv')

In [None]:
# concatenate the results but fix the ids

df_Yte = pd.concat([df_Yte0, df_Yte1, df_Yte2])
df_Yte.Id = np.arange(len(df_Yte))
#df_Yte.index.name = 'Id'
df_Yte.to_csv('Yte.csv', index=False)

In [None]:
df_Yte