In [None]:
import numpy as np
import pandas as pd
from math import exp
import matplotlib.pyplot as plt
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Load the labelled training peptides: tab-separated, no header row,
# first column is the label and second column is the sequence.
df = pd.read_csv(
    'train.dat',
    sep='\t',
    header=None,
    names=["antibiofilm", "peptides"],
)

In [None]:
# Show the loaded frame as the cell's rich output (label column + peptide sequence column).
df

Unnamed: 0,antibiofilm,peptides
0,-1,DVELDLVEISPNALP
1,-1,KADEELFNKLFFGT
2,-1,FLVALHLGTAFALLWYFRKRWCALVRGFFASFGGRRNDDAHMM
3,-1,RDQMRARIADITGVAISRIA
4,-1,RKRLQLLLL
...,...,...
1561,-1,YAQSEGDTG
1562,1,FIKHFIHRFGGGRWRWRWF
1563,1,KWKIRVRLSA
1564,-1,TLNTVVGSISGAVP


In [None]:
# Pull the raw sequences and their labels out of the frame as NumPy arrays.
X = df['peptides'].to_numpy()
Y = df['antibiofilm'].to_numpy()

In [None]:
# Tally how many samples carry each label to expose the class imbalance.
label_counts = Counter()
label_counts.update(Y)
print(f"Count of -1: {label_counts[-1]}")
print(f"Count of 1: {label_counts[1]}")

Count of -1: 1424
Count of 1: 142


In [None]:
# One-letter codes of the 20 amino acids counted for the composition ("bow") tokens.
AMINO_ACIDS = "ARNDCQEGHILKMFPSTWYV"

def features_extract(X, features_set=None, max_k=3, include_bow=True):
    """Tokenise peptide sequences into k-mer and amino-acid-count tokens.

    Each sequence becomes one space-separated "document" made of:
      * every contiguous k-mer for k = 1..max_k, lower-cased, in order of
        increasing k and then position;
      * when ``include_bow`` is true, one ``bow_<AA>_<count>`` token for every
        letter of ``AMINO_ACIDS`` that occurs at least once in the sequence.

    Args:
        X: iterable of sequence strings.
        features_set: optional iterable of already-known tokens to extend
            (a set, or the sorted list returned by a previous call). It is
            copied, so the caller's object is never modified.
        max_k: largest k-mer length to generate.
        include_bow: whether to add the amino-acid count tokens.

    Returns:
        ``(features, documents)`` where ``features`` is the sorted list of all
        known tokens and ``documents`` holds one token string per sequence.
    """
    # Copy instead of updating in place: this accepts the list returned by an
    # earlier call and avoids a hidden side effect on the caller's set.
    vocabulary = set() if features_set is None else set(features_set)

    X_features = []
    for seq in X:
        tokens = [
            seq[i:i + k].lower()
            for k in range(1, max_k + 1)
            for i in range(len(seq) - k + 1)
        ]

        if include_bow:
            # Counting is case-sensitive against the upper-case alphabet.
            counts = ((aa, seq.count(aa)) for aa in AMINO_ACIDS)
            tokens.extend(f"bow_{aa}_{count}" for aa, count in counts if count > 0)

        vocabulary.update(tokens)
        X_features.append(' '.join(tokens))

    return sorted(vocabulary), X_features

def feature_vectors(X_features, features_set):
    """Turn token documents into max-normalised count vectors.

    Args:
        X_features: sequence of space-separated token strings, one per sample.
        features_set: ordered sequence of known tokens; position ``j`` in this
            sequence is column ``j`` of the result. Tokens not listed here are
            ignored.

    Returns:
        Float array of shape ``(len(X_features), len(features_set))``. Each row
        holds the token counts divided by that row's largest count; rows with
        no known token stay all-zero.
    """
    # Token -> column lookup built once, replacing a linear `in` / `.index()`
    # scan per token. setdefault keeps the first position of a repeated token,
    # matching list.index().
    column_of = {}
    for col, feature in enumerate(features_set):
        column_of.setdefault(feature, col)

    X_vector = np.zeros((len(X_features), len(column_of) if not len(features_set) else len(features_set)))

    for row, document in enumerate(X_features):
        for feature, cnt in Counter(document.split()).items():
            col = column_of.get(feature)
            if col is not None:
                X_vector[row, col] = cnt

        # An empty vocabulary yields zero-width rows, on which max() would raise.
        peak = X_vector[row].max() if X_vector.shape[1] else 0
        if peak != 0:
            X_vector[row] = X_vector[row] / peak

    return X_vector


# Tokenise every training peptide (k-mers up to length 3 plus composition
# tokens), then project the token documents onto the resulting vocabulary.
features_set, X_features = features_extract(X, include_bow=True, max_k=3)
X_vector = feature_vectors(X_features, features_set)



In [None]:
def oversample(X, Y, sampling_strategy=0.3, random_state=65):
    """Randomly duplicate minority-class samples to reduce class imbalance.

    Args:
        X: feature matrix.
        Y: labels aligned with the rows of ``X``.
        sampling_strategy: forwarded to ``RandomOverSampler``; the default
            keeps the value this notebook has always used.
        random_state: seed forwarded to ``RandomOverSampler`` so the
            resampling is repeatable.

    Returns:
        ``(X_over, Y_over)`` as produced by ``fit_resample``.
    """
    # Named `sampler` so the local no longer shadows this function's own name.
    sampler = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=random_state)
    X_over, Y_over = sampler.fit_resample(X, Y)
    return X_over, Y_over
# Rebinds X (until now the raw peptide strings) to the oversampled feature matrix.
# NOTE(review): the resampling runs before the train/test split done in a later
# cell, so duplicated minority rows can end up in both splits and make the
# held-out score optimistic -- consider splitting first and oversampling only
# the training part.
X, y = oversample(X_vector, Y)

In [None]:
# Sanity check: dimensions of the resampled features and labels.
for resampled in (X, y):
    print(resampled.shape)

(1851, 7448)
(1851,)


In [None]:
# Stratified hold-out split: 20% of the rows go to the test part, with the
# class ratio of `y` preserved in both parts and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [None]:
# Report the class balance of the training part, then of the test part.
for split_labels in (y_train, y_test):
    label_counts = Counter(split_labels)
    print(f"Count of -1: {label_counts[-1]}")
    print(f"Count of 1: {label_counts[1]}")

Count of -1: 1139
Count of 1: 341
Count of -1: 285
Count of 1: 86


In [None]:
# Sanity check: dimensions of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep='\n')

(1480, 7448)
(1480,)


In [None]:
# Map the labels from {-1, 1} to {0, 1} so they match the sigmoid output range.
# Testing `> 0` (instead of `< 0`) gives the same result on -1/1 labels but
# keeps the cell idempotent: re-running it leaves 0/1 labels unchanged, whereas
# the `< 0` form would turn every 0 into 1 on a second run.
y_train = np.where(y_train > 0, 1, 0)
y_test = np.where(y_test > 0, 1, 0)

In [None]:
# Network / optimiser settings used by the functions below.
BATCH_SIZE = 128        # rows per mini-batch in the training loop
HN1, HN2 = 250, 120     # widths of the first and second hidden layers
ALPHA = 1e-4            # weight of the squared-weight penalty / weight shrinkage
LR = 1e-3               # step size applied to every parameter update

def relu(x):
    """Rectified linear unit: element-wise maximum of 0.0 and ``x``."""
    floor = 0.0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Return 1 where ``x`` is strictly positive and 0 elsewhere."""
    is_active = np.greater(x, 0)
    return np.where(is_active, 1, 0)

def sigmoid(x):
    """Logistic function ``1 / (1 + e^(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def sigmoid_derivative(s_x):
    """Derivative of the logistic function, given its *output* ``s_x``: ``s_x * (1 - s_x)``."""
    complement = np.subtract(1.0, s_x)
    return np.multiply(s_x, complement)

def calculate_mse_loss(y_pred, y, W1, W2, W3):
    """Squared-error loss plus a squared-weight penalty.

    The data term is the sum of squared differences between ``y`` and
    ``y_pred`` divided by ``len(y)``; the penalty is ``ALPHA`` times the sum of
    all squared entries of the three weight matrices (biases are not included).
    """
    squared_error = (y - y_pred) ** 2
    data_term = np.sum(squared_error) / len(y)

    weight_energy = np.sum(W1**2) + np.sum(W2**2) + np.sum(W3**2)
    penalty = ALPHA * weight_energy

    return data_term + penalty

def forward_propagation(X, W1, W2, W3, b1, b2, b3):
    """Run the network forward: two ReLU layers followed by a sigmoid output.

    Returns:
        ``(z1, z2, y_pred)`` -- the activations of the first hidden layer, the
        second hidden layer, and the sigmoid output, in that order.
    """
    pre_hidden1 = np.dot(X, W1) + b1
    hidden1 = relu(pre_hidden1)

    pre_hidden2 = np.dot(hidden1, W2) + b2
    hidden2 = relu(pre_hidden2)

    logits = np.dot(hidden2, W3) + b3
    return hidden1, hidden2, sigmoid(logits)

def backward_propagation(X, Y, y_pred, Z1, Z2, W1, W2, W3, b1, b2, b3):
    """Apply one update step to all weights and biases for a single batch.

    Args:
        X: batch inputs.
        Y: batch targets; truncated to the number of rows in ``y_pred``.
        y_pred: sigmoid outputs for the batch.
        Z1, Z2: hidden-layer values passed to ``relu_derivative``. When these
            are the post-ReLU activations returned by ``forward_propagation``,
            the ``> 0`` mask equals the one on the pre-activations.
        W1, W2, W3, b1, b2, b3: current parameters.

    Returns:
        ``(W1, W2, W3, b1, b2, b3)`` after the update. The updates use ``+=``,
        so NumPy arrays passed in are modified in place as well as returned.
    """
    Y = Y[:y_pred.shape[0]]

    # Error signals are computed for all layers first, so every delta is based
    # on the weights as they were before this step's updates.
    # (Y - y_pred) is the descent direction of the squared error, hence the
    # `+=` in the updates below.
    delta_w3 = sigmoid_derivative(y_pred) * (Y - y_pred)
    delta_w2 = relu_derivative(Z2) * np.dot(delta_w3, W3.T)
    delta_w1 = relu_derivative(Z1) * np.dot(delta_w2, W2.T)

    # Each weight update is the batch-summed gradient step minus a shrinkage
    # term LR * ALPHA * W; biases get the gradient step only.
    W3 += LR * np.dot(Z2.T, delta_w3) - LR * ALPHA * W3
    b3 += LR * np.sum(delta_w3, axis=0, keepdims=True)
    W2 += LR * np.dot(Z1.T, delta_w2) - LR * ALPHA * W2
    b2 += LR * np.sum(delta_w2, axis=0, keepdims=True)
    W1 += LR * np.dot(X.T, delta_w1) - LR * ALPHA * W1
    b1 += LR * np.sum(delta_w1, axis=0, keepdims=True)

    return W1, W2, W3, b1, b2, b3

def model(X_train, Y_train, X_test, Y_test, features, n_epochs):
    """Train the two-hidden-layer network with sequential mini-batches.

    After every epoch the regularised loss and the Matthews correlation
    coefficient (predictions thresholded at 0.5) are computed on the full
    training data and on the test data, stored, and printed.

    Args:
        X_train, Y_train: training features and 1-D 0/1 labels.
        X_test, Y_test: held-out features and 1-D 0/1 labels.
        features: accepted for interface compatibility; not used in the body.
        n_epochs: number of passes over the training data.

    Returns:
        ``(W1, W2, W3, b1, b2, b3, training_loss, training_mcc,
        validation_loss, validation_mcc)``.
    """
    n_inputs = X_train.shape[1]

    # Biases start at zero, one row vector per layer.
    b1, b2, b3 = (np.zeros((1, width)) for width in (HN1, HN2, 1))

    # Fixed seed; the draw order W1 -> W2 -> W3 determines the initial values.
    rng = np.random.RandomState(1997)
    W1 = rng.normal(0, (2 / n_inputs), (n_inputs, HN1))
    W2 = rng.normal(0, (2 / HN1), (HN1, HN2))
    # NOTE(review): `2 / HN2 + 1` parses as `(2 / HN2) + 1`, a scale just above
    # 1 and far larger than the other layers' -- possibly `2 / (HN2 + 1)` was
    # intended. Left as-is to keep training behaviour unchanged; confirm.
    W3 = rng.normal(0, (2 / HN2 + 1), (HN2, 1))

    # Column-vector targets so they line up with the (n, 1) network output.
    Y_train = Y_train[:, None]
    Y_test = Y_test[:, None]

    training_loss, training_mcc = [], []
    validation_loss, validation_mcc = [], []

    def evaluate(inputs, targets):
        """Loss and MCC of the current parameters on the given data."""
        _, _, y_pred = forward_propagation(inputs, W1, W2, W3, b1, b2, b3)
        current_loss = calculate_mse_loss(y_pred, targets, W1, W2, W3)
        y_pred_binary = np.where(y_pred < 0.5, 0, 1)
        current_mcc = matthews_corrcoef(y_pred_binary.ravel(), targets.ravel())
        return current_loss, current_mcc

    for epoch in range(n_epochs):
        # One pass over the training rows in their stored order (no shuffling).
        for start in range(0, len(X_train), BATCH_SIZE):
            stop = start + BATCH_SIZE
            Xbatch, Ybatch = X_train[start:stop], Y_train[start:stop]

            Z1, Z2, y_pred = forward_propagation(Xbatch, W1, W2, W3, b1, b2, b3)
            W1, W2, W3, b1, b2, b3 = backward_propagation(Xbatch, Ybatch, y_pred, Z1, Z2, W1, W2, W3, b1, b2, b3)

        loss, mcc = evaluate(X_train, Y_train)
        training_loss.append(loss)
        training_mcc.append(mcc)
        print(f"Epoch {epoch + 1}/{n_epochs} - Training Loss: {loss:.4f}, Training MCC: {training_mcc[-1]:.4f}")

        loss, mcc = evaluate(X_test, Y_test)
        validation_loss.append(loss)
        validation_mcc.append(mcc)
        print(f"Validation Loss: {validation_loss[-1]:.4f}, Validation MCC: {validation_mcc[-1]:.4f}")

    return W1, W2, W3, b1, b2, b3, training_loss, training_mcc, validation_loss, validation_mcc


# Train for 120 epochs and keep the learned parameters plus the per-epoch curves.
(
    W1, W2, W3,
    b1, b2, b3,
    training_loss, training_mcc,
    validation_loss, validation_mcc,
) = model(X_train, y_train, X_test, y_test, features_set, n_epochs=120)


Epoch 1/120 - Training Loss: 0.1908, Training MCC: 0.0000
Validation Loss: 0.1914, Validation MCC: 0.0000
Epoch 2/120 - Training Loss: 0.1868, Training MCC: 0.0000
Validation Loss: 0.1871, Validation MCC: 0.0000
Epoch 3/120 - Training Loss: 0.1793, Training MCC: 0.0000
Validation Loss: 0.1793, Validation MCC: 0.0000
Epoch 4/120 - Training Loss: 0.1652, Training MCC: 0.0000
Validation Loss: 0.1645, Validation MCC: 0.0000
Epoch 5/120 - Training Loss: 0.1427, Training MCC: 0.0000
Validation Loss: 0.1408, Validation MCC: 0.0000
Epoch 6/120 - Training Loss: 0.1215, Training MCC: 0.6035
Validation Loss: 0.1185, Validation MCC: 0.6328
Epoch 7/120 - Training Loss: 0.1103, Training MCC: 0.7138
Validation Loss: 0.1070, Validation MCC: 0.7513
Epoch 8/120 - Training Loss: 0.1028, Training MCC: 0.7677
Validation Loss: 0.0996, Validation MCC: 0.8154
Epoch 9/120 - Training Loss: 0.0965, Training MCC: 0.8001
Validation Loss: 0.0939, Validation MCC: 0.8468
Epoch 10/120 - Training Loss: 0.0910, Training

In [None]:
# The vocabulary holds thousands of tokens; printing all of them floods the
# notebook output, so report its size and a short sample instead.
print(f"{len(features_set)} features; first 20: {features_set[:20]}")

['a', 'aa', 'aaa', 'aac', 'aad', 'aae', 'aaf', 'aag', 'aah', 'aai', 'aak', 'aal', 'aam', 'aan', 'aap', 'aaq', 'aar', 'aas', 'aat', 'aav', 'aaw', 'aay', 'ac', 'aca', 'ace', 'acg', 'ach', 'aci', 'acl', 'acn', 'acp', 'acq', 'acr', 'acs', 'act', 'acv', 'acw', 'acy', 'ad', 'ada', 'adc', 'add', 'ade', 'adf', 'adg', 'adh', 'adi', 'adk', 'adl', 'adm', 'adn', 'adp', 'adq', 'adr', 'ads', 'adt', 'adv', 'ady', 'ae', 'aea', 'aec', 'aed', 'aee', 'aef', 'aeg', 'aeh', 'aei', 'aek', 'ael', 'aem', 'aen', 'aep', 'aeq', 'aer', 'aes', 'aet', 'aev', 'aew', 'aey', 'af', 'afa', 'afc', 'afd', 'afe', 'aff', 'afg', 'afh', 'afi', 'afk', 'afl', 'afn', 'afp', 'afq', 'afr', 'afs', 'aft', 'afv', 'afw', 'afy', 'ag', 'aga', 'agc', 'agd', 'age', 'agf', 'agg', 'agh', 'agi', 'agk', 'agl', 'agm', 'agn', 'agp', 'agq', 'agr', 'ags', 'agt', 'agv', 'agw', 'agy', 'ah', 'aha', 'ahe', 'ahf', 'ahg', 'ahh', 'ahi', 'ahk', 'ahl', 'ahm', 'ahn', 'ahp', 'ahq', 'ahr', 'ahs', 'aht', 'ahv', 'ahw', 'ahy', 'ai', 'aia', 'aic', 'aid', 'aie', '

In [None]:
# Score the unlabelled peptides with the trained network and write one
# -1/1 prediction per line.
df_t = pd.read_csv('test.dat', sep='\t', header=None, names=["peptides"])
X_t = df_t['peptides'].to_numpy()

# Only the token documents are needed from this call; the vocabulary it
# returns (test_set) is not used -- the columns follow the training
# features_set, so tokens unseen in training are dropped.
test_set, X_test_features = features_extract(X_t)
X_t = feature_vectors(X_test_features, features_set)

# NOTE(review): this rebinds `y`, which earlier held the oversampled labels.
Z1, Z2, y = forward_propagation(X_t, W1, W2, W3, b1, b2, b3)
y = np.where(y < 0.5, -1, 1)

np.savetxt('predict.txt', y, fmt='%i', delimiter='\n')
