<a href="https://colab.research.google.com/github/Sari275/my-deep-learning-projects/blob/main/Sari_Elian_Program_1_Numpy_Submission_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

In [None]:
import numpy as np
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

Reading Training and Testing Files

In [None]:
# Both files are read as tab-separated text with no header row, so the
# columns keep their default integer names (0, 1, ...).
train_path = "/content/train.dat"
test_path = "/content/test.dat"
train_data, test_data = (
    pd.read_csv(path, header=None, sep='\t')
    for path in (train_path, test_path)
)
print(train_data.shape, test_data.shape)

(1566, 2) (392, 1)


Visualizing data

In [None]:
# Class balance: number of training rows per label value in column 0.
train_data[0].value_counts()

-1    1424
 1     142
Name: 0, dtype: int64

In [None]:
# Column 1 holds the sequence strings; column 0 holds the labels.
peptide = train_data[1].to_numpy()
# Recode the labels to 0/1: 1 where the original label is positive, else 0.
y = (train_data[0].to_numpy() > 0).astype(int)

Train/Validation Split (stratified 80/20 hold-out)

In [None]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed random_state makes the
# split repeatable.
pep_train, pep_valid, y_train, y_valid = train_test_split(
    peptide,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

In [None]:
# Whitespace splitting returns the whole sequence as one token, so a
# word-level tokenizer is of no use for this data.
pep_train[0].split()


['LLKEDNMGFSFHITTIYEGKDFDMHYQNHLES']

In [None]:
# list() on a string yields one token per character.
list(peptide[0])

['D', 'V', 'E', 'L', 'D', 'L', 'V', 'E', 'I', 'S', 'P', 'N', 'A', 'L', 'P']

Bag of Words

In [None]:
# Character-level bag of words: `list` splits each sequence into single
# characters, and ngram_range=(1, 2) counts both single tokens and adjacent
# pairs. The vocabulary is learned from the training split only and reused
# unchanged for the validation split.
vec = CountVectorizer(tokenizer=list, ngram_range=(1, 2))
X_train = vec.fit_transform(pep_train).toarray()
X_valid = vec.transform(pep_valid).toarray()



Scaling Training and Validation Data

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so no validation statistics leak into training.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_valid = sc.transform(X_valid)

Visualizing Feature Shapes

In [None]:
# (rows, features) for each split; the feature counts must match.
X_train.shape, X_valid.shape

((1252, 427), (314, 427))

Designing the Neural Network

In [None]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Evaluated in a numerically stable form: the exponential is only ever
    taken of a non-positive number, so large-magnitude negative inputs do not
    overflow ``np.exp`` (the naive form triggers an overflow RuntimeWarning
    there).

    Parameters
    ----------
    x : scalar or array-like
        Pre-activation values.

    Returns
    -------
    numpy.ndarray or NumPy scalar
        Values in [0, 1] with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) lies in (0, 1]: it may underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] unwraps a 0-d result to a scalar and is a no-op view for arrays.
    return result[()]

def sigmoid_derivative(x):
    """Derivative of the logistic function, expressed through its output.

    Returns ``x * (1 - x)``. This equals the sigmoid's slope only when ``x``
    is already a sigmoid activation (the function's output), not the
    pre-activation value.
    """
    complement = 1 - x
    return x * complement

def initialize_parameters(input_size, hidden_size, output_size, seed=42):
    """Create the initial weights and biases of the two-layer network.

    Weights are small Gaussian values (standard normal scaled by 0.01) and
    biases are zeros. A private ``RandomState`` is used instead of reseeding
    NumPy's global generator, so calling this function does not disturb any
    other code that draws random numbers. With the default seed the values
    are the same as those produced by ``np.random.seed(42)`` followed by
    ``np.random.randn``.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size : int
        Number of hidden units.
    output_size : int
        Number of output units.
    seed : int, optional
        Seed for the private generator (default 42).

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2)`` with shapes ``(hidden_size, input_size)``,
        ``(hidden_size, 1)``, ``(output_size, hidden_size)`` and
        ``(output_size, 1)``.
    """
    rng = np.random.RandomState(seed)
    # Draw order matters for reproducibility: W1 first, then W2.
    W1 = rng.randn(hidden_size, input_size) * 0.01
    b1 = np.zeros((hidden_size, 1))
    W2 = rng.randn(output_size, hidden_size) * 0.01
    b2 = np.zeros((output_size, 1))
    return W1, b1, W2, b2

def forward_propagation(X, W1, b1, W2, b2):
    """Run one forward pass through the hidden and output sigmoid layers.

    ``X`` is transposed for each weight product and the result is transposed
    back, so every returned array has one row per row of ``X``.

    Returns
    -------
    tuple
        ``(Z1, A1, Z2, A2)``: the hidden pre-activation and activation,
        followed by the output pre-activation and activation.
    """
    # Hidden layer: affine map, then sigmoid.
    hidden_affine = np.matmul(W1, X.T) + b1
    Z1 = hidden_affine.T
    A1 = sigmoid(Z1)

    # Output layer: same pattern applied to the hidden activations.
    output_affine = np.matmul(W2, A1.T) + b2
    Z2 = output_affine.T
    A2 = sigmoid(Z2)

    return Z1, A1, Z2, A2

def compute_loss(A2, Y, eps=1e-15):
    """Mean binary cross-entropy between predictions ``A2`` and labels ``Y``.

    Parameters
    ----------
    A2 : array-like
        Predicted probabilities.
    Y : array-like
        Labels (0 or 1) with the same number of elements as ``A2``.
    eps : float, optional
        Probabilities are clipped to ``[eps, 1 - eps]`` before the logarithm,
        so a saturated output (exactly 0 or 1) gives a finite loss instead of
        ``inf``/``nan``.

    Returns
    -------
    float
        The loss averaged over all elements.
    """
    A2 = np.asarray(A2, dtype=float)
    Y = np.asarray(Y)
    # A 1-D label vector against an (n, 1) prediction column would broadcast
    # to an (n, n) matrix and silently average the wrong terms; align shapes.
    if Y.shape != A2.shape and Y.size == A2.size:
        Y = Y.reshape(A2.shape)
    probs = np.clip(A2, eps, 1 - eps)
    logprobs = -np.log(probs) * Y - np.log(1 - probs) * (1 - Y)
    return logprobs.mean()

def backward_propagation(X, y, Z1, A1, Z2, A2, W1, W2, b1, b2):
    """Compute the gradients of the loss for both layers.

    Parameters
    ----------
    X : ndarray
        Inputs, one sample per row.
    y : array-like
        Labels with the same number of elements as ``A2``.
    A1, A2 : ndarray
        Hidden and output activations from the forward pass.
    W2 : ndarray
        Output-layer weights, used to push the error back to the hidden layer.
    Z1, Z2, W1, b1, b2
        Accepted so the call signature stays unchanged; not read here.

    Returns
    -------
    tuple
        ``(dW1, db1, dW2, db2)``, each transposed at the end so that it can
        be subtracted directly from the corresponding parameter.
    """
    m = X.shape[0]

    # A 1-D y against the (m, 1) output column would broadcast A2 - y to an
    # (m, m) matrix and yield wrong gradients without any error; align shapes.
    y = np.asarray(y)
    if y.shape != np.shape(A2) and y.size == np.size(A2):
        y = y.reshape(np.shape(A2))

    # Output-layer error, then its weight and bias gradients averaged over m.
    dZ2 = A2 - y
    dW2 = np.dot(A1.T, dZ2) / m
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m

    # Propagate the error through W2 and the hidden sigmoid; the derivative
    # helper is given the activation A1, not the pre-activation.
    dZ1 = np.dot(dZ2, W2) * sigmoid_derivative(A1)
    dW1 = np.dot(X.T, dZ1) / m
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m

    return dW1.T, db1.T, dW2.T, db2.T

def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    """Apply one gradient-descent step to every parameter.

    Each parameter is decreased by ``learning_rate`` times its gradient using
    an augmented assignment, so NumPy arrays passed in are modified in place.

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2)`` after the update.
    """
    params = [W1, b1, W2, b2]
    gradients = (dW1, db1, dW2, db2)
    for idx, grad in enumerate(gradients):
        params[idx] -= learning_rate * grad
    return tuple(params)

In [None]:
# Class proportions (fractions rather than raw counts) of the 0/1 labels.
pd.Series(y).value_counts(normalize=True)

0    0.909323
1    0.090677
dtype: float64

Training the Model

In [None]:

# Labels as an (n_samples, 1) column so they line up with the network output.
Y_train = y_train.reshape(len(y_train), 1)

# Network size and optimisation settings.
input_size = X_train.shape[1]
hidden_size = 128
output_size = Y_train.shape[1]
learning_rate = 0.01
epochs = 5000

# Starting weights and biases.
W1, b1, W2, b2 = initialize_parameters(input_size, hidden_size, output_size)

# Full-batch gradient descent: every epoch uses the whole training split.
for epoch in range(epochs):
    # Forward pass, then the loss of the current parameters.
    Z1, A1, Z2, A2 = forward_propagation(X_train, W1, b1, W2, b2)
    loss = compute_loss(A2, Y_train)

    # Gradients, followed by one update step.
    dW1, db1, dW2, db2 = backward_propagation(
        X_train, Y_train, Z1, A1, Z2, A2, W1, W2, b1, b2
    )
    W1, b1, W2, b2 = update_parameters(
        W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate
    )

    # Report the loss every 200 epochs (value computed before this update).
    if not epoch % 200:
        print(f"Epoch {epoch}, Loss: {loss}")

Epoch 0, Loss: 0.6973594843608308
Epoch 200, Loss: 0.29972115641461056
Epoch 400, Loss: 0.2944392058395729
Epoch 600, Loss: 0.2888241117426566
Epoch 800, Loss: 0.2826781133029575
Epoch 1000, Loss: 0.2757686305872609
Epoch 1200, Loss: 0.2678242455553495
Epoch 1400, Loss: 0.2585462728896432
Epoch 1600, Loss: 0.24764426632986133
Epoch 1800, Loss: 0.23490597078772354
Epoch 2000, Loss: 0.2203051723789668
Epoch 2200, Loss: 0.204122718596291
Epoch 2400, Loss: 0.1870048723970134
Epoch 2600, Loss: 0.16986126276236957
Epoch 2800, Loss: 0.153600323069213
Epoch 3000, Loss: 0.13886240088323507
Epoch 3200, Loss: 0.12592093203466692
Epoch 3400, Loss: 0.11475512494726144
Epoch 3600, Loss: 0.10518267966199868
Epoch 3800, Loss: 0.09696672303933523
Epoch 4000, Loss: 0.08987592649340176
Epoch 4200, Loss: 0.08370947092647006
Epoch 4400, Loss: 0.07830269568290303
Epoch 4600, Loss: 0.07352418517326278
Epoch 4800, Loss: 0.06927002162272529


Making prediction on Validation Set

In [None]:
# Score the held-out split with the trained parameters.
_, _, _, y_prob = forward_propagation(X_valid, W1, b1, W2, b2)
# The network returns an (n, 1) column; flatten it so the predictions are 1-D
# like y_valid, the same way the test-set predictions are produced.
# Probabilities above 0.5 are labelled 1, everything else 0.
y_pred = 1*(y_prob.ravel() > 0.5)
print('MCC:',matthews_corrcoef(y_valid, y_pred))
print('Accuracy:',accuracy_score(y_valid, y_pred))

MCC: 0.7661526500553661
Accuracy: 0.964968152866242


Vectorizing and Scaling Test Data

In [None]:
# Reuse the already-fitted vectorizer and scaler; nothing is re-fit on the
# test sequences.
X_test = sc.transform(
    vec.transform(test_data[0].values).toarray()
)

Predicting Test labels

In [None]:
_, _, _, y_prob = forward_propagation(X_test, W1, b1, W2, b2)
# Threshold at 0.5 and map straight to the file's label coding:
# 1 for probabilities above 0.5, -1 otherwise.
y_test = np.where(y_prob.ravel() > 0.5, 1, -1)


In [None]:
# Shape check: compare the raw network output with its ravel()-ed form to
# confirm that flattening gives one prediction per test row.
_, _, _, a = forward_propagation(X_test, W1, b1, W2, b2)
a.shape, a.ravel().shape

((392, 1), (392,))

In [None]:
# Display the predicted test labels (coded as -1 / 1).
y_test

array([-1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,  1, -1,
       -1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,
       -1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

Creating required 'test.txt' file

In [None]:
# One predicted label per line, with no index column and no header row.
submission = pd.Series(y_test)
submission.to_csv('test.txt', header=False, index=False)