## DATA305 - Project

In [None]:
# Mount Google Drive inside the Colab runtime; the project CSVs are later read
# from paths under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# load in required packages
import pandas as pd
import os
import random
import numpy as np
from tensorflow.keras import layers
import tensorflow as tf

In [None]:
# https://link.springer.com/article/10.1007/s10462-025-11148-3
# This article indicates CNNs as being a good choice for APCs, so
# have decided to implement that architecture

In [None]:
# Seed for the one-off set-up steps (e.g., the train/validation split).
random_seed = 0


def set_seeds(seed=random_seed):
  """Seed every source of randomness this notebook touches.

  Writes the PYTHONHASHSEED and TF_DETERMINISTIC_OPS environment variables and
  seeds Python's `random` module, NumPy's global generator and TensorFlow.

  Args:
    seed: Value handed to each seeding call. Defaults to the value
      `random_seed` held when this function was defined.
  """
  os.environ["PYTHONHASHSEED"] = str(seed)
  # Same order as before: Python, then NumPy, then TensorFlow.
  for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
    apply_seed(seed)
  os.environ["TF_DETERMINISTIC_OPS"] = "1"

In [None]:
# Apply the default seed (random_seed) before the data is loaded and split.
set_seeds()

In [None]:
# Read the training and test CSVs from the project folder on the mounted Drive.
data_dir = "/content/drive/My Drive/DATA305 Project Data"
train = pd.read_csv(f"{data_dir}/train.csv")
test = pd.read_csv(f"{data_dir}/test.csv")

**Preprocessing**

In [None]:
# Length of every training sequence, used to pick the padded input length.
train["FASTA_length"] = train["FASTA"].str.len()

# Author's observation from the summary below: min size 10, max size 50.
# Sequences are padded/truncated to this length by the vectorisation layer.
SEQ_MAX_SIZE = 50

# Kept as the cell's final expression so the summary is actually rendered;
# previously it sat mid-cell and its output was silently discarded.
train["FASTA_length"].describe()

In [None]:
# Collect every distinct character that appears in the training sequences.
unique_chars_train = set().union(*train["FASTA"])

# 20 unique amino acids, plus 2 for the unknown and padding tokens.
VOCAB_SIZE = 22

# Kept as the cell's final expression so the count is actually displayed;
# previously it sat mid-cell and its output was silently discarded.
len(unique_chars_train)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled training data as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    train["FASTA"],
    train["label"],
    test_size=0.10,
    random_state=random_seed,
)


def _as_batched_dataset(sequences, labels):
  """Wrap parallel sequence/label arrays as a tf.data pipeline.

  The pipeline yields batches of 24 examples and prefetches with AUTOTUNE.
  """
  pairs = tf.data.Dataset.from_tensor_slices((sequences, labels))
  return pairs.batch(24).prefetch(tf.data.AUTOTUNE)


train_ds = _as_batched_dataset(X_train.values, y_train.values)
valid_ds = _as_batched_dataset(X_val.values, y_val.values)
test_ds = _as_batched_dataset(test["FASTA"].values, test["label"].values)

**Modelling**

In [None]:
# Checkpoint and early-stopping callbacks passed to model.fit() in the training loop.

# Weights-only checkpoint, written only when the monitored value improves.
# monitor="val_loss" is the Keras default, spelled out here for clarity.
# NOTE(review): this single instance is reused for every seed run, so its
# notion of "best" presumably carries over between fit() calls - confirm
# whether one checkpoint file per seed is wanted.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "model_feature_extract.weights.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
)

# Early stopping must watch a *validation* metric. Monitoring the training
# "accuracy" (as before) lets training run on while validation loss is already
# rising, and restore_best_weights then restores the most over-fitted epoch.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=3,
    restore_best_weights=True,
)

In [None]:
# One metrics dictionary (from model.evaluate) is appended per seeded run.
results_list = []

# Character-level tokeniser shared by every run. max_tokens is tied to
# VOCAB_SIZE so the vocabulary and the Embedding input dimension cannot drift
# apart (it was previously a separate hard-coded 22).
seq_vec_layer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    split="character",
    output_sequence_length=SEQ_MAX_SIZE,
)
seq_vec_layer.adapt(tf.constant(train["FASTA"].tolist()))

embed_size = 180


def build_model():
  """Build and compile a fresh multi-branch 1D-CNN sequence classifier.

  Raw strings are tokenised by the shared `seq_vec_layer` and embedded, then
  passed through three parallel Conv1D branches (kernel sizes 3, 5 and 7, with
  64 filters each). The branch outputs are concatenated, max-pooled over the
  sequence axis and fed through a small dense head ending in one sigmoid unit.

  Returns:
    A compiled `tf.keras.Model` with newly initialised weights.
  """
  inputs = tf.keras.Input(shape=(), dtype=tf.string)
  embedded = layers.Embedding(VOCAB_SIZE, embed_size)(seq_vec_layer(inputs))
  # Same branch order as before (3, 5, 7) so layer creation order is unchanged.
  branches = [
      layers.Conv1D(64, kernel_size, activation="relu", padding="same")(embedded)
      for kernel_size in (3, 5, 7)
  ]
  x = layers.Concatenate()(branches)
  x = layers.GlobalMaxPooling1D()(x)
  x = layers.Dense(64, activation="relu")(x)
  x = layers.Dropout(0.2)(x)
  outputs = layers.Dense(1, activation="sigmoid")(x)
  model = tf.keras.Model(inputs=inputs, outputs=outputs)
  # The string "adam" selects Keras' default Adam configuration. The separate
  # Adam(learning_rate=1e-4) object this cell used to create was never passed
  # to compile(), so it has been removed as dead code.
  model.compile(
      loss="binary_crossentropy",
      optimizer="adam",
      metrics=[
          tf.keras.metrics.AUC(curve="PR", name="PR_AUC"),
          tf.keras.metrics.AUC(curve="ROC", name="ROC_AUC"),
          "accuracy",
      ],
  )
  return model


for i in range(1, 6):
  # Re-seed before building so each run starts from seed-specific initial weights.
  set_seeds(seed=i)
  model = build_model()
  history = model.fit(
      train_ds,
      validation_data=valid_ds,
      epochs=50,
      callbacks=[checkpoint_cb, early_stopping_cb],
  )

  # Evaluate the model using test.csv
  results = model.evaluate(test_ds, return_dict=True)
  results_list.append(results)
  print(results)
  # Clear Keras' global state between runs; a new model is built next iteration.
  tf.keras.backend.clear_session()

Epoch 1/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 35ms/step - PR_AUC: 0.7405 - ROC_AUC: 0.7259 - accuracy: 0.6543 - loss: 0.6147 - val_PR_AUC: 0.8516 - val_ROC_AUC: 0.8387 - val_accuracy: 0.7585 - val_loss: 0.5207
Epoch 2/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - PR_AUC: 0.8645 - ROC_AUC: 0.8589 - accuracy: 0.7889 - loss: 0.4685 - val_PR_AUC: 0.8511 - val_ROC_AUC: 0.8396 - val_accuracy: 0.7458 - val_loss: 0.5425
Epoch 3/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - PR_AUC: 0.8973 - ROC_AUC: 0.8875 - accuracy: 0.8084 - loss: 0.4213 - val_PR_AUC: 0.8580 - val_ROC_AUC: 0.8411 - val_accuracy: 0.7331 - val_loss: 0.5742
Epoch 4/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - PR_AUC: 0.9237 - ROC_AUC: 0.9137 - accuracy: 0.8478 - loss: 0.3692 - val_PR_AUC: 0.8572 - val_ROC_AUC: 0.8309 - val_accuracy: 0.7331 - val_loss: 0.6205
Epoch 5/50
[1m89/89[0m [32m━━━━━━━━━━━━━

In [None]:
# One row of test metrics per seeded run, indexed by the seed that produced it.
seed_labels = [1, 2, 3, 4, 5]
results = (
    pd.DataFrame(results_list)
    .assign(Seeds=seed_labels)
    .set_index("Seeds")
)
results

Unnamed: 0_level_0,PR_AUC,ROC_AUC,accuracy,loss
Seeds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.457429,0.835537,0.826113,0.532637
2,0.458733,0.828579,0.82819,0.546094
3,0.503059,0.851088,0.835905,0.533001
4,0.430477,0.820077,0.820772,0.548904
5,0.494572,0.83115,0.836498,0.627337


In [None]:
# Summarise the per-seed metrics as the mean and standard deviation of PR AUC
# and ROC AUC.
#
# `results` is deliberately NOT reassigned here. The cell used to overwrite it
# with the aggregated frame, so re-running the cell raised a KeyError on
# results["ROC_AUC"].
#
# np.std on a Series uses ddof=0 (population standard deviation), which is the
# convention behind the values reported below.
results_final = pd.Series(
    {
        "mean PR AUC": results["PR_AUC"].mean(),
        "mean ROC AUC": results["ROC_AUC"].mean(),
        "std ROC AUC": np.std(results["ROC_AUC"]),
        "std PR AUC": np.std(results["PR_AUC"]),
    },
    name="Values",
).to_frame()
results_final

Unnamed: 0,Values
mean PR AUC,0.468854
mean ROC AUC,0.833286
std ROC AUC,0.010229
std PR AUC,0.026598


## Discussion

**Learnings**

Initially I started with a deep model with three CNN layers, but found that it was not performing too well, likely due to the size of the training set, and reduced it to two layers - which improved performance.

Initially I also had the number of filters increasing with each subsequent CNN layer, in order to capture simple features in the lower layers and more complex features in the upper layers. However, I redesigned the architecture after I discovered parallelisation of CNN filters, which Ahmed et al. (2021) applied to a similar problem. This approach could prevent the overfitting problems that come with stacking multiple layers, whilst still giving the benefits of various kernel sizes, according to Martins (2023).

I found that there could be a trade-off between PR AUC and ROC AUC: some models that did better on ROC AUC did poorly on PR AUC, and vice versa. I ensured that there was a balance, resulting in a model that met the requirements specified by the assignment:
- mean PR_AUC >= 0.43
- mean ROC_AUC >= 0.83

I made a good model which performed well on an initial seed. However, what I didn't realise was that clear_session() didn't reset the model weights so that the model could be re-trained. As a result, it performed very poorly for my other seeds, because it was continuing to train the already-good model (and overfitting), and therefore performed worse on subsequent seeds / testing sets. I spent a lot of time trying to debug / improve my model to perform on the different seeds, but the problem was actually my understanding of clear_session(), which was impacting the results of my subsequent testing on different seeds.

I also learned that it's good practice to test the performance of a model across various seeds.

**Difficulties**

I had terrible difficulty hitting the assignment requirements for ROC AUC and PR AUC; however, once I realised the problem lay with the clear_session function rather than with my model, this was no longer an issue.

**What has worked and not worked**

I thought a kernel size of 10 might allow the model to detect longer motifs and improve performance. However, it caused PR AUC to increase substantially but resulted in ROC AUC falling below the assignment specifications (i.e., the PR AUC / ROC AUC trade-off), so I removed it.

Initially I also tried decreasing the number of filters with depth for the CNN (before I moved to the parallelised architecture); however, this didn't perform well, so I went with a static number of filters. With the parallelisation, this was kept as a static / fixed number for each parallel CNN branch.

Additionally, for my early stopping I used accuracy rather than ROC AUC or PR AUC, as using one of those criteria (ROC AUC vs PR AUC) would cause the other to perform worse, relating to the trade-off between the two criteria.

**Future directions**
- Consulting with a domain expert to consider appropriate transformations or model designs to better capture features.
- Consider hyperparameter tuning of the model

**References**

Ahmed, S., Muhammod, R., Khan, Z. H., Adilina, S., Sharma, A., Shatabda, S., & Dehzangi, A. (2021). ACP-MHCNN: an accurate multi-headed deep-convolutional neural network to predict anticancer peptides. *Scientific reports*, *11*(1), 23676. https://doi.org/10.1038/s41598-021-02703-3

Martins, C. (2023). *Inception — Understanding Multiple Parallel Convolutional Layers.* Medium. https://cdanielaam.medium.com/inception-understanding-multiple-parallel-convolutional-layers-7be281aab2da
