# Benchmarking the TextCNN Model
> Sample code for running predictions with a TextCNN model on Word2Vec embeddings and evaluating the results

In [None]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

# Load the mRFP dataset and pull the raw sequences out as a plain Python list.
df_update = pd.read_csv("dataset_mRFP.csv")
# NOTE(review): `sequence` is not referenced again in the visible notebook
# (a later cell rebuilds the list as `sequences`) — confirm before removing.
sequence = df_update["Sequence"].tolist()

## Creating Fragments

In [None]:
import sys

# Put the parent directory on the import path (presumably where the `utils`
# package imported by the following cells lives). Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
from utils.fragmentation import KmerFragmenter

fragmenter = KmerFragmenter()
# Swap every "T" for "U" (presumably DNA -> RNA alphabet) in a single pass over
# the dataset's sequence column.
sequences = [
    seq.replace("T", "U") for seq in df_update['Sequence'].values.tolist()
]
# NOTE(review): the two 3s are passed positionally to split_words; they look
# like k-mer size and stride (non-overlapping codons) — confirm against
# utils.fragmentation.
fragments = fragmenter.split_words(sequences, 3, 3)

## Generate Embeddings

These embeddings will be used to train the CNN model. Here we use a simple Word2Vec model.

In [None]:
from utils.vectorizer import Vectorizer

# Instantiate the project's Vectorizer with its defaults; the Word2Vec model
# and the fragments are attached to it in the next cell.
vector_space_embedder = Vectorizer()

In [None]:
from gensim import models

# Path to the pre-trained gensim Word2Vec artifact.
# NOTE(review): the file name suggests skip-gram (sg_1), vector size 128 and
# window size 5 — confirm against the training configuration.
word2vec_path = './sg_1_vs_128_ws_5.model'
mod = models.Word2Vec.load(word2vec_path)
# Hand the loaded model and the fragments produced earlier to the embedder
# before asking it to build the embedding stack.
vector_space_embedder.model = mod
vector_space_embedder.rna_fragments = fragments
# NOTE(review): presumably one stacked matrix of per-fragment vectors for each
# sequence — confirm the output shape in utils.vectorizer.
vector_stack_w2v = vector_space_embedder.create_vector_concat()

### Save word2vec matrix for CNN model training
> See `benchmarks/textcnn` for training code

In [None]:
# Opt-in persistence: leave `save` as False to skip writing; set it to True to
# dump the embedding matrix to disk as a .npy file.
save = False
if save:
    np.save(file="dataset_mRFP_embeddings.npy", arr=vector_stack_w2v)

## Import Trained CNN model
We trained the model with the embeddings from above by running the `UDS-CodonBERT/benchmarks/textcnn/main.py` script.

Below we import the saved model artifact.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class CNN_Text(nn.Module):
    """Convolutional text model applied to pre-computed embedding matrices.

    One ``Conv2d`` per entry in ``kernel_sizes`` slides along the sequence axis
    with a kernel spanning the full embedding width. Each feature map is
    max-pooled over the sequence, the pooled features are concatenated, passed
    through dropout and projected to a single output value.

    The model has no embedding layer: ``forward`` expects embedding vectors,
    not token ids.

    Args:
        args: Mapping read with ``.get``. Keys used:
            ``embed_dim`` (D, width of each embedding vector),
            ``kernel_num`` (Co, output channels per convolution),
            ``kernel_sizes`` (Ks, iterable of kernel heights) and
            ``dropout`` (dropout probability).
            ``embed_num`` and ``static`` are accepted for compatibility with
            existing configs but have no effect, because there is no
            embedding layer to size or freeze.
    """

    def __init__(self, args):
        super(CNN_Text, self).__init__()
        self.args = args

        D = args.get("embed_dim")
        C = 1  # single output unit
        Ci = 1  # the embedding matrix is treated as a one-channel image
        Co = args.get("kernel_num")
        Ks = args.get("kernel_sizes")

        # Attribute names (convs, dropout, fc1) define the state_dict keys and
        # must stay unchanged so previously saved checkpoints keep loading.
        self.convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        self.dropout = nn.Dropout(args.get("dropout"))
        self.fc1 = nn.Linear(len(Ks) * Co, C)

        # The original code froze ``self.embed.weight`` when ``static`` was
        # set, but the embedding layer is not created, so that branch raised
        # AttributeError. With pre-computed inputs there is nothing to freeze,
        # so ``static`` is intentionally a no-op.

    def forward(self, x):
        """Run the network on a batch of embedding matrices.

        Args:
            x: Tensor of shape (N, W, D) where D equals ``embed_dim`` and W is
                at least the largest kernel size.

        Returns:
            Tensor of shape (N, 1).
        """
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        # Each kernel spans the whole embedding width, so the last axis
        # collapses to size 1 and is squeezed away.
        x = [
            F.relu(conv(x)).squeeze(3) for conv in self.convs
        ]  # [(N, Co, W'), ...]*len(Ks)
        # Max-over-time pooling: keep the strongest response per filter.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        return logit

In [None]:
# Hyper-parameters used to rebuild the network. The layer shapes they imply
# must match the saved checkpoint, otherwise load_state_dict() rejects it.
# ("embed_num" and "static" are not used by the layers CNN_Text builds.)
args = {
    "embed_num": 6,
    "embed_dim": 128,
    "kernel_num": 100,
    "kernel_sizes": [3, 4, 5],
    "dropout": 0.1,
    "static": False,
}

# NOTE: the three paths below are empty placeholders — fill in the locations of
# the test embeddings, the test targets and the trained checkpoint before
# running this cell.
X_test = np.load("")
y_test = np.load("")
cnn = CNN_Text(args)
# map_location="cpu" lets a checkpoint saved from a GPU run load on a CPU-only
# machine; the predictions below are converted with .numpy(), which needs CPU
# tensors anyway.
cnn.load_state_dict(torch.load("", map_location="cpu"))

## Benchmark system

> Pass the test cases through the neural network and measure the correlation between the predictions and the observed values

In [None]:
# Switch to inference mode (disables dropout) and skip gradient tracking.
cnn.eval()
with torch.no_grad():
    # Cast to float32 so the input matches the model's parameter dtype even if
    # the .npy file was stored as float64. squeeze(1) drops only the
    # single-output axis, so a one-sample test set still yields a 1-D array.
    test_preds = (
        cnn(torch.as_tensor(X_test, dtype=torch.float32))
        .squeeze(1)
        .numpy()
    )

In [None]:
from scipy import stats
from sklearn.metrics import mean_squared_error

def train_test_acc(y_test, y_pred):
    """Score predictions against observed targets.

    Args:
        y_test: Observed target values.
        y_pred: Predicted values, aligned element-wise with ``y_test``.

    Returns:
        A ``(mse, spearman)`` tuple: the mean squared error followed by the
        Spearman rank correlation coefficient. Despite the function name, the
        first element is an error (lower is better), not an accuracy.
    """
    # spearmanr returns (coefficient, p-value); only the coefficient is kept.
    rank_corr, _ = stats.spearmanr(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    return mse, rank_corr

In [None]:
# Score the held-out predictions; `acc` holds the mean squared error, and only
# the rank correlation is printed here.
acc, spr = train_test_acc(y_test=y_test, y_pred=test_preds)
print(f"Spearman correlation: {round(spr, 3)}")

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
# Overall extremes across predictions and observations, used as the end points
# of the identity line. np.max/np.min reduce the arrays in one vectorised call
# and also work when the targets are not 1-D.
p1 = max(np.max(test_preds), np.max(y_test))
p2 = min(np.min(test_preds), np.min(y_test))

ax.scatter(y_test, test_preds)
# Red y = x reference line: points on it are perfect predictions.
ax.plot([p1, p2], [p1, p2], "r-")
# Label through the Axes object so the cell does not depend on pyplot's
# notion of the "current" axes.
ax.set_xlabel("Observed")
# Trailing semicolon suppresses the text repr in notebook output.
ax.set_ylabel("Predicted");