In [7]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.utils.data as Data
import numpy as np
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from PIL import Image
import random

from sklearn.metrics import accuracy_score
from copy import deepcopy

## Read in training data

In [107]:
seq = np.load("../data/final_data/trainingSequence.npz")
training_dna = seq["dna"].astype(np.int8)
training_dna_labels = seq["labels"].astype(np.int8)
training_dna_labels = np.where(training_dna_labels == [2], [0], [1])

seq = np.load("../data/final_data/testingSequence.npz")
testing_dna = seq["dna"].astype(np.int8)
testing_dna_labels = seq["labels"].astype(np.int8)
testing_dna_labels = np.where(testing_dna_labels == [2], [0], [1])


In [108]:
print(training_dna.shape)
print(training_dna_labels.shape)
print(testing_dna.shape)
print(testing_dna_labels.shape)

(11162, 2200, 4)
(11162, 1)
(1222, 2200, 4)
(1222, 1)


In [109]:
chip_seq = np.load("../data/final_data/trainingChip.npz")
training_chip = chip_seq["chip"].astype(np.float32)
training_chip_labels = chip_seq["labels"].astype(np.int8)
training_chip_labels = np.where(training_chip_labels == [2], [0], [1])

chip_seq = np.load("../data/final_data/testingChip.npz")
testing_chip = chip_seq["chip"].astype(np.float32)
testing_chip_labels = chip_seq["labels"].astype(np.int8)
testing_chip_labels = np.where(testing_chip_labels == [2], [0], [1])


In [110]:
print(training_chip.shape)
print(training_chip_labels.shape)
print(testing_chip.shape)
print(testing_chip_labels.shape)

(11162, 2200, 20)
(11162, 1)
(1222, 2200, 20)
(1222, 1)


## Build CNN

In [111]:
class Pol3_CNN(nn.Module):
    def __init__(self):
        super().__init__()
        

        """
        Define your layers here.
        """
        
        self.conv1 = nn.Conv1d(in_channels=2200,out_channels=320,kernel_size=4, padding=70)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.bn1 = nn.BatchNorm1d(320)
        self.conv2 = nn.Conv1d(in_channels=320, out_channels=480, kernel_size=4)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.bn2 = nn.BatchNorm1d(480)
        self.output_layer = nn.Linear(4320, 1)


    def forward(self, X):

        """
        Design the process of your network.
        """
        
        X = self.conv1(X)
        X = self.bn1(X)
        X = self.relu1(X)
        X = self.maxpool1(X)
        
        X = self.conv2(X)
        X = self.bn2(X)
        X = self.relu2(X)
        X = self.maxpool2(X)
        
        X = torch.flatten(X, 1)        

        logits = self.output_layer(X)
        return logits   # do not apply softmax

    def classify(self, X):
        
        """
        Write a function that outputs the labels.
        """
        logits = self(X)
        logits = F.softmax(logits, dim=1)
        logits = nn.functional.normalize(logits, dim = 1)
        labels = torch.argmax(logits, dim=1)
        return labels.type(torch.long)

In [112]:
def train(model, training_data, training_labels, testing_data, testing_labels, epochs=15, batch_size=16, lr=1e-3):
    """
    Q:  write the training loop following the schema shown above.

    Inputs
    - model: the model to be trained - a PyTorch nn.Module class object
    - X_train, y_train, X_val, y_val: training and validation data
    - epochs: num epochs, or the number of times we want to run through the entire training data
    - batch_size: number of data points per batch
    - lr: learning rate
    - optimizer: optimizer used

    Outputs
    - losses: a list of losses
    - accuracies: a list of accuracies
    """

    torch.cuda.empty_cache()

    best_acc = -1
    best_model = None

    batches = int(np.ceil(len(training_data) / batch_size))

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    losses = []
    accuracies = []

    testing_data = torch.tensor(testing_data).type(torch.float32)
    testing_labels = torch.tensor(testing_labels).type(torch.float32)

    for epoch in range(epochs):
        for i in range(batches):
#             if i % 1000 == 0:
#               print("Epoch " + str(epoch+1) + "/" + str(epochs) + ": " + str(i/batches*100) + "%")
            X_batch = training_data[i*batch_size:i*batch_size+batch_size]
            X_batch = torch.tensor(X_batch).type(torch.float32)
            y_batch = training_labels[i*batch_size:i*batch_size+batch_size]
            y_batch = torch.tensor(y_batch).type(torch.float32)
            
            logits = model(X_batch)
            loss = loss_fn(logits, y_batch)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        # calculate the validation accuracy and append the loss of this epoch
        y_pred_test = model.classify(testing_data)
        accuracy = accuracy_score(testing_labels.cpu(), y_pred_test.cpu())
        accuracies.append(accuracy)
        losses.append(loss)

        if accuracy > best_acc:
          best_acc = accuracy
          best_model = deepcopy(model)

        # print epoch, loss, and current test accuracy (don't delete this line - it's slightly more organized now)
        print(f"Epoch {epoch + 1}:\tloss {np.round(loss.detach().cpu().numpy().item(), 4)}\t& accuracy {np.round(accuracy, 4)}")
    print(f"Resetting model... Best validation accuracy:\t{np.round(best_acc, 4)}")
    model.load_state_dict(best_model.state_dict())
    return losses, accuracies


In [113]:
model = Pol3_CNN()
losses, accuracies = train(model, training_dna, training_dna_labels, testing_dna, testing_dna_labels, batch_size=100, epochs=10)


Epoch 1:	loss -0.0	& accuracy 0.5
Epoch 2:	loss -0.0	& accuracy 0.5
Epoch 3:	loss -0.0	& accuracy 0.5
Epoch 4:	loss -0.0	& accuracy 0.5
Epoch 5:	loss -0.0	& accuracy 0.5
Epoch 6:	loss -0.0	& accuracy 0.5
Epoch 7:	loss -0.0	& accuracy 0.5
Epoch 8:	loss -0.0	& accuracy 0.5
Epoch 9:	loss -0.0	& accuracy 0.5


KeyboardInterrupt: 