In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import json
import numpy as np
from torchvision import models
from torchsummary import summary

In [63]:
num_epochs = 30
batch_size = 32
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [35]:
f = open("strandenergylist.json")
data = json.load(f)
energy = []
strands = []
for pair in data:
    energy.append(pair[1])
    strands.append(pair[0])

In [36]:
intstrands = []
for strand in strands:
    intstrand = []
    for base in strand:
        if base == "A":
            intstrand.append(0)
        if base == "T":
            intstrand.append(1)
        if base == "C":
            intstrand.append(2)
        if base == "G":
            intstrand.append(3)
    intstrands.append(intstrand)

In [54]:
strandstensor = torch.tensor(intstrands)
onehotstrands = F.one_hot(strandstensor, num_classes=4)
onehotstrands = onehotstrands.reshape(1000000,120)
energytensor = torch.tensor(energy)

In [109]:
dataset = TensorDataset(onehotstrands.type(torch.float32), energytensor)
train_set, test_set = torch.utils.data.random_split(dataset, [900000, 100000])
dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
testdataloader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [114]:
class RegressionLSTM(nn.Module):
    def __init__(self):
        super(RegressionLSTM, self).__init__()

        self.lstm = nn.LSTM(input_size=120, hidden_size=270, num_layers=2, dropout=0.6)
        self.fc1 = nn.Linear(270,1)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.fc1(x)

        return x

In [115]:
lstm = RegressionLSTM() 
lstm.to(device)
lstmcriterion = nn.MSELoss()
lstmoptimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

In [116]:
model_parameters = filter(lambda p: p.requires_grad, lstm.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(params)

1008991


In [None]:
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (batch_data, batch_labels) in enumerate(dataloader):
        batch_data = batch_data.to(device)
        batch_labels = batch_labels.to(device)
        
        # Zero the parameter gradients
        lstmoptimizer.zero_grad()

        # Forward pass
        outputs = lstm(batch_data)

        # Compute the loss
        
        loss = lstmcriterion(outputs, batch_labels.unsqueeze(1))

        # Backward pass
        loss.backward()

        # Update the parameters
        lstmoptimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 9000 == 8999:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 9000))
            running_loss = 0.0
    
    

    totalloss = 0 
    with torch.inference_mode():
      lstm.eval()
      for data, labels in testdataloader:
        data = data.to(device)
        labels = labels.to(device)
        outputs = lstm(data)
        loss = lstmcriterion(outputs, labels.unsqueeze(1))
        totalloss += loss.item()
    
    lstm.train()

    avgloss = totalloss*batch_size/100000
    print('[%d] Test loss: %.3f' %(epoch + 1, avgloss))

[1,  9000] loss: 1.169
[1, 18000] loss: 1.064
[1, 27000] loss: 1.046
[1] Test loss: 1.043
[2,  9000] loss: 1.032
[2, 18000] loss: 1.025
[2, 27000] loss: 1.028
[2] Test loss: 1.030
[3,  9000] loss: 1.017
[3, 18000] loss: 1.014
[3, 27000] loss: 1.011
[3] Test loss: 1.033
[4,  9000] loss: 1.002
[4, 18000] loss: 1.010
[4, 27000] loss: 1.004
[4] Test loss: 1.022
[5,  9000] loss: 0.995
[5, 18000] loss: 0.996
[5, 27000] loss: 1.003
[5] Test loss: 1.018
[6,  9000] loss: 0.988
[6, 18000] loss: 0.992
[6, 27000] loss: 0.996
[6] Test loss: 1.021
[7,  9000] loss: 0.981
[7, 18000] loss: 0.989
[7, 27000] loss: 0.991
[7] Test loss: 1.011
[8,  9000] loss: 0.978
[8, 18000] loss: 0.983
[8, 27000] loss: 0.980
[8] Test loss: 1.012
[9,  9000] loss: 0.980
[9, 18000] loss: 0.969
[9, 27000] loss: 0.979
[9] Test loss: 1.013
[10,  9000] loss: 0.972
[10, 18000] loss: 0.969
[10, 27000] loss: 0.982
[10] Test loss: 1.009
[11,  9000] loss: 0.966
[11, 18000] loss: 0.969
[11, 27000] loss: 0.974
[11] Test loss: 1.008
[1

In [98]:
torch.save(lstm.state_dict(),"lstm1M2.pt")

In [95]:
testoutputs = []
testlabels = []

totalloss = 0 
with torch.inference_mode():
  for data, labels in testdataloader:
    data = data.to(device)
    labels = labels.to(device)
    outputs = lstm(data)
    loss = lstmcriterion(outputs, labels.unsqueeze(1))
    totalloss += loss.item()
    testoutputs.append(outputs)
    testlabels.append(labels)

avgloss = totalloss*batch_size/100000

In [96]:
print(avgloss)

1.0431535211086274


In [None]:
for i, j in zip(testoutputs, testlabels):
  print(i, end="")
  print(j)