# Packages used

In [21]:
# !pip install pandas
# !pip install torch
# !pip install matplotlib
# !pip install "numpy<2"

In [None]:
import pandas as pd
import torch

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import functools
import re
import numpy as np
import os
from sys import platform
from model import Model

In [23]:
# hyperparameters
hiddenDim = 512          # hidden width handed to Model
batch_size = 16          # batch size for the training DataLoader
outputDim = 3            # three coords for each output
learning_rate = 0.0001   # step size for the Adam optimizer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cpu


# Preprocessing

In [24]:

# input processing

# Kaggle kernels run on Linux and mount datasets under /kaggle/input/; on every other
# platform (macOS, Windows, ...) the data is expected relative to the working directory.
# The previous if/elif left inputDir undefined outside linux/darwin (NameError below).
inputDir = "/kaggle/input/" if platform.startswith("linux") else "."
trainSeqs = pd.read_csv(os.path.join(inputDir,"stanford-rna-3d-folding/train_sequences.csv"))

# Every distinct symbol that occurs at any position of any training sequence,
# plus the "NONE" placeholder used to pad short sequences.
nucs = set("".join(trainSeqs["sequence"]))
nucs.add("NONE")

# Enumerate in sorted order: iterating a set of strings directly has no stable order
# across interpreter runs, so the symbol -> index mapping (and therefore the meaning of
# a saved model's input indices) would otherwise change from one kernel run to the next.
nucToIdx = {nuc: i for i, nuc in enumerate(sorted(nucs))}

def processSeqDF(inputSeqs,nucToIdx):
    indexSeqs = inputSeqs["sequence"].apply(list)
    indexSeqs = indexSeqs.apply(pd.Series).fillna("NONE")
    idxToKeep = torch.tensor((indexSeqs != 'NONE').to_numpy(int),dtype=int).unsqueeze(-1)



    indexSeqs = indexSeqs.map(lambda nuc: nucToIdx[nuc])
    indexSeqs.index = inputSeqs["target_id"]

    seqTensor = torch.tensor(indexSeqs.values, dtype=torch.int)
    return seqTensor, indexSeqs, idxToKeep

seqTensor,_ , idxToKeep = processSeqDF(trainSeqs, nucToIdx)

In [25]:
# output label processing

trainLabels = pd.read_csv(os.path.join(inputDir,"stanford-rna-3d-folding/train_labels.csv"))
# Strip the trailing "_<digits>" from every ID so all rows of one target share an ID
# (presumably the suffix is the residue number -- verify against the CSV).
# Raw string: "\d" in a normal string literal is an invalid escape and triggers a warning.
trainLabels["ID"] = trainLabels["ID"].map(lambda x: re.sub(r"_\d+$","",x))

# One row per ID; columns are (coordinate, resid) pairs for x_1, y_1 and z_1.
pivotedLabels = trainLabels.pivot(index="ID",columns="resid",values=["x_1","y_1","z_1"])
# pivot() returns its rows sorted by ID, whereas seqTensor follows the row order of
# trainSeqs. Reorder the labels to that same order so zipping sequences with labels
# pairs each sequence with its own coordinates. Targets without labels become all-NaN
# rows here and are zero-filled below.
pivotedLabels = pivotedLabels.reindex(trainSeqs["target_id"])
# Missing residues (padding beyond a target's length, or absent coordinates) become 0.
pivotedLabels = pivotedLabels.fillna(0)

# (targets, 3 * positions) -> (targets, 3, positions) -> (targets, positions, 3)
labelTensor = torch.tensor(pivotedLabels.to_numpy().reshape(pivotedLabels.shape[0],3,-1)).transpose(1,2)
print(labelTensor[10,:20,0])

# Replace absolute coordinates by the offset from the previous residue; the first residue
# keeps its absolute coordinate because it is compared against zero.
# Example for one axis: positions 10, 20, 25, 28 become deltas 10, 10, 5, 3.
labelTensor = labelTensor - torch.concat((torch.zeros_like(labelTensor[:,:1]), labelTensor[:,:-1]), dim=1)

print(labelTensor[10,:20,0])
# labelTensor = torch.nn.functional.normalize(labelTensor)

  trainLabels["ID"] = trainLabels["ID"].map(lambda x: re.sub("_\d+$","",x))


tensor([ -8.8940,  -6.2430,  -4.4540,  -3.7060,  -5.6050, -10.1530, -13.1060,
        -18.6920, -18.3870, -19.1380, -17.9000, -13.7290,  -8.1270,  -3.6100,
          0.0590,   0.5840,   1.4240,   0.0000,   0.0000,   0.0000],
       dtype=torch.float64)
tensor([-8.8940,  2.6510,  1.7890,  0.7480, -1.8990, -4.5480, -2.9530, -5.5860,
         0.3050, -0.7510,  1.2380,  4.1710,  5.6020,  4.5170,  3.6690,  0.5250,
         0.8400, -1.4240,  0.0000,  0.0000], dtype=torch.float64)


In [26]:
# make data loader
# Each training example is a (sequence indices, keep-mask, coordinate labels) triple.
trainExamples = list(zip(seqTensor, idxToKeep, labelTensor))
train_loader = DataLoader(trainExamples, batch_size=batch_size, shuffle=True)

In [27]:
## test input processing
testCsvPath = os.path.join(inputDir,"stanford-rna-3d-folding/test_sequences.csv")
testSeqs = pd.read_csv(testCsvPath)

# NOTE: this rebinds seqTensor and idxToKeep, which previously held the training data;
# re-running the training data loader cell after this one would pick up the test tensors.
seqTensor, indexSeqs, idxToKeep = processSeqDF(testSeqs, nucToIdx)

# Each test example is a (sequence indices, keep-mask, target id) triple.
testExamples = list(zip(seqTensor,idxToKeep, indexSeqs.index))
test_loader = DataLoader(testExamples, batch_size=2)

# model architecture and training

In [29]:
def plotRollingAvg(losses,window = 100):
    """Plot the raw loss curve together with a smoothed version.

    Args:
        losses: sequence of per-step loss values.
        window: number of consecutive values averaged for each smoothed point.

    The smoothed value at index i is the mean of losses[i:i+window], i.e. the window
    looks forward and gets shorter near the end of the sequence.
    """
    # Raw curve, semi-transparent so the smoothed curve stays readable on top of it.
    plt.plot(losses,alpha=0.5)

    smoothed = []
    for start in range(len(losses)):
        chunk = losses[start:start+window]
        smoothed.append(sum(chunk) / len(chunk))
    plt.plot(smoothed,color="b")


In [30]:
def train(model, opter, train_loader,numEpochs = 100,logFreq = 5):
    """Run the optimisation loop and return the per-step losses.

    Args:
        model: module called as model(batchSeqs) to produce per-position predictions.
        opter: optimizer over the model's parameters.
        train_loader: iterable yielding (sequences, keep-mask, labels) batches.
            Assumes the mask is (batch, length, 1) with 1 on real positions and 0 on
            padding, as produced by processSeqDF -- TODO confirm for other loaders.
        numEpochs: number of passes over train_loader.
        logFreq: log and checkpoint every logFreq steps.

    Returns:
        List of float loss values, one per optimisation step. Pressing Ctrl-C
        (KeyboardInterrupt) stops training early and still returns the losses so far.

    Side effects:
        Appends progress lines to "trainingLog.txt", prints them, and overwrites
        "model.pt" with the whole model at every logged step.
    """
    losses = []
    try:
        with open("trainingLog.txt", "a") as logFile:
            for epoch in range(numEpochs):
                for step, (batchSeqs, batchIdx, batchLabels) in enumerate(train_loader):
                    batchSeqs = batchSeqs.to(device)
                    batchLabels = batchLabels.to(device)
                    batchIdx = batchIdx.to(device)
                    opter.zero_grad()

                    # Number of real (unmasked) positions per sequence.
                    seqLens = torch.sum(batchIdx, dim=1)

                    # Trim the padding that no sequence in this batch needs. A plain int
                    # is used for slicing instead of a one-element tensor.
                    maxLen = int(seqLens.max())
                    batchSeqs = batchSeqs[:,:maxLen]
                    batchIdx = batchIdx[:,:maxLen]
                    batchLabels = batchLabels[:,:maxLen]

                    # MSE that accounts for differing lengths. This makes short sequences have a
                    # similar error to long sequences and zeros out errors on non-existent positions.
                    # Lengths are clamped to 1 so an empty sequence contributes 0 instead of 0/0 = NaN.
                    safeLens = seqLens.clamp(min=1).unsqueeze(-1)
                    sqErr = (batchLabels - model(batchSeqs)) ** 2
                    loss = torch.sum(sqErr * batchIdx / safeLens)

                    losses.append(loss.item())
                    loss.backward()
                    opter.step()

                    if step % logFreq == 0:
                        # Mean of the most recent logFreq losses, computed once for both sinks.
                        recentMean = torch.mean(torch.tensor(losses[-logFreq:]))
                        print("epoch", epoch,"step", step, recentMean, file=logFile,flush=True)
                        print("epoch", epoch,"step", step, recentMean,flush=True)
                        torch.save(model, "model.pt")
    except KeyboardInterrupt:
        # Manual interruption is an expected way to stop training; keep what we have.
        pass
    return losses


In [31]:
# Vocabulary size: every symbol collected in nucs, including the "NONE" padding token.
numDistinctInputs = len(nucs)
model = Model(numDistinctInputs, hiddenDim, outputDim, device)

# Adam over all model parameters with the configured learning rate.
opter = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
# losses = train(model, opter, train_loader)


made model with 6309891 parameters of which 6309891 are trainable


In [32]:
# plotRollingAvg(losses)