In [1]:
import torch

In [2]:
import pandas as pd
import glob
import numpy as np

In [3]:
li = []
for filename in glob.glob("*.tsv"):
    df = pd.read_csv(filename, sep="\t")
    li.append(df)
    
data = pd.concat(li, axis=0, ignore_index=True)

In [3]:
data = pd.read_csv("danRer11_chopchop.csv")

In [4]:
data

Unnamed: 0,Rank,Target sequence,Genomic location,Self-complementarity,MM0,MM1,MM2,MM3,Efficiency,Batch Zone
0,1,CGATGTTGGGAAACTTGGGTAGG,chr10:10016472,0,1,0,0,0,70.94,chr10:10016197-10016497
1,2,GATGAATGAGGAACGCGCAGCGG,chr10:10016403,0,1,0,0,0,64.36,chr10:10016197-10016497
2,3,TCAATTCATTATTCACGCGGAGG,chr10:10016437,0,1,0,0,0,59.94,chr10:10016197-10016497
3,4,ACGCGTCTTGAGCACTCGCTGGG,chr10:10016211,0,1,0,0,0,51.73,chr10:10016197-10016497
4,5,AAGATCGATGTTGGGAAACTTGG,chr10:10016477,0,1,0,0,0,38.63,chr10:10016197-10016497
...,...,...,...,...,...,...,...,...,...,...
283021,17,CAGAAAAGAAGATACTCTGGGGG,chr9:9961117,0,0,0,4,29,70.45,chr9:9960819-9961119
283022,18,CTGGGGGAAAAAAAGCTGAAAGG,chr9:9961101,0,0,0,5,32,58.27,chr9:9960819-9961119
283023,19,GAGCAGAAAAGAAGATACTCTGG,chr9:9961120,1,0,0,3,38,40.29,chr9:9960819-9961119
283024,20,GCAGAAAAGAAGATACTCTGGGG,chr9:9961118,0,0,0,2,47,69.60,chr9:9960819-9961119


In [5]:
train_data = data[:10000]
test_data = data[10000:12000]

In [6]:
data['Target sequence'][0]

'TTGCGTAATCTTGTGAGAGTCGG'

In [7]:
data['Efficiency']

0       59.78
1       38.36
2       58.56
3       62.58
4       60.03
        ...  
1182    48.42
1183    29.75
1184    52.38
1185    38.71
1186    46.64
Name: Efficiency, Length: 1187, dtype: float64

In [35]:
transform_sequence(data['Target sequence'][0]).shape

(92,)

In [6]:
def transform_sequence(seq):
    m = np.zeros((len(seq), 4))
    for i, char in enumerate(seq):
        if char == 'A':
            m[i][0] = 1
        elif char == 'T':
            m[i][1] = 1
        elif char == 'C':
            m[i][2] = 1
        elif char == 'G':
            m[i][3] = 1
    m = m.reshape(m.shape[0]*m.shape[1])
    return m

def transform_sequence_rnn(seq):
    m = np.zeros((len(seq), 4))
    for i, char in enumerate(seq):
        if char == 'A':
            m[i][0] = 1
        elif char == 'T':
            m[i][1] = 1
        elif char == 'C':
            m[i][2] = 1
        elif char == 'G':
            m[i][3] = 1
#     m = m.reshape(m.shape[0]*m.shape[1])
    return m


In [108]:
data['Target sequence'].values

array(['TTGCGTAATCTTGTGAGAGTCGG', 'CACACATACGTCCGTGCTGCTGG',
       'GCTCCCTCTAGTGCTTTGGTTGG', ..., 'CATTGAGAGCCGTGTGCCGAAGG',
       'ACACGGCTCTCAATGACATTTGG', 'TCAAAACTTTTTCCTATGAAGGG'], dtype=object)

In [134]:
transform_sequence(data['Target sequence'].values)

array([0., 0., 0., ..., 0., 0., 0.])

In [12]:
class GeneDataset(object):
    def __init__(self, data, use_rnn=False):
        self.target_sequence = data['Target sequence'].values
        self.efficiency = data['Efficiency'].values
        self.use_rnn = use_rnn
    
    def __getitem__(self, idx):
        if self.use_rnn:
            seq = torch.as_tensor(transform_sequence_rnn(self.target_sequence[idx]), dtype=torch.float32)
        else:
            seq = torch.as_tensor(transform_sequence(self.target_sequence[idx]), dtype=torch.float32)
        eff = torch.as_tensor(self.efficiency[idx] / 100, dtype=torch.float32)
        return seq, eff
    
    def __len__(self):
        return len(self.target_sequence)

In [58]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(92, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        x = torch.sigmoid(x)
        return x

class RNN_Net(nn.Module):
    def __init__(self):
        super(RNN_Net, self).__init__()
        self.lstm = nn.LSTM(input_size=4, hidden_size=16, num_layers=2, batch_first=True)
        self.fc = nn.Linear(16, 1)

    def forward(self, x):
        x1, _ = self.lstm(x)
        x2 = torch.mean(x1, 1)
#         x3 = self.fc(x1[:,-1,:])
        x3 = torch.sigmoid(x2)
        
        return x3
    

In [51]:
batch_size = 64
seq_len = 23
input_dim = 4
n_layers = 1
hidden_dim = 16
inp = torch.randn(batch_size, seq_len, input_dim)
hidden_state = torch.randn(n_layers, batch_size, hidden_dim)
cell_state = torch.randn(n_layers, batch_size, hidden_dim)
hidden = (hidden_state, cell_state)

lstm_layer = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)


In [23]:
out, hidden = lstm_layer(inp, hidden)


In [28]:
print(out.shape)
print(hidden[0].shape)
print(torch.mean(out, 1).shape)

torch.Size([64, 23, 16])
torch.Size([1, 64, 16])
torch.Size([64, 16])


In [59]:
net = RNN_Net()
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)

In [60]:
train_set = GeneDataset(train_data, use_rnn=True)
trainloader = torch.utils.data.DataLoader(train_set, batch_size=64,
                                          shuffle=True, num_workers=2)

In [61]:
for epoch in range(500):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, ele in enumerate(trainloader):
        # get the inputs; data is a list of [inputs, labels]
        seq, eff = ele
#         print(ele)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(seq)
#         print(outputs.shape)
#         print("outputs", outputs, "eff", eff)
        loss = criterion(outputs[:, 0], eff)
#         print("loss", loss)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    print("Epoch {} ,loss: {}".format(epoch, running_loss))
    
print('Finished Training')
# train_time[8] = time.time() - now

Epoch 0 ,loss: 2.514204996638
Epoch 1 ,loss: 2.5159356743097305
Epoch 2 ,loss: 2.4934799736365676
Epoch 3 ,loss: 2.483655429445207
Epoch 4 ,loss: 2.469254986383021
Epoch 5 ,loss: 2.4467031359672546
Epoch 6 ,loss: 2.4049555994570255
Epoch 7 ,loss: 2.3525924663990736
Epoch 8 ,loss: 2.2975352043285966
Epoch 9 ,loss: 2.2633271254599094
Epoch 10 ,loss: 2.2532203029841185
Epoch 11 ,loss: 2.2462245505303144
Epoch 12 ,loss: 2.235874073114246
Epoch 13 ,loss: 2.2284156708046794
Epoch 14 ,loss: 2.214628697372973
Epoch 15 ,loss: 2.207522557582706
Epoch 16 ,loss: 2.2054627994075418
Epoch 17 ,loss: 2.2013190928846598
Epoch 18 ,loss: 2.18560887966305
Epoch 19 ,loss: 2.1852404293604195
Epoch 20 ,loss: 2.1796050556004047
Epoch 21 ,loss: 2.1731291571632028
Epoch 22 ,loss: 2.168731168843806
Epoch 23 ,loss: 2.163150707259774
Epoch 24 ,loss: 2.1564439618960023
Epoch 25 ,loss: 2.148312389384955
Epoch 26 ,loss: 2.137854639440775
Epoch 27 ,loss: 2.131053335033357
Epoch 28 ,loss: 2.1258258083835244
Epoch 29 ,l

Epoch 233 ,loss: 1.3702361327596009
Epoch 234 ,loss: 1.3595450734719634
Epoch 235 ,loss: 1.3573345611803234
Epoch 236 ,loss: 1.367452866397798
Epoch 237 ,loss: 1.3708941009826958
Epoch 238 ,loss: 1.362440211698413
Epoch 239 ,loss: 1.35294488677755
Epoch 240 ,loss: 1.3540351730771363
Epoch 241 ,loss: 1.3600806728936732
Epoch 242 ,loss: 1.3615943198092282
Epoch 243 ,loss: 1.3510226835496724
Epoch 244 ,loss: 1.3513199109584093
Epoch 245 ,loss: 1.3497275495901704
Epoch 246 ,loss: 1.347585535608232
Epoch 247 ,loss: 1.3486528075300157
Epoch 248 ,loss: 1.3488802025094628
Epoch 249 ,loss: 1.3420351026579738
Epoch 250 ,loss: 1.3387743416242301
Epoch 251 ,loss: 1.3439036458730698
Epoch 252 ,loss: 1.3495812439359725
Epoch 253 ,loss: 1.3390296828001738
Epoch 254 ,loss: 1.3507283218204975
Epoch 255 ,loss: 1.3418721375055611
Epoch 256 ,loss: 1.335901897167787
Epoch 257 ,loss: 1.3375207991339266
Epoch 258 ,loss: 1.3407345805317163
Epoch 259 ,loss: 1.330197413917631
Epoch 260 ,loss: 1.3404656858183444

Epoch 463 ,loss: 1.2377893798984587
Epoch 464 ,loss: 1.2420594117138535
Epoch 465 ,loss: 1.2404697756282985
Epoch 466 ,loss: 1.2376735066063702
Epoch 467 ,loss: 1.2424041708000004
Epoch 468 ,loss: 1.2484887572936714
Epoch 469 ,loss: 1.2358752149157226
Epoch 470 ,loss: 1.2465306425001472
Epoch 471 ,loss: 1.23953393753618
Epoch 472 ,loss: 1.2388776787556708
Epoch 473 ,loss: 1.241737405769527
Epoch 474 ,loss: 1.2456393893808126
Epoch 475 ,loss: 1.2455051974393427
Epoch 476 ,loss: 1.2398031037300825
Epoch 477 ,loss: 1.2410291456617415
Epoch 478 ,loss: 1.2395324958488345
Epoch 479 ,loss: 1.2406102470122278
Epoch 480 ,loss: 1.2377249686978757
Epoch 481 ,loss: 1.2376517378725111
Epoch 482 ,loss: 1.2346416041254997
Epoch 483 ,loss: 1.2343202116899192
Epoch 484 ,loss: 1.2339724595658481
Epoch 485 ,loss: 1.2374164503999054
Epoch 486 ,loss: 1.2300389853771776
Epoch 487 ,loss: 1.2345951315946877
Epoch 488 ,loss: 1.2390905641950667
Epoch 489 ,loss: 1.2311498397029936
Epoch 490 ,loss: 1.233441307209

In [43]:
test_set = GeneDataset(test_data, use_rnn=True)
testloader = torch.utils.data.DataLoader(test_set, batch_size=1,
                                          shuffle=True, num_workers=2)

In [48]:
mse = 0
for i, ele in enumerate(testloader):
    # get the inputs; data is a list of [inputs, labels]
    seq, eff = ele

    # forward + backward + optimize
    outputs = net(seq)
    if i % 10 == 0:
        print("outputs", outputs[0], "eff", eff)
    mse += criterion(outputs[0], eff)
mse = mse / len(test_set)
print(mse)


outputs tensor([0.4250], grad_fn=<SelectBackward>) eff tensor([0.4625])
outputs tensor([0.5037], grad_fn=<SelectBackward>) eff tensor([0.4376])
outputs tensor([0.5765], grad_fn=<SelectBackward>) eff tensor([0.6153])
outputs tensor([0.5796], grad_fn=<SelectBackward>) eff tensor([0.6121])
outputs tensor([0.2934], grad_fn=<SelectBackward>) eff tensor([0.2319])
outputs tensor([0.5205], grad_fn=<SelectBackward>) eff tensor([0.5630])
outputs tensor([0.5796], grad_fn=<SelectBackward>) eff tensor([0.5140])
outputs tensor([0.4618], grad_fn=<SelectBackward>) eff tensor([0.4843])
outputs tensor([0.6120], grad_fn=<SelectBackward>) eff tensor([0.6284])
outputs tensor([0.4306], grad_fn=<SelectBackward>) eff tensor([0.4739])
outputs tensor([0.6549], grad_fn=<SelectBackward>) eff tensor([0.5728])
outputs tensor([0.4092], grad_fn=<SelectBackward>) eff tensor([0.3839])
outputs tensor([0.4084], grad_fn=<SelectBackward>) eff tensor([0.4373])
outputs tensor([0.3175], grad_fn=<SelectBackward>) eff tensor([0

outputs tensor([0.1862], grad_fn=<SelectBackward>) eff tensor([0.0127])
outputs tensor([0.6725], grad_fn=<SelectBackward>) eff tensor([0.6679])
outputs tensor([0.5476], grad_fn=<SelectBackward>) eff tensor([0.4851])
outputs tensor([0.4032], grad_fn=<SelectBackward>) eff tensor([0.4302])
outputs tensor([0.6320], grad_fn=<SelectBackward>) eff tensor([0.6480])
outputs tensor([0.5234], grad_fn=<SelectBackward>) eff tensor([0.3951])
outputs tensor([0.5181], grad_fn=<SelectBackward>) eff tensor([0.5296])
outputs tensor([0.5841], grad_fn=<SelectBackward>) eff tensor([0.6425])
outputs tensor([0.4498], grad_fn=<SelectBackward>) eff tensor([0.4452])
outputs tensor([0.5400], grad_fn=<SelectBackward>) eff tensor([0.5736])
outputs tensor([0.6234], grad_fn=<SelectBackward>) eff tensor([0.6283])
outputs tensor([0.1193], grad_fn=<SelectBackward>) eff tensor([0.2149])
outputs tensor([0.6640], grad_fn=<SelectBackward>) eff tensor([0.6067])
outputs tensor([0.6764], grad_fn=<SelectBackward>) eff tensor([0

In [49]:
RMSE = np.sqrt(mse.detach().numpy())
print("RMSE:", RMSE)

RMSE: 0.05673287
