In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [2]:
# Read the train and test guide tables; the first CSV column becomes the index.
data_train, data_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/danrer11_chopchop_train.csv',
                     'data/danrer11_chopchop_test.csv')
)

# Transform Data

## One-hot encoding

In [3]:
# Nucleotide -> one-hot vector lookup. Row k of the 4x4 identity matrix is the
# code of the k-th base in the order A, C, G, T; each entry is its own array.
encoding = {
    base: unit_row.copy()
    for base, unit_row in zip('ACGT', np.eye(4, dtype=int))
}

def one_hot(guide,encoding):
    """One-hot encode a single guide sequence.

    Parameters
    ----------
    guide : str
        Sequence of exactly 23 characters, each a key of ``encoding``.
    encoding : dict
        Maps a character to its length-4 one-hot vector.

    Returns
    -------
    numpy.ndarray
        Float array of shape (4, 23); column ``p`` is the code of ``guide[p]``.

    Raises
    ------
    AssertionError
        If the guide is not 23 characters long.
    KeyError
        If a character has no entry in ``encoding``.
    """
    encoded = np.zeros((4, len(guide)))
    # Guides are expected to be exactly 23 positions long.
    assert encoded.shape == (4,23)
    for position, base in enumerate(guide):
        encoded[:, position] = encoding[base]
    return encoded

#print(one_hot('CTGATCACGGCTGAAGGACTCGG',encoding))

def batch_one_hot(data,encoding):
    """One-hot encode every sequence in ``data['GUIDE']``.

    Parameters
    ----------
    data : table-like
        Object supporting ``len(data)`` and ``data['GUIDE']``; the latter is
        iterated to obtain the guide sequences.
    encoding : dict
        Character -> one-hot vector mapping forwarded to ``one_hot``.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(data), 4, 23); slice ``k`` holds the encoding
        of the k-th guide.
    """
    encoded_guides = np.zeros((len(data), 4, 23))
    for row, sequence in enumerate(data['GUIDE']):
        encoded_guides[row] = one_hot(sequence, encoding)
    return encoded_guides

# Encode both splits (train first, then test) and report the resulting shapes.
guides_train, guides_test = (
    batch_one_hot(frame, encoding) for frame in (data_train, data_test)
)
print('Train dataset size:', guides_train.shape)
print('Test dataset size:', guides_test.shape)

Train dataset size: (226420, 4, 23)
Test dataset size: (56606, 4, 23)


## PyTorch data format

In [65]:
class GGEDataset(Dataset):
    """Map-style dataset that serves items of an indexable collection as float tensors.

    Parameters
    ----------
    data : sized, indexable collection
        ``data[idx]`` is the raw sample returned for index ``idx``.
    transform : callable, optional
        Applied to each raw sample before the final float conversion.
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        """Number of samples, i.e. ``len(data)``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``torch.float32`` tensor.

        ``idx`` may be a plain index or a tensor holding one; tensors are
        converted with ``tolist()`` before indexing ``data``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = self.data[idx]
        if self.transform:
            sample = self.transform(sample)

        # With transform=None (the default) the sample is still whatever
        # ``data`` holds, e.g. a NumPy array, which has no ``.float()`` method.
        # Convert anything that is not a tensor yet so the default works.
        if not torch.is_tensor(sample):
            sample = torch.as_tensor(sample)

        return sample.float()
    
# Convert each encoded guide to a tensor before the dataset casts it to float32.
transform = transforms.Compose([
        transforms.ToTensor()
    ])
GGE_dataset_train = GGEDataset(data=guides_train, transform=transform)
GGE_dataset_test = GGEDataset(data=guides_test, transform=transform)

# Training batches are reshuffled every epoch. The test loader keeps a fixed
# order: shuffling does not change the aggregate accuracy and only makes
# evaluation batches non-reproducible.
trainloader = DataLoader(GGE_dataset_train, batch_size=50000, shuffle=True, num_workers=12)
testloader = DataLoader(GGE_dataset_test, batch_size=50000, shuffle=False, num_workers=12)

### Inspect dataset

In [66]:
# Pull one batch from the (shuffled) training loader and show it as the cell output.
next(iter(trainloader))

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          [0., 1., 1.,  ..., 0., 1., 1.],
          [1., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 1., 1.],
          [1., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 1., 1.],
          [1., 1., 1.,  ..., 1., 0., 0.]]],


        ...,


        [[[0., 0., 1.,  ..., 0., 0., 0.],
          [1., 0., 0.,  ..., 0., 0., 0.],
          [0., 1., 0.,  ..., 0., 1., 1.],
          [0., 0., 0.,  ..., 1., 0., 0.]]],


        [[[1., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 1., 0., 0.],
          [0., 1., 0.,  ..., 0., 1., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [1., 0., 1.,  ..., 0., 1., 1.],

# Create Autoencoder

In [67]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder for one-hot encoded guides.

    ``forward`` flattens each sample to 92 features (1*4*23), compresses it to
    a 30-dimensional code (92 -> 60 -> 40 -> 30) and expands it back
    (30 -> 40 -> 60 -> 92). A final sigmoid keeps every output between 0 and 1.
    """

    def __init__(self):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(92, 60),
            nn.ReLU(),
            nn.Linear(60, 40),
            nn.ReLU(),
            nn.Linear(40, 30)
        )
        self.decoder = nn.Sequential(
            nn.Linear(30, 40),
            nn.ReLU(),
            nn.Linear(40, 60),
            nn.ReLU(),
            nn.Linear(60, 92),
            nn.Sigmoid()
        )

        # NOTE: the four layers below are never called in forward(). They are
        # kept because conv1/conv2 contribute parameters to state_dict();
        # dropping them would change the keys of saved checkpoints.
        self.conv1 = nn.Conv2d(1, 6, 2)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.ConvTranspose2d(6, 1, 2)
        self.pool2 = nn.MaxUnpool2d(2, 2)

    def forward(self, x):
        """Reconstruct a batch of guides.

        Parameters
        ----------
        x : torch.Tensor
            Tensor whose elements regroup into rows of 92 values, e.g. shape
            (batch, 1, 4, 23).

        Returns
        -------
        torch.Tensor
            Reconstruction of shape (batch, 1, 4, 23).
        """
        # reshape (not view) so non-contiguous inputs, e.g. permuted tensors,
        # are accepted instead of raising a RuntimeError.
        x = x.reshape(-1, 1*4*23)
        x = self.encoder(x)
        x = self.decoder(x)
        x = x.view(-1, 1, 4, 23)

        return x

In [68]:
# Smoke test with a freshly initialised (untrained) model: the reconstruction
# of one training batch should have the same shape as the batch itself.
net_t = Autoencoder()
data_i = next(iter(trainloader))
net_t(data_i).shape

torch.Size([50000, 1, 4, 23])

In [69]:
# Shape of the input batch, for comparison with the reconstruction's shape.
data_i.shape

torch.Size([50000, 1, 4, 23])

## Train the model

In [70]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [71]:
# Build the model that will be trained, then move its parameters to `device`.
net = Autoencoder()
net = net.to(device)

In [78]:
# Reconstruction objective: mean squared error between input and output.
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=1, momentum=0.5)

for epoch in range(200):
    # Accumulates the per-batch loss values over the epoch (a sum, not a mean).
    running_loss = 0.0
    for batch in trainloader:
        batch = batch.to(device)

        optimizer.zero_grad()
        # The autoencoder is trained to reproduce its own input.
        loss = criterion(net(batch), batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report after every tenth epoch (10, 20, ..., 200).
    if (epoch + 1) % 10 == 0:
        print('Epoch:{}, loss:{}'.format(epoch+1,running_loss))

Epoch:10, loss:0.22641125693917274
Epoch:20, loss:0.2118239402770996
Epoch:30, loss:0.2050689198076725
Epoch:40, loss:0.2001567967236042
Epoch:50, loss:0.19618016853928566
Epoch:60, loss:0.19258033484220505
Epoch:70, loss:0.1893480084836483
Epoch:80, loss:0.18626950681209564
Epoch:90, loss:0.18349628522992134
Epoch:100, loss:0.180872593075037
Epoch:110, loss:0.17834994196891785
Epoch:120, loss:0.17595647275447845
Epoch:130, loss:0.17370330542325974
Epoch:140, loss:0.17145054414868355
Epoch:150, loss:0.16938578337430954
Epoch:160, loss:0.167251817882061
Epoch:170, loss:0.1651107221841812
Epoch:180, loss:0.1631321795284748
Epoch:190, loss:0.16110064461827278
Epoch:200, loss:0.15907832980155945


## Inspect Output

In [14]:
# Compare one raw guide with its reconstruction after discretising the output.
data_i = next(iter(trainloader))
print('RAW Data:\n',data_i.int()[0,0,:,:])
with torch.no_grad():
    output_i = net(data_i.to(device)).to('cpu')
    # The model output has shape (batch, 1, 4, 23). Pick the highest-scoring
    # base at every position (argmax over the base axis, dim 2) and turn it
    # back into a one-hot column. One vectorised call replaces the former
    # per-sample Python loop over the whole batch.
    best_base = output_i.argmax(dim=2)
    output_i = F.one_hot(best_base, num_classes=4).transpose(2, 3).to(output_i.dtype)
    print('\nReconstruct Data:\n',output_i.int()[0,0,:,:])

RAW Data:
 tensor([[0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
        [1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]],
       dtype=torch.int32)

Reconstruct Data:
 tensor([[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1],
        [0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       dtype=torch.int32)


## Evaluation

In [80]:
# Per-position reconstruction accuracy on the training set.
with torch.no_grad():
    correct = 0
    for data in trainloader:
        outputs = net(data.to(device)).to('cpu')
        # Batches have shape (batch, 1, 4, 23). A position is reconstructed
        # correctly when the highest-scoring base of the output (argmax over
        # the base axis, dim 2) is the base set in the one-hot input. This is
        # the same test as comparing discretised one-hot columns, without the
        # nested Python loops over samples and positions.
        predicted_bases = outputs.argmax(dim=2)
        true_bases = data.argmax(dim=2)
        correct += (predicted_bases == true_bases).sum().item()
    # Fraction of correct positions: every sample contributes 23 positions.
    accuracy = correct/len(trainloader.dataset)/23
    print('Training Accuracy: {}%'.format(accuracy*100))

Training Accuracy: 91.41553019974423%


In [79]:
# Per-position reconstruction accuracy on the held-out test set.
with torch.no_grad():
    correct = 0
    for data in testloader:
        outputs = net(data.to(device)).to('cpu')
        # Batches have shape (batch, 1, 4, 23). A position is reconstructed
        # correctly when the highest-scoring base of the output (argmax over
        # the base axis, dim 2) is the base set in the one-hot input. This is
        # the same test as comparing discretised one-hot columns, without the
        # nested Python loops over samples and positions.
        predicted_bases = outputs.argmax(dim=2)
        true_bases = data.argmax(dim=2)
        correct += (predicted_bases == true_bases).sum().item()
    # Fraction of correct positions: every sample contributes 23 positions.
    accuracy = correct/len(testloader.dataset)/23
    print('Testing Accuracy: {}%'.format(accuracy*100))

Testing Accuracy: 91.33845083252812%


In [81]:
# Persist the trained weights, wrapped in a dict under the 'state_dict' key.
# NOTE(review): assumes the `model/` directory already exists - confirm.
checkpoint = {'state_dict': net.state_dict()}
torch.save(checkpoint, 'model/autoencoder.pth.tar')