In [18]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [19]:
# Read the train and test guide tables; the first CSV column is used as the
# DataFrame index in both cases.
data_train = pd.read_csv(
    'data/danrer11_chopchop_train.csv',
    index_col=0,
)
data_test = pd.read_csv(
    'data/danrer11_chopchop_test.csv',
    index_col=0,
)

# Transform Data
## One-hot encoding

In [20]:
# Nucleotide -> length-4 one-hot vector, with channels ordered A, C, G, T.
encoding = dict(
    A=np.array([1, 0, 0, 0]),
    C=np.array([0, 1, 0, 0]),
    G=np.array([0, 0, 1, 0]),
    T=np.array([0, 0, 0, 1]),
)

def one_hot(guide,encoding):
    """One-hot encode a single guide sequence.

    Parameters
    ----------
    guide : sequence of str
        The guide, one symbol per position. It must contain exactly 23
        symbols, each of which is a key of ``encoding``.
    encoding : mapping
        Maps each symbol to the length-4 vector written into its column.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(4, 23)``; column ``k`` holds
        ``encoding[guide[k]]``.

    Raises
    ------
    AssertionError
        If the guide does not have 23 positions.
    KeyError
        If a symbol of the guide is missing from ``encoding``.
    """
    matrix = np.zeros((4, len(guide)))
    # Only 23-position guides are supported by the downstream model.
    assert matrix.shape == (4,23)
    for position, base in enumerate(guide):
        matrix[:, position] = encoding[base]
    return matrix

#print(one_hot('CTGATCACGGCTGAAGGACTCGG',encoding))

def batch_one_hot(data,encoding):
    """One-hot encode every guide in the ``'GUIDE'`` column of ``data``.

    Parameters
    ----------
    data : table-like
        Object supporting ``len(data)`` and ``data['GUIDE']``; iterating the
        column yields one guide per row.
    encoding : mapping
        Symbol -> length-4 vector mapping, forwarded to ``one_hot``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), 4, 23)``; entry ``k`` is the
        encoding of the ``k``-th guide in iteration order.
    """
    encoded = np.zeros((len(data), 4, 23))
    for row, guide in enumerate(data['GUIDE']):
        encoded[row] = one_hot(guide, encoding)
    return encoded

# Encode both splits with the same nucleotide encoding, then report the
# resulting array shapes.
guides_train, guides_test = (
    batch_one_hot(split, encoding) for split in (data_train, data_test)
)
print('Train dataset size:', guides_train.shape)
print('Test dataset size:', guides_test.shape)

Train dataset size: (226420, 4, 23)
Test dataset size: (56606, 4, 23)


## `PyTorch` data format

In [29]:
class GGEDataset(Dataset):
    """Dataset wrapper that serves individual samples as float32 tensors.

    Parameters
    ----------
    data : indexable
        Container supporting ``len(data)`` and ``data[idx]`` (for example a
        NumPy array whose first axis enumerates samples).
    transform : callable, optional
        Applied to each raw sample before it is returned. When omitted, the
        raw sample is converted to a tensor directly.
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of samples in the wrapped container."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``torch.float32`` tensor.

        ``idx`` may be a plain index or a tensor index; tensor indices are
        converted with ``tolist()`` before indexing the container.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = self.data[idx]
        if self.transform:
            sample = self.transform(sample)

        # Without a transform the sample is still a raw array, which has no
        # ``.float()`` method; coerce to a tensor first. ``as_tensor`` returns
        # an existing tensor unchanged, so transformed samples are unaffected.
        return torch.as_tensor(sample).float()
    
# ToTensor converts each encoded guide array into a tensor before the dataset
# casts it to float32.
transform = transforms.Compose([
        transforms.ToTensor()
    ])
GGE_dataset_train = GGEDataset(data = guides_train, transform = transform)
GGE_dataset_test = GGEDataset(data = guides_test, transform = transform)

trainloader = torch.utils.data.DataLoader(GGE_dataset_train, batch_size=50000,shuffle=True, num_workers=12)
# The evaluation loader is not shuffled: batches then follow the row order of
# guides_test, so results are reproducible across runs and any batch taken
# from this loader can be matched back to its source rows.
testloader = torch.utils.data.DataLoader(GGE_dataset_test, batch_size=50000,shuffle=False, num_workers=12)

# Create Autoencoder

In [30]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder over flattened one-hot guides.

    The encoder maps 92 inputs (1 x 4 x 23 flattened) to a 30-dimensional
    code through layers of width 60 and 40; the decoder mirrors it and ends
    with a sigmoid.
    """

    def __init__(self):
        super().__init__()

        # 92 -> 60 -> 40 -> 30, ReLU between layers, no activation on the code.
        encoder_layers = [
            nn.Linear(92, 60),
            nn.ReLU(),
            nn.Linear(60, 40),
            nn.ReLU(),
            nn.Linear(40, 30),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # 30 -> 40 -> 60 -> 92, with a final sigmoid on the reconstruction.
        decoder_layers = [
            nn.Linear(30, 40),
            nn.ReLU(),
            nn.Linear(40, 60),
            nn.ReLU(),
            nn.Linear(60, 92),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

        # These layers are not used by forward(). They are registered
        # submodules, so conv1/conv2 contribute keys to state_dict();
        # NOTE(review): removing them may break loading of checkpoints that
        # were saved with them — verify before deleting.
        self.conv1 = nn.Conv2d(1, 6, 2)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.ConvTranspose2d(6, 1, 2)
        self.pool2 = nn.MaxUnpool2d(2, 2)

    def forward(self, x):
        """Encode then decode ``x``; returns a tensor viewed as (-1, 1, 4, 23)."""
        flat = x.view(-1, 1*4*23)
        code = self.encoder(flat)
        reconstruction = self.decoder(code)
        return reconstruction.view(-1, 1, 4, 23)

In [31]:
# Use the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [32]:
# Build the autoencoder, then move its parameters to the selected device.
net = Autoencoder()
net = net.to(device)

## Load model

In [33]:
# NOTE: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU still loads when the notebook falls back to the CPU.
checkpoint = torch.load('model/autoencoder.pth.tar', map_location=device)
net.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

# Testing Data

In [34]:
# Per-position reconstruction accuracy on the test set: a position counts as
# correct when the strongest reconstructed channel matches the input's
# channel. Inputs are exactly one-hot per column (see one_hot), so comparing
# argmax indices is equivalent to comparing the one-hot columns themselves.
net.eval()
with torch.no_grad():
    correct = 0
    for data in testloader:
        outputs = net(data.to(device)).to('cpu')
        # Batches are (batch, 1, 4, 23); dim 2 indexes the four channels.
        predicted = outputs.argmax(dim=2)
        actual = data.argmax(dim=2)
        correct += (predicted == actual).sum().item()
    # Normalise by the number of guides and by the 23 positions per guide.
    accuracy = correct/len(testloader.dataset)/23
    print('Testing Accuracy: {}%'.format(accuracy*100))

Testing Accuracy: 91.33845083252812%


# Extract features
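The encoder compresses each guide from 92 dimensions (the flattened 4 × 23 one-hot matrix) to 30 dimensions; the leading `50000` in the output shape is the batch size.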

In [43]:
# Run only the encoder on the first batch served by the test loader to get
# the 30-dimensional codes, then bring them back to the CPU for inspection.
with torch.no_grad():
    for data in testloader:
        flat_batch = data.view(-1,1*4*23)
        outputs = net.encoder(flat_batch.to(device)).to('cpu')
        break
print("Data shape: {}".format(outputs.shape))
outputs

Data shape: torch.Size([50000, 30])


tensor([[-0.4642,  8.1225,  1.1612,  ..., -1.5633, -9.9387,  3.5303],
        [ 4.1952,  6.7439, -9.7511,  ...,  8.5822,  2.8152,  7.8830],
        [10.4661,  3.6456,  2.6252,  ...,  1.0471, -8.0809,  2.0523],
        ...,
        [ 6.9700, -1.1418, -5.6706,  ...,  0.5899, 10.4342,  1.7868],
        [ 1.5521, -1.3733, -8.8301,  ...,  6.7905,  1.7564,  5.2488],
        [ 2.3361,  1.9210, -3.6909,  ..., -1.2125, -8.0576,  1.9304]])