In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
import os
os.chdir("/content/drive/My Drive/ml_proj")

In [None]:
%%shell
cd /content/drive/My Drive/ml_proj

In [None]:
%%shell
unzip data.zip

In [20]:
%%shell
pip install scanpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




In [21]:
import os
import torch
import numpy as np
import scanpy as sc
from copy import deepcopy
import warnings
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

from util import transfer_letter_to_num
from loader import customDataset, DataLoader
from net import CNN, train

warnings.filterwarnings('ignore')

In [22]:
path_train = '/data/train'
path_test = './data/test'
path_cell_type = './data/celltype.txt'
train_path_list = []
test_path_list = []
for idx in ['train', 'test']:
    for item in [500, 1000, 1500]:
        if idx == 'train':
            train_path_list.append('data/train/{}/sequences_train.txt'.format(item))
        else:
            test_path_list.append('data/test/{}/sequences_test.txt'.format(item))

with open(train_path_list[0], 'r') as f:
    train_500 = f.readlines()

with open(test_path_list[0], 'r') as f:
    test_500 = f.readlines()

f_mat_train_500 = sc.read('./data/train/500/matrix_train.mtx')
label_train_500 = np.array(f_mat_train_500.X.todense()).astype(np.int_)

f_mat_test_500 = sc.read('./data/test/500/matrix_test.mtx')
label_test_500 = np.array(f_mat_test_500.X.todense()).astype(np.int_)

mat_train_500 = transfer_letter_to_num(train_500)
mat_test_500 = transfer_letter_to_num(test_500)

In [23]:
class customDataset(Dataset):
    def __init__(self, data, label):
        super().__init__()
        self.data = data
        self.label = label

    def __getitem__(self, item):
        data = torch.FloatTensor(self.data[item].reshape(4, -1))
        label = torch.FloatTensor(self.label[item])
        return data, label

    def __len__(self):
        return len(self.data)

In [24]:
def create_conv_block(in_channels, out_channels, kernel_size, stride):
    return nn.Sequential(
        nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride),
        nn.BatchNorm1d(out_channels),
        nn.GELU(),
    )


class CNN(nn.Module):
    def __init__(self, k):
        super().__init__()
        stride = 1 if k == 500 else 2 if k == 1000 else 3
        self.conv_block1 = create_conv_block(4, 64, 5, stride)  # shape: [n, 64, 496]
        self.conv_block2 = create_conv_block(64, 128, 4, 4)  # shape: [n, 128, 124]
        self.conv_block3 = create_conv_block(128, 256, 5, 4)  # shape: [n, 256, 30]
        self.conv_block4 = create_conv_block(256, 512, 5, 4)  # shape: [n, 512, 7]
        self.linear_block = nn.Sequential(
            nn.Linear(512 * 7, 64),
            nn.Dropout(.2),
            nn.GELU(),
            nn.Linear(64, 2000),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = x.reshape(-1, 512 * 7)
        x = self.linear_block(x)
        return x


def train(net, loader, cri, opt, device, max_epoch):
    net.train()
    loss_list = []
    for epoch in range(max_epoch):
        cur_loss = .0
        for idx, (data, label) in enumerate(loader):
            data = data.to(device)
            label = label.to(device)
            output = net(data)
            loss = cri(output, label)
            opt.zero_grad()
            loss.backward()
            opt.step()
            cur_loss += loss.item()
        cur_loss /= len(loader)
        loss_list.append(cur_loss)
        print('epoch: {}, loss: {:.6f}'.format(epoch+1, cur_loss))
    return loss_list


def cal_m_auc(pred, label):
    res = .0
    for idx in range(label.shape[-1]):
        res += roc_auc_score(label[:, idx], pred[:, idx])
    res /= label.shape[-1]
    return res


def test(net, loader, label, device):
    net.eval()
    res = []
    for idx, (data, label) in enumerate(loader):
        data = data.to(device)
        output = net(data)
        output = output.detach().cpu().numpy()
        res.append(output)
    res = np.array(res)
    return cal_m_auc(res, label)

In [25]:
max_epoch = 20
batch_size = 256
lr = 1e-4
wd = 5e-4
device = 'cuda'


train_dataset = customDataset(mat_train_500, label_train_500)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
net = CNN(k=500).to(device)
cri = nn.BCELoss().to(device)
opt = optim.Adam(net.parameters(), lr=lr, weight_decay=wd)

loss_list = train(net, train_loader, cri, opt, device, max_epoch)

epoch: 1, loss: 0.555477
epoch: 2, loss: 0.335428
epoch: 3, loss: 0.297043
epoch: 4, loss: 0.285619
epoch: 5, loss: 0.275396
epoch: 6, loss: 0.266613
epoch: 7, loss: 0.262623
epoch: 8, loss: 0.260310
epoch: 9, loss: 0.258939
epoch: 10, loss: 0.258204
epoch: 11, loss: 0.257495
epoch: 12, loss: 0.257132
epoch: 13, loss: 0.256872
epoch: 14, loss: 0.256491
epoch: 15, loss: 0.256442
epoch: 16, loss: 0.256482
epoch: 17, loss: 0.256200
epoch: 18, loss: 0.256038
epoch: 19, loss: 0.256072
epoch: 20, loss: 0.256096


In [26]:
def cal_m_auc(pred, label):
    res = .0
    for idx in range(len(label)):
        res += roc_auc_score(label[idx], pred[idx])
    res /= len(label)
    return res


def test(net, loader, label, device):
    net.eval()
    res = []
    for idx, (data, label) in enumerate(loader):
        data = data.to(device)
        output = net(data)
        output = output.detach().cpu().numpy()
        res.append(output)
    res = np.concatenate(res, axis=0)
    return res

In [27]:
pred = test(net, train_loader, label_train_500, device)

In [28]:
cal_m_auc(pred, label_train_500)

0.7052852677756509

In [29]:
test_dataset = customDataset(mat_test_500, label_test_500)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [30]:
pred_test = test(net, test_loader, label_test_500, device)

In [31]:
cal_m_auc(pred_test, label_test_500)

0.708761747191723

In [32]:
net

CNN(
  (conv_block1): Sequential(
    (0): Conv1d(4, 64, kernel_size=(5,), stride=(1,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): GELU(approximate='none')
  )
  (conv_block2): Sequential(
    (0): Conv1d(64, 128, kernel_size=(4,), stride=(4,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): GELU(approximate='none')
  )
  (conv_block3): Sequential(
    (0): Conv1d(128, 256, kernel_size=(5,), stride=(4,))
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): GELU(approximate='none')
  )
  (conv_block4): Sequential(
    (0): Conv1d(256, 512, kernel_size=(5,), stride=(4,))
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): GELU(approximate='none')
  )
  (linear_block): Sequential(
    (0): Linear(in_features=3584, out_features=64, bias=True)
    (1): Dropout(p=0.2, inplace=False)
    (2): GELU

In [33]:
clus_data = net.linear_block[3].weight.cpu().detach().numpy()
clus_data = clus_data.reshape(2000, -1)

In [34]:
np.save('clus_data.npy', clus_data)