In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir("/content/drive/My Drive/ml_proj")

In [None]:
%%shell
cd /content/drive/My Drive/ml_proj

In [None]:
%%shell
unzip data.zip

In [3]:
%%shell
pip install scanpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scanpy
  Downloading scanpy-1.9.1-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib>=3.4
  Downloading matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m
Collecting umap-learn>=0.3.10
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting session-info
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting anndata>=0.7.4
  Downloading anndata-0.8.0-py3-none-any.whl (96 kB)
[2K    



In [4]:
import os
import numpy as np
import scanpy as sc
from copy import deepcopy
import umap.umap_ as umap
import warnings
from torch import nn, optim
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader
from sklearn.metrics import adjusted_mutual_info_score, normalized_mutual_info_score
warnings.filterwarnings('ignore')

In [5]:
import torch
import numpy as np
from torch import nn


def create_conv_block(in_channels, out_channels, kernel_size, stride):
    return nn.Sequential(
        nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride),
        nn.BatchNorm1d(out_channels),
        nn.GELU(),
    )


class CNN(nn.Module):
    def __init__(self, k):
        super().__init__()
        stride = 1 if k == 500 else 2 if k == 1000 else 3
        self.conv_block1 = create_conv_block(4, 64, 1, stride)  # shape: [n, 64, 500]
        self.conv_block2 = create_conv_block(64, 128, 7, 4)  # shape: [n, 128, 124]
        self.conv_block3 = create_conv_block(128, 256, 5, 4)  # shape: [n, 256, 30]
        self.conv_block4 = create_conv_block(256, 512, 5, 4)  # shape: [n, 512, 7]
        self.linear_block = nn.Sequential(
            nn.Linear(512 * 7, 64),
            nn.Dropout(.2),
            nn.GELU(),
            nn.Linear(64, 2000),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = x.reshape(-1, 512 * 7)
        x = self.linear_block(x)
        return x


class AlternativeCNN(nn.Module):
    def __init__(self, k):
        super().__init__()
        stride = 1 if k == 500 else 2 if k == 1000 else 3
        self.conv_block1 = nn.Sequential(
            nn.Conv1d(4, 64, (1, ), (stride, )),  # shape: [n, 64, 500]
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.MaxPool1d(2, 2),  # shape: [n, 64, 250]
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(64, 256, (3, ), (4, )),  # shape: [n, 256, 62]
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.MaxPool1d(2, 2),  # shape: [n, 256, 31]
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv1d(256, 512, (5, ), (2, )),  # shape: [n, 512, 14]
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 512, 7]
        )
        self.linear_block = nn.Sequential(
            nn.Linear(512 * 7, 64),
            nn.BatchNorm1d(64),
            nn.Dropout(.2),
            nn.GELU(),
            nn.Linear(64, 2000),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = x.reshape(-1, 512 * 7)
        x = self.linear_block(x)
        return x


class AlternativeCNN1(nn.Module):
    def __init__(self, k):
        super().__init__()
        stride = 1 if k == 500 else 2 if k == 1000 else 3
        self.conv_block1 = nn.Sequential(
            nn.Conv1d(4, 64, (1, ), (stride, )),  # shape: [n, 64, 500]
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.MaxPool1d(2, 2),  # shape: [n, 64, 250]
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(64, 256, (3, ), (4, )),  # shape: [n, 256, 62]
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.MaxPool1d(2, 2),  # shape: [n, 256, 31]
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv1d(256, 64, (5, ), (4, )),  # shape: [n, 64, 7]
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.MaxPool1d(7)  # shape: [n, 64, 1]
        )
        self.linear_block = nn.Sequential(
            nn.Linear(64, 2000),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = x.reshape(-1, 64)
        x = self.linear_block(x)
        return x


class AlternativeCNN2(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(4, 64, kernel_size=(9, ), stride=(3, ))  # shape: [n, 64, 164]
        self.b = nn.BatchNorm1d(64)
        self.g = nn.GELU()
        self.p = nn.MaxPool1d(2, 2)  # shape: [n, 64, 82]
        self.conv_block1 = nn.Sequential(
            nn.Conv1d(64, 128, kernel_size=(3, ), stride=(2, )),  # shape: [n, 128, 40]
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 128, 20]
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(128, 256, kernel_size=(5, ), stride=(2, )),  # shape: [n, 256, 8]
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 256, 4]
        )
        self.linear_block = nn.Sequential(
            nn.Linear(256 * 4, 64),
            nn.BatchNorm1d(64),
            nn.Dropout(.2),
            nn.Linear(64, 2000),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.p(self.g(self.b(self.conv1(x))))
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = x.reshape(-1, 256 * 4)
        x = self.linear_block(x)
        return x

    def get_feature_map(self, x):
        return self.conv1(x)


def train(net, loader, cri, opt, device, max_epoch):
    net.train()
    loss_list = []
    for epoch in range(max_epoch):
        cur_loss = .0
        for idx, (data, label) in enumerate(loader):
            data = data.to(device)
            label = label.to(device)
            output = net(data)
            loss = cri(output, label)
            opt.zero_grad()
            loss.backward()
            opt.step()
            cur_loss += loss.item()
        cur_loss /= len(loader)
        loss_list.append(cur_loss)
        print('epoch: {}, loss: {:.6f}'.format(epoch+1, cur_loss))
    return loss_list


def test(net, loader, device):
    net.eval()
    res = []
    for idx, (data, label) in enumerate(loader):
        data = data.to(device)
        output = net(data)
        output = output.detach().cpu().numpy()
        res.append(output)
    res = np.concatenate(res, axis=0)
    return res


def get_conv_map(net, loader, device):
    net.eval()
    f_map_list = []
    for (data, label) in loader:
        net = net.eval()
        data = data.to(device)
        f_map = net.get_feature_map(data).detach().cpu().numpy()
        f_map_list.append(f_map)
    f_map_list = np.concatenate(f_map_list)
    return f_map_list


In [6]:
import torch
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    def __init__(self, data, label):
        super().__init__()
        self.data = data
        self.label = label

    def __getitem__(self, item):
        data = torch.FloatTensor(self.data[item].reshape(4, -1))
        label = torch.FloatTensor(self.label[item])
        return data, label

    def __len__(self):
        return len(self.data)


In [7]:
import numpy as np
from copy import deepcopy
from sklearn.metrics import roc_auc_score


def transfer_letter_to_num(data):
    res = []
    res_int = []
    for seq in data:
        tmp = deepcopy(seq)
        tmp = tmp.replace('A', '0')
        tmp = tmp.replace('G', '1')
        tmp = tmp.replace('C', '2')
        tmp = tmp.replace('T', '3')
        tmp = np.array([int(s) for s in list(tmp)[:-1]])
        res_int.append(tmp.reshape(1, -1))
        one_hot = np.zeros(shape=(len(tmp), 4))
        one_hot[np.arange(len(tmp)), tmp] = 1
        res.append(one_hot.reshape((1, -1, 4)))
    res = np.concatenate(res)
    res_int = np.concatenate(res_int)
    return res, res_int


def clean_str(data):
    res = []
    for seq in data:
        seq = seq.replace('\n', '')
        res.append(seq)
    return res


def cal_auc(pred, label):
    res = .0
    res_list = []
    for idx in range(len(label)):
        tmp = roc_auc_score(label[idx], pred[idx])
        res += tmp
        res_list.append(tmp)
    res /= len(label)
    return res_list, res


In [23]:
class AlternativeCNN3(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(4, 32, kernel_size=(9,), stride=(1, ))  # shape: [n, 32, 492]
        self.b = nn.BatchNorm1d(32)
        self.g = nn.GELU()
        self.p = nn.MaxPool1d(3, 3)  # shape: [n, 32, 164]
        self.conv_block1 = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=(5,), stride=(1,)),  # shape: [n, 64, 160]
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 64, 80]
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(64, 128, kernel_size=(7,), stride=(1,)),  # shape: [n, 128, 64]
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 128, 32]
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv1d(128, 256, kernel_size=(5,), stride=(1,)),  # shape: [n, 256, 28]
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 256, 14]
        )
        self.conv_block4 = nn.Sequential(
            nn.Conv1d(256, 512, kernel_size=(5,), stride=(1,)),  # shape: [n, 512, 10]
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 256, 5]
        )
        self.conv_block4 = nn.Sequential(
            nn.Conv1d(256, 512, kernel_size=(5,), stride=(1,)),  # shape: [n, 512, 10]
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 256, 5]
        )
        self.conv_block5 = nn.Sequential(
            nn.Conv1d(512, 1024, kernel_size=(3,), stride=(1,)),  # shape: [n, 512, 3]
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.MaxPool1d(3, 3)  # shape: [n, 512, 1]
        )
        self.linear_block = nn.Sequential(
            nn.Linear(1024, 64),
            nn.BatchNorm1d(64),
            nn.Dropout(.2),
            nn.Linear(64, 2000, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.p(self.g(self.b(self.conv1(x))))
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = self.conv_block5(x)
        x = x.reshape(-1, 1024)
        x = self.linear_block(x)
        return x
    
    def get_feature_map(self, x):
        return self.conv1(x)

In [22]:
class AlternativeCNN4(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(4, 288, kernel_size=(13,), stride=(1, ))  # shape: [n, 288, 488]
        self.b = nn.BatchNorm1d(288)
        self.g = nn.GELU()
        self.p = nn.MaxPool1d(2, 2)  # shape: [n, 288, 244]
        self.conv_block1 = nn.Sequential(
            nn.Conv1d(288, 323, kernel_size=(9,), stride=(1,)),  # shape: [n, 323, 236]
            nn.BatchNorm1d(323),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 323, 118]
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(323, 363, kernel_size=(7,), stride=(1,)),  # shape: [n, 363, 112]
            nn.BatchNorm1d(363),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 363, 56]
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv1d(363, 407, kernel_size=(5,), stride=(1,), padding=2),  # shape: [n, 407, 56]
            nn.BatchNorm1d(407),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 407, 28]
        )
        self.conv_block4 = nn.Sequential(
            nn.Conv1d(407, 456, kernel_size=(5,), stride=(1,), padding=2),  # shape: [n, 456, 28]
            nn.BatchNorm1d(456),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 456, 14]
        )
        self.conv_block5 = nn.Sequential(
            nn.Conv1d(456, 512, kernel_size=(5,), stride=(1,), padding=2),  # shape: [n, 512, 14]
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.MaxPool1d(2, 2)  # shape: [n, 512, 7]
        )
        self.conv_block6 = nn.Sequential(
            nn.Conv1d(512, 256, kernel_size=(5, ), stride=(1, ), padding=2),  # shape: [n, 256, 7]
            nn.BatchNorm1d(256),
            nn.GELU(),
        )
        self.linear_block = nn.Sequential(
            nn.Linear(256 * 7, 32),
            nn.BatchNorm1d(32),
            nn.Dropout(.2),
            nn.GELU(),
            nn.Linear(32, 2000, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.p(self.g(self.b(self.conv1(x))))
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = self.conv_block5(x)
        x = self.conv_block6(x)
        x = x.reshape(-1, 256 * 7)
        x = self.linear_block(x)
        return x

    def get_feature_map(self, x):
        return self.conv1(x)


In [8]:
path_train = '/data/train'
path_test = './data/test'
path_cell_type = './data/celltype.txt'
train_path_list = []
test_path_list = []
for idx in ['train', 'test']:
    for item in [500, 1000, 1500]:
        if idx == 'train':
            train_path_list.append('data/train/{}/sequences_train.txt'.format(item))
        else:
            test_path_list.append('data/test/{}/sequences_test.txt'.format(item))

with open(train_path_list[0], 'r') as f:
    train_500 = f.readlines()

with open(test_path_list[0], 'r') as f:
    test_500 = f.readlines()

with open(path_cell_type, 'r') as f:
    cell = f.readlines()


f_mat_train_500 = sc.read('./data/train/500/matrix_train.mtx')
label_train_500 = np.array(f_mat_train_500.X.todense()).astype(np.int_)

f_mat_test_500 = sc.read('./data/test/500/matrix_test.mtx')
label_test_500 = np.array(f_mat_test_500.X.todense()).astype(np.int_)

mat_train_500, int_train_500 = transfer_letter_to_num(train_500)
mat_test_500, int_test_500 = transfer_letter_to_num(test_500)
str_test_500 = clean_str(test_500)

In [27]:
max_epoch = 40
batch_size = 256
lr = 1e-3
wd = 5e-4
device = 'cuda'

train_dataset = CustomDataset(mat_train_500, label_train_500)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = CustomDataset(mat_test_500, label_test_500)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
net = AlternativeCNN3().to(device)
cri = nn.BCELoss().to(device)
opt = optim.Adam(net.parameters(), lr=lr, weight_decay=wd)

loss_list = train(net, train_loader, cri, opt, device, max_epoch)

epoch: 1, loss: 0.637546
epoch: 2, loss: 0.381163
epoch: 3, loss: 0.323984
epoch: 4, loss: 0.305918
epoch: 5, loss: 0.296620
epoch: 6, loss: 0.291163
epoch: 7, loss: 0.287222
epoch: 8, loss: 0.284804
epoch: 9, loss: 0.283010
epoch: 10, loss: 0.281871
epoch: 11, loss: 0.280689
epoch: 12, loss: 0.279829
epoch: 13, loss: 0.279088
epoch: 14, loss: 0.278568
epoch: 15, loss: 0.278162
epoch: 16, loss: 0.277829
epoch: 17, loss: 0.277467
epoch: 18, loss: 0.277153
epoch: 19, loss: 0.276745
epoch: 20, loss: 0.276664
epoch: 21, loss: 0.276313
epoch: 22, loss: 0.276359
epoch: 23, loss: 0.275938
epoch: 24, loss: 0.275812
epoch: 25, loss: 0.275989
epoch: 26, loss: 0.275742
epoch: 27, loss: 0.275897
epoch: 28, loss: 0.275690
epoch: 29, loss: 0.275579
epoch: 30, loss: 0.275768
epoch: 31, loss: 0.275592
epoch: 32, loss: 0.275419
epoch: 33, loss: 0.275307
epoch: 34, loss: 0.275528
epoch: 35, loss: 0.275509
epoch: 36, loss: 0.275463
epoch: 37, loss: 0.275327
epoch: 38, loss: 0.275497
epoch: 39, loss: 0.27

In [28]:
pred = test(net, test_loader, device)
auc_list, auc = cal_auc(pred, label_test_500)

In [29]:
auc

0.7054471075712039

In [30]:
net

AlternativeCNN3(
  (conv1): Conv1d(4, 32, kernel_size=(9,), stride=(1,))
  (b): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (g): GELU(approximate='none')
  (p): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv_block1): Sequential(
    (0): Conv1d(32, 64, kernel_size=(5,), stride=(1,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): GELU(approximate='none')
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block2): Sequential(
    (0): Conv1d(64, 128, kernel_size=(7,), stride=(1,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): GELU(approximate='none')
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block3): Sequential(
    (0): Conv1d(128, 256, kernel_size=(5,), stride=(1,))
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.

In [31]:
auc_array = np.array(auc_list)
np.save('./auc_array_500.npy', auc_array)

In [32]:
f_map = get_conv_map(net, test_loader, device)

In [49]:
def get_motif(f_map, ratio, data_str, data_int, data_mat, k_size, stride):
    res_str, res_int = [], []
    res_mat = np.zeros(shape=(k_size, 4))
    for chan in range(f_map.shape[1]):
        f_map_chan = f_map[chan]
        thr = ratio * np.max(f_map_chan)
        index = np.where(f_map_chan > thr)
        batch_index, seq_index = index
        seq_index = seq_index * stride
        for idx, batch in enumerate(batch_index):
            seq_idx = seq_index[idx]
            res_str.append(data_str[batch][seq_idx: seq_idx+k_size])
            res_int.append(data_int[batch][seq_idx: seq_idx+k_size])
            res_mat += data_mat[batch][seq_idx: seq_idx+k_size]
    return res_str, res_int, res_mat.transpose(1, 0)

In [50]:
def get_motif_chan(f_map, ratio, chan, data_str, data_int, data_mat, k_size, stride):
    res_str, res_int = [], []
    res_mat = np.zeros(shape=(k_size, 4))
    f_map_chan = f_map[chan]
    thr = ratio * np.max(f_map_chan)
    index = np.where(f_map_chan > thr)
    batch_index, seq_index = index
    seq_index = seq_index * stride
    for idx, batch in enumerate(batch_index):
        seq_idx = seq_index[idx]
        res_str.append(data_str[batch][seq_idx: seq_idx+k_size])
        res_int.append(data_int[batch][seq_idx: seq_idx+k_size])
        res_mat += data_mat[batch][seq_idx: seq_idx+k_size]
    return res_str, res_int, res_mat.transpose(1, 0)

In [57]:
res_str, res_int, res_mat = get_motif(f_map, .98, str_test_500, int_test_500, mat_test_500, 9, 1)

In [58]:
res_mat

array([[11.,  8.,  9.,  7., 13., 12., 11., 11., 10.],
       [13., 20., 12., 10.,  9., 15., 20., 16., 15.],
       [12., 11., 16., 25., 15., 14., 14., 14., 11.],
       [14., 11., 13.,  8., 13.,  9.,  5.,  9., 14.]])

In [59]:
res_mat_ratio = res_mat / np.sum(res_mat, axis=0)

In [60]:
res_mat_ratio

array([[0.22, 0.16, 0.18, 0.14, 0.26, 0.24, 0.22, 0.22, 0.2 ],
       [0.26, 0.4 , 0.24, 0.2 , 0.18, 0.3 , 0.4 , 0.32, 0.3 ],
       [0.24, 0.22, 0.32, 0.5 , 0.3 , 0.28, 0.28, 0.28, 0.22],
       [0.28, 0.22, 0.26, 0.16, 0.26, 0.18, 0.1 , 0.18, 0.28]])

In [76]:
res_str_set = set(res_str)
for idx, item in enumerate(res_str_set):
    if idx % 4 == 3:
        print(item)
    else:
        print(item, end=' ')

CAAGCAGGT GGCCTATGA CGTCTAGGG GAAGATCAG
AGATTAACT TCGCCCGGC CGGGCGGGG CCCTTAAGC
CTCCTGCAG TTCCGGCCC CACCCCACT GCGCCCACA
GTACGTGTG AGGTTCTGC CTAACAGGA TCCCCCACC
GGACCACGT AGTACGCAA GCGGGCGGG TGGCGGCCA
GGCGCCCCC ACGCTCCTG CGTCCCGGA AGAAATGAG
AGCCTAGGC AGGTTGACA TGCACGTTT TCTGTCATG
TTTGTGAGC GTACGCAAT TTCTCGGAG AACAACCCT
TTTCAGGTT AGTTTGGGA CCTCACAAG GATCACGAG
CTGCAGCAA TGTCTACAA AGCTCTGCG ACCCATGCT
CTCCGAGGG GATAGGCCT GGGAAATTC TCAGATAGG
CAGTGGTTT GGGGAAGAT TGCCATCTT TCTGCTGCC
GACCGTCCC 

In [69]:
clus_data = net.linear_block[3].weight.cpu().detach().numpy()

In [70]:
cell_str = ''.join(cell)
cell_str = cell_str.replace('\n', '')
cell_str = cell_str.replace('CLP', '0')
cell_str = cell_str.replace('CMP', '1')
cell_str = cell_str.replace('GMP', '2')
cell_str = cell_str.replace('HSC', '3')
cell_str = cell_str.replace('LMPP', '4')
cell_str = cell_str.replace('MEP', '5')
cell_str = cell_str.replace('MPP', '6')
cell_str = cell_str.replace('pDC', '7')
cell_str = cell_str.replace('UNK', '8')
cell_str = cell_str.replace('mono', '9')
cell_int = np.array([int(s) for s in cell_str])


clus_k = KMeans(n_clusters=10)
clus_res = clus_k.fit_predict(clus_data)

tsne = TSNE(n_components=2, perplexity=30, learning_rate='auto')
dim_rdc_res = tsne.fit_transform(clus_data)


ami = adjusted_mutual_info_score(cell_int, clus_res)
nmi = normalized_mutual_info_score(cell_int, clus_res)

In [72]:
ami

0.040721380790712976

In [73]:
nmi

0.05032347996358814

In [75]:
np.save('./clus_data_500.npy', clus_data)