In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import copy
import numpy as np
import sys
import scanpy as sc
import anndata as ad
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression
from scipy.sparse import csc_matrix
import logging
from torch.utils.data.dataset import Dataset
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix, csc_matrix
from scipy.io import mmread, mmwrite
import pandas as pd

In [2]:
# Method metadata; 'functionality_name' is recorded as the 'method_id' of the
# prediction AnnData built later in this notebook.
meta = dict(functionality_name='lslab')

In [3]:
def _read_name_column(tsv_path, column):
    """Read a headerless, single-column TSV and return it as a Series named `column`.

    Assigning the one-element column list raises if the file has more than
    one column, exactly as the inline version of this code did.
    """
    names = pd.read_csv(tsv_path, sep='\t', header=None, index_col=None)
    names.columns = [column]
    return names[column]


def _load_modality(matrix_dir, features_dir=None):
    """Build an AnnData object from a directory of barcodes/matrix/features files.

    Parameters
    ----------
    matrix_dir : str
        Directory containing `barcodes.tsv` and `matrix.mtx`.
    features_dir : str, optional
        Directory containing `features.tsv`. Defaults to `matrix_dir`.

    Returns
    -------
    anndata.AnnData
        The transposed matrix as CSR, with obs indexed by the barcodes,
        var indexed by the feature names, and var names made unique.
    """
    if features_dir is None:
        features_dir = matrix_dir
    cell_ids = _read_name_column(matrix_dir + '/barcodes.tsv', 'cell_ids')
    # The matrix is transposed on load so that rows line up with the barcodes.
    counts = csr_matrix(mmread(matrix_dir + '/matrix.mtx').T)
    gene_ids = _read_name_column(features_dir + '/features.tsv', 'gene_ids')
    modality = ad.AnnData(
        counts,
        obs=pd.DataFrame(index=cell_ids),
        var=pd.DataFrame(index=gene_ids),
    )
    modality.var_names_make_unique()
    return modality


# NOTE(review): both test sets take their feature names from the *Train*
# directory, as the original code did. Presumably train and test share the
# same feature list -- confirm, or drop `features_dir` to read the Test files.
input_train_mod1 = _load_modality('./Dataset37/Train/RNA')
final_input_test_mod1 = _load_modality('./Dataset37/Test/RNA', features_dir='./Dataset37/Train/RNA')

input_train_mod2 = _load_modality('./Dataset37/Train/ATAC')
input_test_mod2 = _load_modality('./Dataset37/Test/ATAC', features_dir='./Dataset37/Train/ATAC')

# --- Configuration -----------------------------------------------------------
save_path = './Results/'
dataset_id = "gex2atac"

# Fail early with a clear message instead of a NameError on `adata` after the
# (skipped) training loop.
if "gex2atac" not in dataset_id:
    raise ValueError(f"Unsupported dataset_id {dataset_id!r}: only 'gex2atac' is implemented")

# --- Annotate the training data ----------------------------------------------
# Modality 1 is loaded from the RNA directory, so it is labelled 'GEX'
# (the original labelled it 'ATAC', contradicting the gex2atac task id).
input_train_mod1.var['feature_types'] = pd.Categorical(len(input_train_mod1.var_names)*['GEX'])
input_train_mod1.obs['batch'] = pd.Categorical(len(input_train_mod1.obs)*['batch1'])
input_train_mod1.uns = {'dataset_id': 'human_pbmc_3k', 'organism': 'human'}
input_train_mod2.var['feature_types'] = pd.Categorical(len(input_train_mod2.var_names)*['ATAC'])
input_train_mod2.obs['batch'] = pd.Categorical(len(input_train_mod2.obs)*['batch1'])
input_train_mod2.uns = {'dataset_id': 'human_pbmc_3k', 'organism': 'human'}

# --- Stratification labels ---------------------------------------------------
# Integer code per batch, used only to stratify the folds. Categories are
# enumerated in their stored order (deterministic, unlike iterating a set),
# and the Series is iterated by value rather than by deprecated positional
# `series[i]` lookup.
batch_dict = {batch: i for i, batch in enumerate(input_train_mod1.obs["batch"].cat.categories)}
y = [int(batch_dict[batch]) for batch in input_train_mod1.obs["batch"]]

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# --- Cross-validated KNN ensemble --------------------------------------------
# Each fold fits a KNN regressor from the modality-1 SVD embedding to the
# modality-2 SVD embedding and predicts the held-out test cells; predictions
# are summed here and averaged after the loop.
out_knn = 0
for fold, (train_index, val_index) in enumerate(skf.split(input_train_mod1.obs, y)):
    print(fold)
    # Fold-local views: the full training objects are left untouched, so no
    # defensive copies are needed.
    fold_train_mod1 = input_train_mod1[train_index, :]
    fold_val_mod1 = input_train_mod1[val_index, :]
    fold_train_mod2 = input_train_mod2[train_index, :]

    # The modality-1 embedding is fitted on train + validation + test cells together.
    input_mod1 = ad.concat(
        {"train": fold_train_mod1, "val": fold_val_mod1, "test": final_input_test_mod1},
        axis=0,
        join="outer",
        label="group",
        fill_value=0,
        index_unique="-",
    )

    logging.info('Performing dimensionality reduction on modality 1 values...')
    # random_state pins the randomized SVD solver so reruns give the same result.
    embedder_mod1 = TruncatedSVD(n_components=50, random_state=0)
    mod1_pca = embedder_mod1.fit_transform(input_mod1.X)

    logging.info('Performing dimensionality reduction on modality 2 values...')
    embedder_mod2 = TruncatedSVD(n_components=50, random_state=0)
    mod2_pca = embedder_mod2.fit_transform(fold_train_mod2.X)

    group = input_mod1.obs['group'].to_numpy()
    X_train = mod1_pca[group == 'train']
    X_test = mod1_pca[group == 'test']
    y_train = mod2_pca

    logging.info('Running KNN regression...')
    reg = KNeighborsRegressor(n_neighbors=25, metric='minkowski')
    reg.fit(X_train, y_train)

    # Project the predicted embedding back to modality-2 feature space.
    out_knn = out_knn + reg.predict(X_test) @ embedder_mod2.components_

# Average over folds once, after the loop, using the splitter's own fold count.
y_pred = out_knn / skf.get_n_splits()

adata = ad.AnnData(
    X=csc_matrix(y_pred),
    obs=final_input_test_mod1.obs,
    var=input_train_mod2.var,
    uns={
        'dataset_id': dataset_id,
        'method_id': meta["functionality_name"],
    },
)

# --- Persist predictions and ground truth ------------------------------------
logging.info('Storing annotated data...')
pred = pd.DataFrame(data=y_pred, index=adata.obs_names, columns=adata.var_names)
true = pd.DataFrame(data=input_test_mod2.X.toarray(), index=input_test_mod2.obs_names, columns=input_test_mod2.var_names)
# 'a' is the HDF5 *key* (it was passed positionally before); the key is kept
# so existing readers still find the data.
pred.to_hdf(save_path + 'KAUST_pred.h5', key='a')
true.to_hdf(save_path + 'KAUST_true.h5', key='a')

0
1
2
3
4
5
6
7
8
9
