**Step 0. Preparing packages and modules**

In [None]:
!pip install phenograph
!pip install umap-learn
!pip install scanpy
!pip install louvain
!pip install git+https://github.com/saketkc/pysctransform.git

In [None]:
# Core numerical / deep-learning stack.
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings
# Suppress FutureWarning messages and restrict TensorFlow logging to errors.
warnings.simplefilter(action='ignore', category=FutureWarning)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from tensorflow.keras.losses import MSE, KLD
# PhenoGraph is used in the evaluation cells to cluster the encoded data.
import phenograph

from keras.layers import Input, Dense
from keras.models import Model
from keras import regularizers
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Single-cell preprocessing: SCTransform normalization on an AnnData object.
from pysctransform import vst, get_hvg_residuals, SCTransform
import anndata as ad
import scanpy as sc
# NOTE(review): wildcard import; `csr_matrix` (normalization cell) is the only
# scipy.sparse name referenced in the visible cells.
from scipy.sparse import *

In [None]:
# Mount Google Drive at /content/drive; the dataset and the project checkout
# used by later cells are read from paths below this mount point.
from google.colab import drive
drive.mount('/content/drive')

**Step 1. Preprocessing**

Here we use Scanpy and pySCTransform to normalize data.

Alternatively, you can run SCTransform from the Seurat R package and load the transformed data here.

In [None]:
# Directory on the mounted Drive that holds the example PBMC files.
DATA_DIR = "/content/drive/My Drive/compbio/test_data"

# Reference labels (presumably one per cell); the first column of the file is
# used as the index. The file is tab-separated despite its .csv extension.
idents = np.array(pd.read_csv(f"{DATA_DIR}/pbmc_idents.csv", index_col=0, sep="\t"))

# Raw count matrix, also tab-separated.
# rows: cells
# columns: genes
df = pd.read_csv(f"{DATA_DIR}/pbmc_raw.csv", index_col=0, sep="\t")
count = np.array(df)

# For datasets with no labels, use placeholder labels instead:
# idents = np.zeros(count.shape[0])
# Alternatively, load already SCTransformed data here directly.

In [None]:
# Wrap the count matrix (cells x genes) in an AnnData container.
adata = sc.AnnData(X=count)

In [None]:
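# Optional (disabled): uncomment to copy the gene names from `df` onto
# `adata.var_names`. `cells` is extracted as well but is not assigned to
# `adata` by these lines.
#cells = df.index.values.astype('str')
#genes = df.columns.values.astype('str')
#adata.var_names = genes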

In [None]:
# Normalization
# Convert the matrix held in `adata.X` to CSR format before handing `adata`
# to SCTransform.
adata.X = csr_matrix(adata.X)
# Run SCTransform with `var_features_n=3000`; the result is converted to a
# NumPy array and used as the model input in a later cell.
# NOTE(review): presumably residuals restricted to the 3000 selected variable
# genes — confirm against the pysctransform documentation.
residuals = SCTransform(adata, var_features_n=3000)

**Step 2. Building model**

In [None]:
# Switch the working directory to the cloned clustering_SAE project so that
# its modules (utils, layers, sae) can be imported by name.
# Side effect: relative paths used later in the notebook resolve against this
# directory.
import os
os.chdir("/content/drive/My Drive/compbio/gitclone/clustering_SAE")
# NOTE(review): wildcard import — presumably the source of id2number, dotsne,
# doumap, myscatter and measure used in later cells; confirm against utils.py.
from utils import *
from layers import DenseTranspose
from sae import SAE

In [None]:
# Turn the SCTransform output into the float32 training matrix and record its
# dimensions (rows are samples, columns are genes).
count = np.array(residuals)
x_train = count.astype(np.float32)
n_sample, n_gene = count.shape

# Flatten the labels to a 1-D string array, then pass them through `id2number`.
idents = idents.flatten().astype(str)
idents_new = id2number(idents)

In [None]:
# Build the model from the training matrix, the labels and the data dimensions.
# NOTE(review): the string labels `idents` are passed rather than `idents_new`,
# and `n_sample` is passed twice — confirm both against the SAE constructor.
autoencoder = SAE(x_train, idents, n_sample, n_gene, n_sample)

**Step 3. Pre-training**

In [None]:
# Staged pre-training: the first stage starts from scratch and every following
# stage receives the value returned by the stage before it.
h = autoencoder.train1()
for train_stage in (autoencoder.train2, autoencoder.train3, autoencoder.train4):
    h = train_stage(h)

# Final training pass over the whole model.
autoencoder.train(max_epoch=100)

In [None]:
# t-SNE
# Encode the training matrix, project the encoded data with `dotsne`, and plot
# the projection together with the labels.
encoded_data = autoencoder.ec(x_train)
ed = dotsne(np.array(encoded_data))
myscatter(ed, idents, legend=True)

In [None]:
# UMAP
# Encode the training matrix with the current model.
encoded_data = autoencoder.ec(x_train)
# Feed UMAP the encoder output itself (not `ed`, which holds the t-SNE
# coordinates from the t-SNE cell), so the projection reflects the latent space
# and this cell no longer depends on the t-SNE cell having been run.
embeddings = doumap(np.array(encoded_data))
myscatter(embeddings, idents, legend=True)

In [None]:
# Evaluation
# Cluster the encoded data with PhenoGraph; of its three return values only the
# first (the cluster assignments) is kept.
labels, _,  _ = phenograph.cluster(np.array(encoded_data))
# Compare the clustering against the reference labels.
# NOTE(review): `measure` presumably comes from the utils wildcard import —
# confirm which metrics it reports.
measure(idents, labels)

**Step 4. Clustering training**

In [None]:
# Run the clustering training stage of the model with `max_epoch=50`.
autoencoder.clustering_train(max_epoch=50)

In [None]:
# t-SNE
# Re-encode the training matrix after clustering training, project the encoded
# data with `dotsne`, and plot the projection together with the labels.
encoded_data = autoencoder.ec(x_train)
ed = dotsne(np.array(encoded_data))
myscatter(ed, idents, legend=True)

In [None]:
# UMAP
# Re-encode the training matrix with the model after clustering training.
encoded_data = autoencoder.ec(x_train)
# Feed UMAP the encoder output itself (not `ed`, which holds the t-SNE
# coordinates from the t-SNE cell), so the projection reflects the latent space
# and this cell no longer depends on the t-SNE cell having been run.
embeddings = doumap(np.array(encoded_data))
myscatter(embeddings, idents, legend=True)

In [None]:
# Evaluation
# Cluster the encoded data with PhenoGraph; of its three return values only the
# first (the cluster assignments) is kept.
labels, _,  _ = phenograph.cluster(np.array(encoded_data))
# Compare the clustering against the reference labels.
# NOTE(review): `measure` presumably comes from the utils wildcard import —
# confirm which metrics it reports.
measure(idents, labels)

**Step 5. Saving results**

In [None]:
# Save the weights of the inner `autoencoder` model to an HDF5 file.
# The path is relative, so it resolves against the current working directory
# (changed with os.chdir in the model-building step).
# NOTE(review): Keras 3 requires weight files to end in `.weights.h5` — confirm
# against the installed Keras version.
autoencoder.autoencoder.save_weights('autoencoder_pbmc.h5')

In [None]:
# Export the most recently computed encoded data as CSV (header row, no index).
save = pd.DataFrame(data=np.array(encoded_data))
save.to_csv(
    '/content/drive/My Drive/compbio/test_data/pbmc_model.csv',
    header=True,
    index=False,
)

In [None]:
# Export the t-SNE output held in `ed` as CSV (header row, no index).
save = pd.DataFrame(data=ed)
save.to_csv(
    '/content/drive/My Drive/compbio/test_data/pbmc_model_tsne.csv',
    header=True,
    index=False,
)

In [None]:
# Export the UMAP output held in `embeddings` as CSV (header row, no index).
save = pd.DataFrame(data=embeddings)
save.to_csv(
    '/content/drive/My Drive/compbio/test_data/pbmc_model_umap.csv',
    header=True,
    index=False,
)