In [None]:
import sys
import os
import os.path
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from scipy.sparse import issparse
import anndata
from anndata import AnnData
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from collections import defaultdict, OrderedDict
import gzip
import csv
import re
import io
import logging
import warnings
import subprocess
from preprocessing import *

# --- Notebook-wide configuration -------------------------------------------
# Timestamped log lines for this notebook's own logger.
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')

# Cap how many DataFrame rows are rendered in cell output.
pd.options.display.max_rows = 50

# scanpy: set verbosity level 3, print the version header, set figure defaults.
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(color_map="cividis", dpi=120)

In [None]:
# Dataset identifier: interpolated into the dataset directory path and into
# the name of the .h5ad file written at the end of the notebook.
dataset = "North_2019"
# Species label handed to standardize_gene_symbols(..., species=species).
species = "Human"

In [None]:
# Resolve the dataset directory relative to the current working directory
# and list whatever sits in its 'downloads' sub-directory.
dirname = os.getcwd()
relative_source = os.path.join(dirname, f'../../build/datasets/{dataset}')
source = os.path.abspath(relative_source)
fns = os.listdir(os.path.join(source, 'downloads'))
print(source)
fns

In [None]:
# Workbook read into meta_df (the table keyed by patient_id).
meta_fn = 'Supplementary_Table1.xlsx'
# Workbook read twice below: once for the three sample-annotation rows
# (sample_df) and once for the gene-by-sample value table (tpm_df).
data_fn = 'Supplementary_Table2.xlsx'

In [None]:
# Load the patient-level metadata sheet; the column names come from the
# second row of the sheet (header=1).
meta_df = pd.read_excel(os.path.join(source, 'downloads', meta_fn), header=1)

# Keep only the first 29 rows (presumably the patient rows; anything below is
# discarded -- confirm against the workbook). The explicit .copy() makes the
# result an independent frame, so the column assignments below never act on a
# view of the frame returned by read_excel.
meta_df = meta_df.iloc[:29].copy()

# Normalise column names: lower-case, spaces -> underscores.
meta_df.columns = [x.lower().replace(" ", "_") for x in meta_df.columns]
meta_df = meta_df.rename(columns={"patient_#": "patient_id"})

# patient_id is used as a string merge key when the metadata is joined onto
# adata.obs.
meta_df["patient_id"] = meta_df["patient_id"].astype(str)

# Turn the "Not reported" placeholder into NaN and make the column numeric.
# The result is assigned back explicitly: a chained
# `meta_df[col].replace(..., inplace=True)` only modifies a temporary Series
# under pandas Copy-on-Write, which would leave the placeholder in place and
# make the float conversion fail.
meta_df["vas_at_maximal_intensity"] = (
    meta_df["vas_at_maximal_intensity"]
    .replace("Not reported", np.nan)
    .astype(float)
)
meta_df

In [None]:
# Read the sample-annotation rows of the data workbook: with header=2 the
# first three data rows are kept, the two 'Unnamed' columns are removed, and
# the frame is transposed so that each original column becomes a row.
sample_df = (
    pd.read_excel(os.path.join(source, 'downloads', data_fn), header=2)
    .iloc[:3]
    .drop(columns=['Unnamed: 1', 'Unnamed: 2'])
    .T
)
sample_df.columns = ["sex", "patient_id", "associated_pain"]

# Drop the first row of the transposed frame, i.e. what was the leading
# column of the sheet (presumably the row-label column -- confirm).
sample_df = sample_df.iloc[1:]
sample_df

In [None]:
# Read the value table of the data workbook (header row at index 6) and
# relabel its columns: three gene-annotation columns followed by one column
# per entry of sample_df.index.
tpm_df = pd.read_excel(os.path.join(source, 'downloads', data_fn), header=6)
tpm_df.columns = ["gene_name", "entrez_id", "gene_chr", *sample_df.index]

# Strip leading '<' and trailing '>' characters from every gene name, then
# use the gene name as the row index.
tpm_df["gene_name"] = tpm_df["gene_name"].map(
    lambda name: name.lstrip('<').rstrip('>')
)
tpm_df = tpm_df.set_index("gene_name")

# Split the gene annotations (kept as var_df) from the per-sample values.
annotation_cols = ["entrez_id", "gene_chr"]
var_df = tpm_df[annotation_cols].copy()
tpm_df = tpm_df.drop(columns=annotation_cols)

# '[NA]' placeholders become NaN; every remaining column is cast to float.
tpm_df = tpm_df.replace('[NA]', np.nan).astype(float)

tpm_df

In [None]:
# Build the AnnData object from the transposed value table, so the columns
# of tpm_df become observations and its rows (gene names) become variables.
adata = AnnData(np.array(tpm_df.T), dtype=float)

# Start the observation table from the sample annotations.
adata.obs = sample_df.copy()

# sample_id: observation name with everything after its last '_' removed.
adata.obs["sample_id"] = [name.rsplit("_", 1)[0] for name in adata.obs_names]

# Snapshot the raw patient labels before the column is overwritten.
# is_pair is the text after the last " (" with trailing ')' stripped;
# patient_id is the first space-separated token.
patient_labels = list(adata.obs["patient_id"])
adata.obs["is_pair"] = [label.split(" (")[-1].rstrip(")") for label in patient_labels]
adata.obs["patient_id"] = [str(label.split(" ")[0]) for label in patient_labels]

# Attach the patient-level metadata (minus its own 'sex' column). The merge
# does not keep the original index, so it is re-set from sample_id afterwards.
patient_meta = meta_df.drop(columns=["sex"])
adata.obs = adata.obs.merge(patient_meta, how="left", on="patient_id")
adata.obs.index = list(adata.obs["sample_id"])

adata.var = var_df.copy()
adata

In [None]:
# process dataset
# These helpers are not defined in this notebook (presumably they come from
# the `preprocessing` star import). Each is called with adata only and its
# return value is discarded; they run in this exact order.
in_place_steps = (
    ensure_unique_idx,
    enforce_sparsity,
    fix_nan_x,
    filter_nan_var_names,
    del_raw,
    calc_n_genes,
)
for step in in_place_steps:
    step(adata)

# The last step additionally needs the species label.
standardize_gene_symbols(adata, species=species)
adata

In [None]:
# Inspect adata.var (initialised from var_df: entrez_id, gene_chr) after the
# preprocessing helpers have run.
adata.var

In [None]:
# t-SNE perplexity: 10, or one less than the number of observations when
# there are fewer than 11 of them.
n_samples = adata.n_obs
perplexity = min(10, n_samples - 1)

# PCA and the neighbourhood graph are always computed.
sc.pp.pca(adata)
sc.pp.neighbors(adata)

# Clustering and embeddings are only attempted with more than two observations.
if n_samples > 2:
    sc.tl.louvain(adata)
    sc.tl.leiden(adata)
    sc.tl.paga(adata, groups='louvain')
    # NOTE(review): the plot is kept ahead of UMAP on purpose -- init_pos='paga'
    # presumably relies on positions produced by plotting; confirm against the
    # scanpy documentation before reordering.
    sc.pl.paga(adata)
    sc.tl.umap(adata, init_pos='paga')
    sc.tl.tsne(adata, perplexity=perplexity)

In [None]:
# Write the AnnData object into the dataset directory (`source`) as
# <dataset>.h5ad with gzip compression; the target path is printed first.
fname = os.path.join(source, f"{dataset}.h5ad")
print(fname)
adata.write(fname, compression="gzip")