In [1]:
import os              
os.environ['PYTHONHASHSEED'] = '0'
import pandas as pd                                                    
import numpy as np                                                     
import scanpy as sc                                                                                 
from time import time                                                       
import sys
import matplotlib

import matplotlib.pyplot as plt
from anndata import AnnData, read_h5ad, concat
from tqdm import tqdm
import scipy
import scipy.stats as ss
from sklearn.model_selection import train_test_split

Raw SMARTseq reference data are downloaded from Nowakowski, T. J. et al. doi:10.1126/science.aap8809 (2017). "h2_marker.csv" is generated by "Fig1/Fig1E_marker_detect.py"

In [None]:
sc_expression = pd.read_csv('SMARTseq_exprMatrix.tsv', sep='\t', header=0)

# Set gene column as row names and remove the original gene column
sc_expression.set_index('gene', inplace=True)

# Extract gene names before the pipe symbol (|)
sc_expression.index = sc_expression.index.str.split('|').str[0]

# Load metadata
sc_meta = pd.read_csv('SMARTseq_meta.tsv', sep='\t', header=0)
sc_meta.index = sc_meta['Cell']
# Filter out columns with NA values in WGCNAcluster
# valid_indices = ~sc_meta['WGCNAcluster'].isna()
sc_expression = sc_expression.iloc[:,~sc_meta['WGCNAcluster'].isna().values]
sc_meta = sc_meta.iloc[~sc_meta['WGCNAcluster'].isna().values, :]

adata = AnnData(sc_expression.T, obs=sc_meta)

adata = adata[adata.obs['WGCNAcluster'] != ""].copy()
cluster_exclude = ["U1", "U2", "U3", "U4", "MGE-div", "MGE-IPC1", "MGE-IPC2", "MGE-IPC3", "MGE-RG1", "MGE-RG2"]
adata = adata[~adata.obs['WGCNAcluster'].isin(cluster_exclude)].copy()

cluster_in = ["nIN1", "nIN2", "nIN3", "nIN4", "nIN5"]
adata.obs['WGCNAcluster'][adata.obs['WGCNAcluster'].isin(cluster_in)] = "nIN"

  adata = AnnData(sc_expression.T, obs=sc_meta)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs['WGCNAcluster'][adata.obs['WGCNAcluster'].isin(cluster_in)] = "nIN"


In [21]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.scale(adata, max_value=10)

In [30]:
marker_h2 = pd.read_csv("h2_marker.csv", index_col = 0)
adata_select = adata[:, adata.var.index.isin(marker_h2.iloc[0, :].values)]
adata_df = pd.DataFrame(adata_select.X)
adata_df.columns = adata_select.var.index
# adata_df['cluster'] = adata.obs['H2_annotation'].values
adata_df['cluster'] = adata_select.obs['WGCNAcluster'].values

adata_zs = adata_df.groupby(by='cluster').agg('mean')
adata_nz = adata_df.groupby(by='cluster').agg(lambda x: np.mean(x != np.min(x)))

adata_zs.to_csv("result/smartseq_zs.csv")
adata_nz.to_csv("result/smartseq_nz.csv")