In [1]:
# https://hotspot.readthedocs.io/en/latest/Lineage_Tutorial.html
# https://colab.research.google.com/drive/1UArV6s-3JN2nKuHcsjfsYu33elR1ySa6#scrollTo=Dj24KOME2K6l

import sys
import warnings; warnings.simplefilter('ignore')
from pathlib import Path

import anndata as ad
import hotspot
import mplscience
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Read the LARRY AnnData file from disk and display its summary.
larry_h5ad_path = "/home/users/kzlin/kzlinlab/projects/scContrastiveLearn/out/kevin/Writeup5/Larry_41093_2000_norm_log.h5ad"
adata = ad.read_h5ad(larry_h5ad_path)
adata

AnnData object with n_obs × n_vars = 41093 × 2000
    obs: 'Library', 'Cell barcode', 'time_info', 'Starting population', 'state_info', 'Well', 'SPRING-x', 'SPRING-y', 'Time point', 'clone_id', 'n_genes'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'data_des', 'hvg', 'log1p'
    obsm: 'X_clone', 'X_emb'
    layers: 'ambiguous', 'matrix', 'raw_counts', 'spliced', 'unspliced'

In [3]:
# Load the precomputed scVI embedding array and show its dimensions.
scvi_embedding_path = "/home/users/kzlin/kzlinlab/projects/scContrastiveLearn/out/kevin/Writeup5/Larry_scvi_full_embeddings.npy"
scVI_embedding = np.load(scvi_embedding_path)
scVI_embedding.shape

(41093, 10)

In [5]:
# Store the embedding on the AnnData object under a single obsm key,
# then read it back to confirm the stored shape.
scvi_key = "X_scVI"
adata.obsm[scvi_key] = scVI_embedding
print(adata.obsm[scvi_key].shape)

(41093, 10)


In [6]:
# Build the list of "Library:Cell barcode" keys for the cells that belong to
# the neutrophil/monocyte trajectory.

# Tab-separated input tables
trajectory_file = "/home/users/kzlin/kzlinlab/data/larry_hematopoiesis/stateFate_inVitro_neutrophil_monocyte_trajectory.txt"
metadata_file = "/home/users/kzlin/kzlinlab/data/larry_hematopoiesis/stateFate_inVitro_metadata.txt"

trajectory_df = pd.read_csv(trajectory_file, sep="\t")
metadata_df = pd.read_csv(metadata_file, sep="\t")

# 'Cell index' is used as positional row numbers into metadata_df (iloc).
# NOTE(review): confirm these indices are 0-based, as iloc expects.
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of metadata_df (which would raise SettingWithCopyWarning).
filtered_metadata = metadata_df.iloc[trajectory_df['Cell index'].values].copy()

# Cast both parts to str so the key is built the same way as the
# 'combined_barcode' column created on adata.obs.
filtered_metadata['combined_barcode'] = (
    filtered_metadata['Library'].astype(str)
    + ':'
    + filtered_metadata['Cell barcode'].astype(str)
)

# Plain list of combined barcodes, used later to subset the AnnData object
kept_cell_barcodes = filtered_metadata['combined_barcode'].tolist()

# Number of trajectory cells found in the metadata table
print(len(kept_cell_barcodes))



96373


In [7]:
# String versions of the two identifier columns; they are cast to str first so
# they can be concatenated with ':'.
library_labels = adata.obs['Library'].astype(str)
barcode_labels = adata.obs['Cell barcode'].astype(str)

# Keep the string columns on adata.obs, then add the "Library:Cell barcode" key
# in the same format as the trajectory metadata barcodes.
adata.obs['Library_str'] = library_labels
adata.obs['Cell barcode_str'] = barcode_labels
adata.obs['combined_barcode'] = library_labels + ':' + barcode_labels

In [8]:
# Keep only the cells whose combined barcode appears in the trajectory list.
in_trajectory = adata.obs['combined_barcode'].isin(kept_cell_barcodes)

# Subsetting an AnnData returns a view. Copy it explicitly so that replacing
# .obs below modifies an independent object instead of relying on AnnData's
# implicit view-to-copy conversion (which only emits a warning).
filtered_adata = adata[in_trajectory, :].copy()

# The helper key is no longer needed once the subset has been taken
filtered_adata.obs = filtered_adata.obs.drop(columns=['combined_barcode'])

# Summary of the filtered AnnData object
print(filtered_adata)

AnnData object with n_obs × n_vars = 29137 × 2000
    obs: 'Library', 'Cell barcode', 'time_info', 'Starting population', 'state_info', 'Well', 'SPRING-x', 'SPRING-y', 'Time point', 'clone_id', 'n_genes', 'Library_str', 'Cell barcode_str'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'data_des', 'hvg', 'log1p'
    obsm: 'X_clone', 'X_emb', 'X_scVI'
    layers: 'ambiguous', 'matrix', 'raw_counts', 'spliced', 'unspliced'


In [9]:
# Number of cells per 'state_info' label in the trajectory subset
trajectory_states = filtered_adata.obs['state_info']
trajectory_states.value_counts()

state_info
Undifferentiated    14360
Neutrophil           7427
Monocyte             7349
Erythroid               1
Name: count, dtype: int64

In [10]:
# Boolean mask selecting the cells whose 'time_info' equals 2
at_time2 = filtered_adata.obs['time_info'] == 2

# Subset of the trajectory cells at that time point
filtered_adata_time2 = filtered_adata[at_time2, :]

# Summary of the time-2 subset
print(filtered_adata_time2)

View of AnnData object with n_obs × n_vars = 1406 × 2000
    obs: 'Library', 'Cell barcode', 'time_info', 'Starting population', 'state_info', 'Well', 'SPRING-x', 'SPRING-y', 'Time point', 'clone_id', 'n_genes', 'Library_str', 'Cell barcode_str'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'data_des', 'hvg', 'log1p'
    obsm: 'X_clone', 'X_emb', 'X_scVI'
    layers: 'ambiguous', 'matrix', 'raw_counts', 'spliced', 'unspliced'


In [11]:
# Number of cells per 'state_info' label among the time-2 cells
time2_states = filtered_adata_time2.obs['state_info']
time2_states.value_counts()

state_info
Undifferentiated    1355
Monocyte              32
Neutrophil            19
Name: count, dtype: int64

In [12]:
# The scanpy calls below modify the object in place. If it is still a view of
# filtered_adata, materialize it first so the modification is explicit rather
# than an implicit (warning-only) copy triggered inside scanpy/anndata.
if filtered_adata_time2.is_view:
    filtered_adata_time2 = filtered_adata_time2.copy()

# Add per-cell and per-gene QC metrics to .obs / .var
sc.pp.calculate_qc_metrics(filtered_adata_time2, inplace=True)

# Drop genes that are not detected in at least one of the remaining cells
sc.pp.filter_genes(filtered_adata_time2, min_cells=1)

In [13]:
# Store a CSC-format version of the raw counts as its own layer
raw_counts = filtered_adata_time2.layers["raw_counts"]
filtered_adata_time2.layers["raw_counts_csc"] = raw_counts.tocsc()

In [14]:
# Build the Hotspot object from the CSC raw-counts layer, using the 'danb'
# model and the scVI embedding stored in obsm as the latent space.
hotspot_options = dict(
    layer_key="raw_counts_csc",
    model='danb',
    latent_obsm_key="X_scVI",
)
hs = hotspot.Hotspot(filtered_adata_time2, **hotspot_options)

In [15]:
# Build the unweighted k-nearest-neighbor graph used by Hotspot
knn_size = 30
hs.create_knn_graph(weighted_graph=False, n_neighbors=knn_size)

In [16]:
# Compute per-gene autocorrelation statistics (4 parallel jobs) and preview
# the first 15 rows of the result table.
hs_results = hs.compute_autocorrelations(jobs=4)
top_rows = hs_results.head(15)
print(top_rows)

100%|██████████| 1548/1548 [00:01<00:00, 1287.72it/s]


                C           Z  Pval  FDR
Gene                                    
Ly6a     0.400947  134.746323   0.0  0.0
Dlk1     0.453625  116.716438   0.0  0.0
Rsad2    0.335796   98.505096   0.0  0.0
H2afy    0.417695   83.724739   0.0  0.0
Gzmb     0.171027   72.907715   0.0  0.0
Prtn3    0.695086   72.106552   0.0  0.0
Mpo      0.698842   71.570610   0.0  0.0
Ctsg     0.648183   59.776047   0.0  0.0
Irgm1    0.233706   59.211861   0.0  0.0
Akr1c13  0.240318   58.160702   0.0  0.0
Elane    0.768466   57.477669   0.0  0.0
Igtp     0.206139   55.746254   0.0  0.0
Pim2     0.208554   54.696297   0.0  0.0
Iigp1    0.179566   53.232399   0.0  0.0
Muc13    0.204629   52.511684   0.0  0.0


In [17]:
# Write the complete hs_results autocorrelation table to CSV.
autocorr_csv = Path('/home/users/kzlin/kzlinlab/projects/scContrastiveLearn/git/SCSeq_LineageBarcoding_kevin/csv/kevin/Writeup5/Writeup5_LARRY_scVI_hotspot_day2_autocorrelations.csv')

# to_csv does not create missing directories, so make sure the target exists
autocorr_csv.parent.mkdir(parents=True, exist_ok=True)
hs_results.to_csv(autocorr_csv)