# Thymus Ageing Atlas: Load velocyto outputs

In [None]:
import os
import sys
import session_info
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import anndata as ad
import hdf5plugin

# Make the repository (and its scripts directory) importable from this notebook,
# so helper modules and metadata shipped with the repo can be loaded.
# The checkout location is pinned explicitly; the commented-out alternative
# derives it from the current working directory instead.
#repo_path,_ = os.path.split(os.path.split(os.getcwd())[0])
repo_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis'
sys.path.insert(1, repo_path)
# Derive the scripts directory from repo_path so the two entries cannot drift apart
sys.path.insert(2, f'{repo_path}/scripts')

%load_ext autoreload
%autoreload 2

# Locations used throughout the notebook, all rooted at the repository checkout
data_path = f'{repo_path}/data'
general_data_path = data_path  # same directory, kept under both names
plots_path = f'{repo_path}/plots/preprocessing'

In [None]:
# Load the metadata spreadsheet selected by `get_latest_version`
from utils import get_latest_version, update_obs

metadata_dir = f'{general_data_path}/metadata'
latest_meta_path = get_latest_version(dir=metadata_dir, file_prefix='Thymus_ageing_metadata')
meta = pd.read_excel(latest_meta_path)

In [None]:
# Load adata
object_version = 'v4_2025-02-04'
rna_prefix = f'{general_data_path}/objects/rna/thyAgeing_all_scvi_{object_version}'
adata = ad.read_h5ad(f'{rna_prefix}.zarr')

# Swap in the curated annotations: obs columns that also exist in the
# annotation table are dropped first, then the table is joined on the obs index
ct_anno = pd.read_csv(f'{rna_prefix}_curatedAnno_v8.csv', index_col=0)
outdated_cols = [col for col in ct_anno.columns if col in adata.obs.columns]
adata.obs.drop(columns=outdated_cols, inplace=True)
adata.obs = adata.obs.join(ct_anno)

In [None]:
# Select the libraries to process: '5GEX' chemistry, a known raw GEX path,
# and healthy status
is_5gex = meta['chemistry_simple'] == '5GEX'
has_raw_gex = meta['path_raw_gex'].notna()
is_healthy = meta['health_status'] == 'healthy'
velocyto_meta = meta.loc[is_5gex & has_raw_gex & is_healthy]

# Notarangelo2024 libraries whose name contains exactly two underscores are
# set aside here (they are processed in the multiplexed-libraries section)
two_underscores = velocyto_meta['library'].str.count('_') == 2
from_notarangelo = velocyto_meta['study'] == 'Notarangelo2024'
demux_libs = velocyto_meta.loc[two_underscores & from_notarangelo]
velocyto_meta = velocyto_meta.loc[~velocyto_meta['library'].isin(demux_libs['library'])]

# Number of remaining libraries per study
velocyto_meta['study'].value_counts()

In [None]:
# Sanity check: count libraries without a raw GEX path. Rows with a missing
# path were already excluded when velocyto_meta was built, so this should be 0.
velocyto_meta['path_raw_gex'].isna().sum()

In [None]:
# Check, per library, that the Velocyto output directory exists and that it
# contains a spliced matrix (plain or gzipped)
samples_wo_gex = []   # libraries whose Velocyto directory is missing
missing_mat = []      # libraries whose directory exists but lacks spliced.mtx(.gz)
for sample, gex_path in zip(velocyto_meta['library'], velocyto_meta['path_raw_gex']):
    # The Velocyto directory is derived from the GEX path by text substitution
    velo_path = gex_path.replace('Gene','Velocyto/raw')

    if not os.path.exists(velo_path):
        print(f'{sample} does not exist')
        samples_wo_gex.append(sample)
        continue

    has_spliced = any(
        os.path.exists(f'{velo_path}/spliced.mtx{suffix}') for suffix in ('', '.gz')
    )
    if not has_spliced:
        print(f'{sample} does not have spliced.mtx')
        missing_mat.append(sample)

# Note from the original author: could not find that data

In [None]:
# Keep only libraries whose Velocyto directory was found
has_velocyto_dir = ~velocyto_meta['library'].isin(samples_wo_gex)
velocyto_meta = velocyto_meta.loc[has_velocyto_dir]
velocyto_meta.shape

# Save the filtered library table
velocyto_meta_file = f'{data_path}/objects/velocyto/thyAgeing_all_scvi_{object_version}_velocyto_meta.csv'
velocyto_meta.to_csv(velocyto_meta_file)

In [None]:
# Write one headerless, single-column TSV of cell barcodes per `taa_l1` value
barcode_dir = f'{data_path}/objects/velocyto/compartment_barcodes'
for c in adata.obs['taa_l1'].unique():
    in_compartment = adata.obs['taa_l1'] == c
    barcodes = pd.Series(adata.obs_names[in_compartment].tolist())
    barcodes_file = f'{barcode_dir}/thyAgeing_all_scvi_{object_version}_{c}_barcodes.tsv'
    barcodes.to_csv(barcodes_file, sep='\t', index=False, header=False)

In [None]:
# Test snippet
# sys.path.insert(1, f'{repo_path}/scripts')
# from utils import velocyto_to_anndata

# barcodes_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data/objects/velocyto/compartment_barcodes/thyAgeing_all_scvi_v4_2025-02-04_B_barcodes.tsv'
# meta_path = '/lustre/scratch126/cellgen/team205/lm25/thymus_projects/thymus_ageing_atlas/General_analysis/data/objects/velocyto/thyAgeing_all_scvi_v4_2025-02-04_velocyto_meta.csv'

# # Read barcodes and meta
# meta = pd.read_csv(meta_path, index_col = 0)
# barcodes = pd.read_csv(barcodes_path, sep='\t', header=None)[0].tolist()

# velocyto_adata = velocyto_to_anndata(meta = velocyto_meta.iloc[:8], subset_barcodes=barcodes, n_cpu = 4)

In [None]:
import subprocess

# Submit one LSF job per `taa_l1` value; each job runs
# 01b_dataIntegration_velocyto.py on that value's barcode list.
cell_types = adata.obs['taa_l1'].unique()
velocyto_dir = f'{data_path}/objects/velocyto'
# Build file names from `object_version` rather than repeating the literal, so
# they always match the metadata and barcode files written earlier.
file_stem = f'thyAgeing_all_scvi_{object_version}'
meta_path = f'{velocyto_dir}/{file_stem}_velocyto_meta.csv'
log_dir = f'{velocyto_dir}/logs'
n_cpu = 4

# Make sure the log directory exists before any job tries to write to it
os.makedirs(log_dir, exist_ok=True)

# NOTE(review): the first entry of `cell_types` is skipped on purpose here
# ([1:]) — presumably it was already processed; confirm before re-running.
for ct in cell_types[1:]:
    barcodes_path = f'{velocyto_dir}/compartment_barcodes/{file_stem}_{ct}_barcodes.tsv'
    out_name = f'{velocyto_dir}/{file_stem}_{ct}_velocyto.zarr'

    # Shell command executed inside the job: set up the conda environment,
    # then run the conversion script with this compartment's arguments
    command = (
        'source /nfs/users/nfs_l/lm25/.bashrc ; conda activate /nfs/team205/lm25/condaEnvs/thymusAgeing ;'
        f'python {repo_path}/notebooks/preprocessing/01b_dataIntegration_velocyto.py'
        f' --barcodes_path {barcodes_path}'
        f' --velocyto_meta_path {meta_path}'
        f' --out_file_name {out_name}'
        f' --n_cpu {n_cpu}'
    )

    # Submit the command as an LSF job
    submission = subprocess.run([
        'bsub',
        '-q', 'hugemem',
        '-G', 'team361',
        '-J', f'velocyto_{ct}',
        '-o', f'{log_dir}/velocyto_{ct}_%J.out',
        '-e', f'{log_dir}/velocyto_{ct}_%J.err',
        '-n', f'{n_cpu}',
        "-M250000",
        "-R", "span[hosts=1] select[mem>250000] rusage[mem=250000]",
        '-W', '03:00',
        f" eval {command}"
    ])
    # Report (without aborting the remaining submissions) when bsub itself fails
    if submission.returncode != 0:
        print(f'bsub submission failed for {ct} (exit code {submission.returncode})')

## Velocyto on multiplexed libraries

In [None]:
# Metadata for the multiplexed libraries set aside earlier, one row per library
demux_library_names = demux_libs['library'].tolist()
velocyto_meta_demux = (
    meta.loc[meta['library'].isin(demux_library_names)]
    .drop_duplicates(subset='library')
)

# Quick check: number of missing raw GEX paths, and the table dimensions
velocyto_meta_demux['path_raw_gex'].isna().sum(), velocyto_meta_demux.shape

In [None]:
# Display the multiplexed-library metadata table for visual inspection
velocyto_meta_demux

In [None]:
%%capture output
# Build the velocyto AnnData for the multiplexed libraries
scripts_dir = f'{repo_path}/scripts'
sys.path.insert(1, scripts_dir)
from utils import velocyto_to_anndata

velocyto_adata_demux = velocyto_to_anndata(
    meta=velocyto_meta_demux,
    col_library='library',
    col_prefix='index',
    n_cpu=4,
)

In [None]:
# 'sample' and 'index' are removed from obs here (per the original note they
# need to be added through demuxing); the barcode is the text after the last
# '-' of each obs name
demux_obs = velocyto_adata_demux.obs.drop(columns=['sample', 'index'])
demux_obs['barcode'] = velocyto_adata_demux.obs_names.str.split('-').str[-1]
velocyto_adata_demux.obs = demux_obs

velocyto_adata_demux.obs.head()

In [None]:
# Read the HTO barcode assignments and attach the library name for each 'index'
hto_assignments_csv = '/lustre/scratch126/cellgen/team205/lm25/raw_data/Notarangelo2024/HTO_CITEseq_count_outputs/Notarangelo2024_HTO_barcode_assignments.csv'
index_to_library = velocyto_meta_demux[['index', 'library']].drop_duplicates()
barcode_assignments = pd.read_csv(hto_assignments_csv).merge(index_to_library, on='index')

barcode_assignments.head()

In [None]:
# Attach the HTO assignments to every cell by matching on (barcode, library).
# how='left' keeps all cells; cells without a match get NaN in the added columns.
# NOTE(review): a column-on-column merge ignores the existing index, so the
# previous obs_names are discarded here and set again on the next line.
velocyto_adata_demux.obs = velocyto_adata_demux.obs.merge(barcode_assignments, on = ['barcode', 'library'], how = 'left')
# NOTE(review): cell names are taken from the 'hto_assignment.1' column —
# presumably it holds the per-cell identifier; cells without a match will have
# NaN here. Confirm against the assignments CSV.
velocyto_adata_demux.obs_names = velocyto_adata_demux.obs['hto_assignment.1']

velocyto_adata_demux.obs.head()

In [None]:
# Keep only cells that matched a barcode assignment (non-missing 'index'), then
# add the library-level metadata.
# Subsetting an AnnData returns a view; `.copy()` makes it an independent
# object so that its `.obs` can be replaced safely below.
has_assignment = ~velocyto_adata_demux.obs['index'].isna()
velocyto_adata_demux = velocyto_adata_demux[has_assignment].copy()

# Drop the helper HTO columns on a plain DataFrame instead of mutating
# `.obs` in place
obs_demux = velocyto_adata_demux.obs.drop(
    columns=['hto_assignment', 'hto_assignment_orig', 'hto_assignment.1']
)

# The merge discards the index, so the cell names are carried through a
# temporary 'names' column and restored afterwards
velocyto_adata_demux.obs = pd.merge(
    left=obs_demux.reset_index(names='names'),
    right=velocyto_meta_demux,
    how="left",
    on=['index', 'library'],
).set_index('names')

velocyto_adata_demux.obs

In [None]:
# Inspect the obs column dtypes (object columns are cast to str before writing)
velocyto_adata_demux.obs.dtypes

In [None]:
# Cast every object-dtype obs column to str, then write the object to disk
# with zstd compression
object_cols = [
    col for col in velocyto_adata_demux.obs.columns
    if velocyto_adata_demux.obs[col].dtype == 'object'
]
for col in object_cols:
    velocyto_adata_demux.obs[col] = velocyto_adata_demux.obs[col].astype('str')

demux_out_file = f'{velocyto_dir}/thyAgeing_Notarangelo2024_velocyto.zarr'
velocyto_adata_demux.write_h5ad(
    demux_out_file,
    compression=hdf5plugin.FILTERS["zstd"],
    compression_opts=hdf5plugin.Zstd(clevel=5).filter_options,
)

In [None]:
## Some of the samples require correction of the format to make them readable by scanpy functions (hence this function)
def feature_check_correct(sample, feature_path):
    """Ensure ``features.tsv.gz`` in *feature_path* has a third column.

    The file is read as a headerless tab-separated table. If it has fewer
    than three columns, a column filled with ``'Gene Expression'`` is added
    and the file is overwritten in place; otherwise nothing is written.

    Parameters
    ----------
    sample
        Sample identifier, used only in the printed message.
    feature_path
        Directory containing ``features.tsv.gz``.
    """
    features_file = f"{feature_path}/features.tsv.gz"
    features = pd.read_csv(features_file, sep='\t', header=None)
    if features.shape[1] < 3:
        features.loc[:, 2] = 'Gene Expression'
        # Write back to the file that was just read (the previous code used an
        # undefined name here and raised NameError)
        features.to_csv(features_file, header=False, index=False, sep='\t')
        print(f"{sample} features corrected")
### Actual reader 
def AnnData_from_files_v2(samples, starsolo_path, cellbender_path, meta):
    """Build one AnnData from per-sample count outputs, including velocyto layers.

    For each sample id the function reads the filtered gene matrix, the
    CellBender-filtered h5 file and the raw Velocyto matrix, keeps the cells
    present in both filtered inputs, and stores the CellBender counts in ``X``
    with ``raw``, ``spliced``, ``unspliced`` and ``ambiguous`` layers. The
    per-sample objects are concatenated and ``meta`` is left-merged onto obs.

    Parameters
    ----------
    samples
        Iterable of sample ids; each id is used as a directory name under
        ``starsolo_path`` and ``cellbender_path``.
    starsolo_path
        Root directory; ``{sid}/logs/GeneFull/filtered`` and
        ``{sid}/logs/Velocyto/raw`` are read from it.
    cellbender_path
        Root directory; ``{sid}/{sid}_filtered.h5`` is read from it.
    meta
        DataFrame merged onto obs via its ``SampleID`` column.

    Returns
    -------
    The concatenated AnnData, with obs indexed by ``{SampleID}-{barcode}``.

    Notes
    -----
    - An empty ``samples`` fails at ``ad[0]`` with IndexError.
    - NOTE(review): the local list ``ad`` shadows the module-level ``ad``
      (anndata) alias inside this function.
    - NOTE(review): the layers are taken from differently-sourced matrices and
      combined without re-ordering genes — assumes all inputs share the same
      gene order; TODO confirm.
    """
    # Collect one AnnData per sample in a list, then concatenate at the end
    import numpy as np
    from scipy import sparse
    ad = []
    #Generate AnnData for each sample
    for sid in samples:
        cr_filt_path = f"{starsolo_path}/{sid}/logs/GeneFull/filtered"
        cr_velo_path = f"{starsolo_path}/{sid}/logs/Velocyto/raw"
        cb_filt_path = f"{cellbender_path}/{sid}/{sid}_filtered.h5"
        # Check the features files of both directories before reading them
        feature_check_correct(sample = sid, feature_path=cr_filt_path)
        feature_check_correct(sample = sid, feature_path=cr_velo_path)

        cr_gene_filtered_ad = sc.read_10x_mtx(cr_filt_path)
        print("cr_gene_filtered_mtx read")
        cb_gene_filtered_ad = sc.read_10x_h5(cb_filt_path)
        print("cb_filtered_h5 read")

        # This read is only used for its obs/var tables (barcodes and genes);
        # the count values are re-parsed from matrix.mtx below
        velocyto_ad = sc.read_10x_mtx(cr_velo_path)
        print("Incorrect velocyto is read")
        # Third line of the file: first two values are used as the matrix shape
        # (presumably the MatrixMarket size line after two header lines — TODO confirm)
        shapex = np.loadtxt(f'{cr_velo_path}/matrix.mtx', skiprows=2, max_rows = 1, delimiter=' ')[0:2].astype(int)
        # Remaining lines: columns 0/1 are used as 1-based row/column indices,
        # columns 2/3/4 as the spliced/unspliced/ambiguous values
        mtx = np.loadtxt(f'{cr_velo_path}/matrix.mtx', skiprows=3, delimiter=' ')
        # Each matrix is transposed to pair with velocyto_ad's obs/var
        # (presumably the file is stored genes x barcodes — TODO confirm)
        spliced = sc.AnnData(X = sparse.csr_matrix((mtx[:,2], (mtx[:,0]-1, mtx[:,1]-1)), shape = shapex).T,
                             dtype = 'float32', obs = velocyto_ad.obs, var = velocyto_ad.var)
        unspliced = sc.AnnData(X = sparse.csr_matrix((mtx[:,3], (mtx[:,0]-1, mtx[:,1]-1)), shape = shapex).T,
                              dtype = 'float32', obs = velocyto_ad.obs, var = velocyto_ad.var)
        ambiguous = sc.AnnData(X = sparse.csr_matrix((mtx[:,4], (mtx[:,0]-1, mtx[:,1]-1)), shape = shapex).T,
                              dtype = 'float32', obs = velocyto_ad.obs, var = velocyto_ad.var)
        print("velocyto mtx done")

        # Cells present in both filtered inputs. Built from a set, so the
        # resulting cell order is not guaranteed to be stable between runs.
        common_cells = list(
            set(cr_gene_filtered_ad.obs_names.tolist()) & set(cb_gene_filtered_ad.obs_names.tolist()))

        # NOTE(review): selecting common_cells from the velocyto objects
        # assumes every such barcode is present in the raw Velocyto matrix
        ad1 = sc.AnnData(
            X=cb_gene_filtered_ad[common_cells, :].X,
            obs=cb_gene_filtered_ad.obs.loc[common_cells,:].copy(),
            var=cb_gene_filtered_ad.var.copy(),
            layers={
                "raw": cr_gene_filtered_ad[common_cells, :].X,
                "spliced": spliced[common_cells, :].X,
                "unspliced": unspliced[common_cells, :].X,
                "ambiguous": ambiguous[common_cells, :].X,
            },
        )
        # Keep the original var index as SYMBOL before making var names unique
        ad1.var.rename(columns = {'gene_ids':'ENSEMBL'}, inplace = True)
        ad1.var['SYMBOL'] = ad1.var.index
        ad1.var_names_make_unique()
        # Prefix barcodes with the sample id so obs names differ across samples
        ad1.obs['SampleID'] = sid
        ad1.obs['barcode'] = ad1.obs_names
        ad1.obs_names = ad1.obs['SampleID']+"-"+ad1.obs['barcode']
        ad.append(ad1)
        print(f"{sid} anndata created")
    # NOTE(review): this import is not used anywhere in the function
    from anndata import AnnData
    # index_unique=None keeps the obs names exactly as built above
    adata = ad[0].concatenate (ad[1:], batch_key = 'concat_sample_no', index_unique = None)
    #Add cleaned metadata to the Anndata.obs table
    obs_merged = pd.merge(left = adata.obs, right = meta, how = "left", left_on="SampleID", right_on="SampleID")
    # The index is rebuilt from SampleID and barcode after the merge
    obs_merged.index = obs_merged['SampleID']+"-"+obs_merged['barcode']
    adata.obs = obs_merged
    return adata