# Step 2: Using Scanpy to preprocess the scRNA-seq data

## Importing libraries

In [None]:
import os
import sys
import subprocess
import pycisTopic
pycisTopic.__version__
import subprocess
from pycisTopic.cistopic_class import *
from pycisTopic.utils import *
from pycisTopic.lda_models import * 
import anndata as ad
import scanpy as sc

## Creating the merged adata object

In [None]:
# Determine the folder in which the code is executed
WORKING_DIR = os.getcwd()
# Add the working directory to the module search path
sys.path.append(os.path.abspath( WORKING_DIR))

# Run the params codes
# `%run -i` executes each script inside the notebook's own namespace, so the
# names the scripts define stay available to the following cells.
# NOTE(review): later cells read PATH_TO_VELOCITY_FOLDER, PATH_TO_CELLDATA_CSV,
# CELL_TYPE_COLNAME and PATH_ANALYSIS_OUTPUT; presumably they are defined by
# these parameter scripts - confirm.
%run -i ../../globalParams.py #GlobalParams
%run -i ../../sampleParams.py #sampleParams
%run -i ./analysisParams.py #AnalysisParams

In [None]:
# Get a list of all directories in the specified path.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the sample order (and therefore the order of the cells once the samples
# are merged) identical from one run to the next.
directories = sorted(
    entry
    for entry in os.listdir(PATH_TO_VELOCITY_FOLDER)
    if os.path.isdir(os.path.join(PATH_TO_VELOCITY_FOLDER, entry))
)
print(directories)

In [None]:
import scanpy as sc

# Read the filtered feature-barcode matrix of the CSS1 sample on its own,
# using the gene symbols as variable names.
css1_matrix_dir = os.path.join(
    PATH_TO_VELOCITY_FOLDER, "CSS1/CSS1_filtered_feature_bc_matrix"
)
adata_CSS1 = sc.read_10x_mtx(css1_matrix_dir, var_names="gene_symbols")


In [None]:
import os
import scanpy as sc
import anndata as ad

# Maps each sample (directory) name to the AnnData object read from that
# sample's filtered feature-barcode matrix.
adata_dict = {}

for sample_name in directories:
    # Folder layout: <velocity folder>/<sample>/<sample>_filtered_feature_bc_matrix
    matrix_dir = os.path.join(
        PATH_TO_VELOCITY_FOLDER,
        f"{sample_name}/{sample_name}_filtered_feature_bc_matrix",
    )

    # Read the matrix with gene symbols as variable names
    sample_adata = sc.read_10x_mtx(matrix_dir, var_names="gene_symbols")

    # Record the sample of origin for every cell of this object
    sample_adata.obs['sample'] = sample_name

    adata_dict[sample_name] = sample_adata


In [None]:
# Concatenate the per-sample AnnData objects into a single object:
# - label='sample' writes the dictionary key of each object to obs['sample']
# - join='outer' keeps the union of the variables found across the samples
# - merge='same' keeps only the annotations that are identical in every object
# - index_unique="_" appends "_<key>" to each cell name so names stay unique
merged_adata = ad.concat(
    adata_dict,
    join='outer',
    label='sample',
    merge='same',
    index_unique="_",
)

# Print a summary of the merged object (dimensions and available annotations)
print(merged_adata)


In [None]:
# Example of cell names: display the observation index of the merged object.
# NOTE(review): given index_unique="_" in the concatenation, names are expected
# to look like "<barcode>_<sample>" - confirm on the displayed output.
merged_adata.obs_names

In [None]:
import pandas as pd
# Load the per-cell table from PATH_TO_CELLDATA_CSV; the first CSV column is
# used as the row index.
cell_data = pd.read_csv(PATH_TO_CELLDATA_CSV, index_col = 0)
# Display the loaded table
cell_data

In [None]:
# Display the row labels of cell_data as loaded from the CSV
cell_data.index

In [None]:
# Function to modify the cell_data.index format
def reformat_index(index):
    """Swap the sample ID and the cell barcode of every cell identifier.

    Each identifier is expected to look like ``<sample>_<barcode>``
    (e.g. ``'CSS1_AAACCGCGTGGATTAT-1'``) and is rewritten as
    ``<barcode>_<sample>`` (``'AAACCGCGTGGATTAT-1_CSS1'``).

    The identifier is split once, on its LAST underscore, so a sample ID that
    itself contains underscores (``'CSS_1_AAAC...-1'``) is kept whole instead
    of being truncated. This assumes the barcode contains no underscore, as in
    the example above.

    Parameters
    ----------
    index : iterable of str
        Cell identifiers to reformat (e.g. a pandas Index).

    Returns
    -------
    list of str
        The reformatted identifiers, in the same order as the input.

    Raises
    ------
    IndexError
        If an identifier contains no underscore.
    """
    reformatted = []
    for idx in index:
        # parts == [sample ID, barcode]; a single-element list (no underscore)
        # makes the lookup below raise IndexError.
        parts = idx.rsplit('_', 1)
        reformatted.append(f"{parts[1]}_{parts[0]}")
    return reformatted

# Apply the reformatting function to the index.
# NOTE: for "<sample>_<barcode>" identifiers the swap is its own inverse, so
# executing this cell a second time restores the original format - run it
# exactly once after loading cell_data.
cell_data.index = reformat_index(cell_data.index)

In [None]:
# Number of distinct cell names present both in merged_adata.obs_names and in
# cell_data.index
len(list(set(merged_adata.obs_names) & set(cell_data.index)))

In [None]:
# Keep intersection of adata and cell_data.
# A boolean mask is used rather than a Python set so the retained cells keep
# the order they have in merged_adata: iterating over a set of strings yields
# an order that can differ between Python sessions, which would make the cell
# order of the output non-reproducible.
shared_cells = merged_adata.obs_names.isin(cell_data.index)
adata = merged_adata[shared_cells].copy()

# Add the metadata: obs is replaced by the matching rows of cell_data, aligned
# on the cell names. Columns that were in obs before (such as 'sample') are
# dropped unless cell_data provides them.
adata.obs = cell_data.loc[adata.obs_names]

In [None]:
# Calculate QC
# Flag the genes whose symbol starts with "MT-" in var["mt"].
# NOTE(review): "MT-" is the human mitochondrial prefix; mouse symbols use
# "mt-" and would not match - confirm the species.
is_mt_gene = adata.var_names.str.startswith("MT-")
adata.var["mt"] = is_mt_gene

# Compute the QC metrics in place, with the "mt" flag as an extra gene set.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=["mt"],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Data normalization
# Store the un-normalised matrix in .raw before X is transformed.
# NOTE(review): depending on the anndata version the .raw setter may keep a
# reference to X instead of copying it; storing an explicit copy guarantees
# the in-place steps below cannot alter the saved values.
adata.raw = adata.copy()

# Scale every cell to a total of 1e4, then log-transform (log(1 + x))
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Flag the highly variable genes and keep only those.
# Subsetting returns a view of the parent object; it is materialised with
# .copy() so the in-place scaling below works on an independent object
# instead of triggering an implicit copy of the view.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var.highly_variable].copy()

# Scale each gene, clipping values above 10
sc.pp.scale(adata, max_value=10)

In [None]:
# Display the per-cell annotation table of the processed object
adata.obs

In [None]:
# PCA
# Compute the principal components, then plot them coloured by the annotation
# named in CELL_TYPE_COLNAME (presumably an obs column coming from cell_data -
# confirm against the parameter scripts).
sc.tl.pca(adata)
sc.pl.pca(adata, color = CELL_TYPE_COLNAME)

In [None]:
# Neighbors and UMAP
# Build the neighborhood graph, compute the UMAP embedding from it, then plot
# the embedding coloured by the annotation named in CELL_TYPE_COLNAME.
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.pl.umap(adata, color = CELL_TYPE_COLNAME)


In [None]:
# Save the processed AnnData object as "adata.h5ad" in the analysis output
# folder. The folder is created first so the write does not fail when it does
# not exist yet (no effect when it is already there).
os.makedirs(PATH_ANALYSIS_OUTPUT, exist_ok=True)
adata.write(os.path.join(PATH_ANALYSIS_OUTPUT , "adata.h5ad"))