In [None]:
# This script and its comments were written for the merged heart dataset.

# Import packages (make sure the correct environment is activated first)
from pathlib import Path

import pandas as pd
import scanpy as sc

# Logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
# Print the scanpy header (package version information) to the cell output.
sc.logging.print_header()
# Default look of every figure drawn by the cells below.
sc.settings.set_figure_params(
    dpi=80,
    facecolor="white",
)

# Path of the .h5ad file that later cells write the processed AnnData object to.
# Nothing is written here; this only defines the location.
results_file = "./outputs/heart_global_scanpy.h5ad"

# Create the output folder up front so the later write calls do not fail on a
# fresh checkout where ./outputs does not exist yet.
Path(results_file).parent.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the merged heart dataset from disk into an AnnData object.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another computer until it is made relative or configurable.
merged_heart_path = "/Users/srivalli/Desktop/SCA-Uni/Single-cell-data-analysis/Cardiac_cell_analysis/outputs/merged_heart.h5ad"
heart_global = sc.read_h5ad(merged_heart_path)

# Show the object summary as the cell output.
heart_global

In [None]:
# Display the per-cell (observation) annotation table of the loaded dataset.
heart_global.obs

In [None]:
# Display the per-gene (variable) annotation table of the loaded dataset.
heart_global.var

In [None]:
# De-duplicate gene names in place so that every entry of var_names is unique;
# later cells look genes up by name (e.g. color="CST3").
heart_global.var_names_make_unique()

In [None]:
# PREPROCESSING

# Plot the genes that account for the largest fraction of counts per cell,
# as a quick check for dominating genes before any filtering.
sc.pl.highest_expr_genes(heart_global)

In [None]:
# Basic quality filtering, applied in place:
#   1. drop cells in which fewer than 200 genes are detected;
#   2. drop genes that are detected in fewer than 3 cells.
min_genes_per_cell = 200
min_cells_per_gene = 3

sc.pp.filter_cells(heart_global, min_genes=min_genes_per_cell)
sc.pp.filter_genes(heart_global, min_cells=min_cells_per_gene)

In [None]:
# QC violin plots: genes detected per cell, total counts per cell and the
# percentage of counts that come from mitochondrial genes.
qc_columns = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]

# These columns are produced by sc.pp.calculate_qc_metrics, which is not called
# anywhere in the visible part of this notebook. Compute them only when the
# loaded file does not already provide them, so existing annotations are left
# untouched and the plot no longer fails with a KeyError on a fresh file.
if not set(qc_columns).issubset(heart_global.obs.columns):
    # assumes var_names are gene symbols whose mitochondrial genes start with
    # "MT-" (human) or "mt-" (mouse) — TODO confirm for this dataset
    heart_global.var["mt"] = heart_global.var_names.str.upper().str.startswith("MT-")
    sc.pp.calculate_qc_metrics(
        heart_global,
        qc_vars=["mt"],
        percent_top=None,
        log1p=False,
        inplace=True,
    )

sc.pl.violin(heart_global, qc_columns, jitter=0.4, multi_panel=True)

In [None]:
# QC scatter plots: total counts per cell against (1) the mitochondrial
# percentage and (2) the number of detected genes, drawn in that order.
for qc_metric in ("pct_counts_mt", "n_genes_by_counts"):
    sc.pl.scatter(heart_global, x="total_counts", y=qc_metric)

In [None]:
# Library-size normalisation to counts per million (every cell sums to 1e6),
# applied in place.
counts_per_million = 1e6
sc.pp.normalize_total(heart_global, target_sum=counts_per_million)

# Log-transform the normalised matrix as log2(x + 1).
sc.pp.log1p(heart_global, base=2)

In [None]:
# Flag highly variable genes using mean / dispersion cut-offs; the result is
# stored as annotations on heart_global (no genes are removed here).
sc.pp.highly_variable_genes(
    heart_global,
    min_mean=0.0125,
    max_mean=3,
    min_disp=0.5,
)

# Visual check of which genes were selected.
sc.pl.highly_variable_genes(heart_global)

In [None]:
# Scaling data
# NOTE: every processing step in this cell is commented out, i.e. intentionally
# disabled; the cell currently only displays the AnnData summary.

# (disabled) Keep only the columns (genes) flagged as highly variable
#heart_global = heart_global[:, heart_global.var.highly_variable]

# (disabled) Regress out unwanted sources of variation per cell.
# NOTE(review): no visible cell creates an "n_counts" column; the QC plots
# above use "total_counts" — confirm the key before re-enabling this line.
#sc.pp.regress_out(heart_global, ["n_counts", "pct_counts_mt"])

# (disabled) Scale each gene so that expression is comparable across genes
#sc.pp.scale(heart_global)

# Show the current state of the object
heart_global

In [None]:
## PRINCIPAL COMPONENT ANALYSIS ##

# Reduce dimensionality with PCA; the cell coordinates are stored in
# heart_global.obsm["X_pca"].
sc.tl.pca(heart_global, svd_solver="arpack")

# A bare expression in the middle of a cell is evaluated and silently thrown
# away, so the original `heart_global.obsm['X_pca']` line displayed nothing.
# Print the matrix shape (cells x components) explicitly instead.
print(heart_global.obsm["X_pca"].shape)

# Scatter plot of the first principal components.
sc.pl.pca(heart_global)

# Same plot, coloured by the expression of a single gene.
# NOTE(review): this raises an error if "CST3" is not among var_names —
# confirm it is a meaningful marker for this heart dataset.
sc.pl.pca(heart_global, color="CST3")

In [None]:
# Compute a t-SNE embedding of the cells.
# (The previous comment described choosing the number of PCs, which this call
# does not do.)
sc.tl.tsne(heart_global)

# Display the PCA coordinate matrix (one row per cell, one column per component)
heart_global.obsm['X_pca']

In [None]:
# Plot how much of the total variance each principal component explains
# (log scale); used to judge how many PCs are worth keeping.
sc.pl.pca_variance_ratio(heart_global, log=True)

# Save the current state of the object to results_file, then show its summary
heart_global.write(results_file)
heart_global

In [None]:
# COMPUTING THE NEIGHBOURHOOD GRAPH

# General method: build the nearest-neighbour graph of cells with default settings
sc.pp.neighbors(heart_global)
heart_global
# The n_neighbors and n_pcs parameters can be passed to control how many
# neighbours and how many principal components the graph is built from.

In [None]:
# Batch-balanced KNN (bbknn): rebuild the neighbourhood graph so that every
# cell gets neighbours from each batch, with batches taken from the
# "cell_source" column of heart_global.obs.
#
# bbknn modifies heart_global in place and, with the default copy=False,
# returns None — so the return value must not be assigned and displayed as if
# it were the corrected object.
sc.external.pp.bbknn(heart_global, batch_key="cell_source")

# Keep the name the original cell defined; it now refers to the updated object
# instead of None (it is an alias of heart_global, not a copy).
bbknn_data = heart_global

bbknn_data

In [None]:
# EMBEDDING THE NEIGHBOURHOOD GRAPH

# Assign cells to clusters with the Louvain algorithm (stored in obs["louvain"]).
sc.tl.louvain(heart_global)

# Partition-based graph abstraction: quantify connectivity between clusters.
# `groups` is given explicitly because, when it is omitted, paga picks
# obs["leiden"] in preference to obs["louvain"] if both exist — re-running this
# cell after the Leiden cell below would then silently use different clusters.
sc.tl.paga(heart_global, groups="louvain")
sc.pl.paga(heart_global)

# Compute the UMAP embedding of the neighbourhood graph.
sc.tl.umap(heart_global)

# Colour the UMAP by the expression of selected genes.
# NOTE(review): confirm these genes are present in var_names and relevant for
# heart tissue; the call fails if any of them is missing.
sc.pl.umap(heart_global, color=["CST3", "NKG7", "PPBP"])


In [None]:
## CLUSTERING THE NEIGHBOURHOOD GRAPH ##

# Leiden graph clustering (the recommended method). Settings are collected in
# one place so they are easy to tune; labels are stored in obs["leiden"].
leiden_settings = {
    "resolution": 0.9,
    "random_state": 0,
    "n_iterations": 2,
    "directed": False,
}
sc.tl.leiden(heart_global, **leiden_settings)

# UMAP coloured by Leiden cluster.
sc.pl.umap(heart_global, color=["leiden"])

# Cluster label of every cell.
heart_global.obs["leiden"]

In [None]:
# Save the object (now including the cluster labels) to results_file,
# overwriting the earlier save.
heart_global.write(results_file)

In [None]:
# FINDING MARKER GENES

# Rank genes per Leiden cluster with a t-test; the result is stored in
# heart_global.uns["rank_genes_groups"].
sc.tl.rank_genes_groups(heart_global, groupby="leiden", method="t-test")

# Plot the 25 top-ranked genes of every cluster, each panel with its own y-axis.
sc.pl.rank_genes_groups(
    heart_global,
    n_genes=25,
    sharey=False,
)

In [None]:
# Rank genes per Leiden cluster with the Wilcoxon rank-sum test; this replaces
# the t-test result in heart_global.uns["rank_genes_groups"].
sc.tl.rank_genes_groups(heart_global, groupby="leiden", method="wilcoxon")

# Plot the 25 top-ranked genes of every cluster, each panel with its own y-axis.
sc.pl.rank_genes_groups(
    heart_global,
    n_genes=25,
    sharey=False,
)

# Persist the object together with the Wilcoxon ranking.
heart_global.write(results_file)

In [None]:
# Rank genes per Leiden cluster with logistic regression (max_iter is passed
# on to the underlying solver). This overwrites the in-memory Wilcoxon result
# in heart_global.uns["rank_genes_groups"]; it is not saved to disk here.
sc.tl.rank_genes_groups(
    heart_global,
    groupby="leiden",
    method="logreg",
    max_iter=1000,
)
sc.pl.rank_genes_groups(
    heart_global,
    n_genes=25,
    sharey=False,
)

In [None]:
# Table of ranked gene names, one column per cluster, taken from the most
# recent rank_genes_groups result held in memory.
pd.DataFrame(heart_global.uns["rank_genes_groups"]["names"])

# Append .head(10) to show only the top 10 genes per cluster

In [None]:
# Build a table with the top genes and their p-values for every cluster.

# Reload the saved object: when the cells are run in order, the file on disk
# was last written right after the Wilcoxon ranking, so it carries the
# "pvals" field used below (the later logistic-regression run was never saved).
heart_global = sc.read(results_file)
result = heart_global.uns["rank_genes_groups"]
groups = result["names"].dtype.names  # cluster labels

# Two columns per cluster: "<cluster>_n" (gene names) and "<cluster>_p"
# (p-values); only the 5 top-ranked genes are kept.
scores = pd.DataFrame(
    {
        f"{group}_{key[:1]}": result[key][group]
        for group in groups
        for key in ["names", "pvals"]
    }
).head(5)

# pandas DataFrames have no .write() method (the original call raised
# AttributeError); to_csv is the CSV writer. The file goes to the same
# relative ./outputs folder as results_file rather than the filesystem root.
scores.to_csv("./outputs/scores.csv")

# A low p-value indicates that the gene is likely differentially expressed in
# that cluster compared with the others.
scores