# Processing, clustering and visualising the TAURUS data in MDV

#### Importing the required packages for data preprocessing

In [15]:
import numpy as np
import pandas as pd
import gc

In [16]:
import scanpy as sc

#### Importing the required packages for MDV set up and visualisation

In [17]:
import os
from mdvtools.mdvproject import MDVProject
from mdvtools.charts.dot_plot import DotPlot
from mdvtools.charts.scatter_plot import ScatterPlot
from mdvtools.charts.box_plot import BoxPlot
from mdvtools.charts.stacked_row_plot import StackedRowChart

## Data analysis section

In [18]:
# scanpy feedback level: errors (0), warnings (1), info (2), hints (3)
SCANPY_VERBOSITY = 3
sc.settings.verbosity = SCANPY_VERBOSITY

# print the versions of scanpy and of the main libraries it relies on
sc.logging.print_header()

scanpy==1.10.1 anndata==0.9.2 umap==0.5.6 numpy==1.26.4 scipy==1.13.0 pandas==2.2.2 scikit-learn==1.4.2 statsmodels==0.14.2 igraph==0.11.4 pynndescent==0.5.12


In [19]:
# Alternative B cell input (disabled):
# adata_bcell = sc.read_h5ad("../../../../../Documents/TAURUS_data/bcell_viz_ready_revised.h5ad",)

# directory that holds one "<name>_final.h5ad" file per sub bucket
SUB_BUCKET_DIR = '../../../../../../mariak/anndata_obj/sub_buckets'


def _read_sub_bucket(file_stem):
    """Read `<file_stem>_final.h5ad` from SUB_BUCKET_DIR and return the loaded AnnData object."""
    return sc.read_h5ad(f'{SUB_BUCKET_DIR}/{file_stem}_final.h5ad')


bcell_data = _read_sub_bucket('bcells')
cd4tcell_data = _read_sub_bucket('cd4tcells')
cd8tcell_data = _read_sub_bucket('cd8tcells')
epithelium_colonic_data = _read_sub_bucket('epicolonic')
endothelial_data = _read_sub_bucket('vasc')
epi_ileal_data = _read_sub_bucket('ilealepi')
fib_peri_data = _read_sub_bucket('fibperi')
ilc_data = _read_sub_bucket('ilc')
myeloid_data = _read_sub_bucket('myeloid')
plasmacell_data = _read_sub_bucket('plasmacells')

In [20]:
# Total-count normalise every dataset to 10,000 counts per cell and then
# log1p-transform it. The return values are discarded, i.e. the objects are
# updated in place, so re-running this cell would transform the data again.
for compartment_adata in (
    bcell_data,
    cd4tcell_data,
    cd8tcell_data,
    epithelium_colonic_data,
    endothelial_data,
    epi_ileal_data,
    fib_peri_data,
    ilc_data,
    myeloid_data,
    plasmacell_data,
):
    sc.pp.normalize_total(compartment_adata, target_sum=1e4)
    sc.pp.log1p(compartment_adata)

# drop the loop variable so it does not keep an extra reference to the last dataset
del compartment_adata

normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:02)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)


In [21]:
# sc.pp.subsample(bcell_data, 0.1)#0.05)
# sc.pp.subsample(cd4tcell_data, 0.1)#0.05)
# sc.pp.subsample(cd8tcell_data, 0.1)#0.05)
# sc.pp.subsample(epithelium_colonic_data, 0.1)#0.05)
# sc.pp.subsample(endothelial_data, 0.1)#0.05)
# sc.pp.subsample(epi_ileal_data, 0.1)#0.05)
# sc.pp.subsample(fib_peri_data, 0.1)#0.05)
# sc.pp.subsample(ilc_data, 0.1)#0.05)
# sc.pp.subsample(myeloid_data, 0.1)#0.05)
# sc.pp.subsample(plasmacell_data, 0.1)#0.05)

In [22]:
# B cells
# Keep a copy of the current expression matrix (bcell_data.X) in a separate layer.
# NOTE(review): bcell_data.X has already gone through normalize_total and log1p in
# the normalisation cell above, so despite its name the "counts" layer holds the
# log-normalised values rather than raw counts — confirm this is intended.
bcell_data.layers["counts"] = bcell_data.X.copy()
# The per-dataset normalisation below is disabled here because it is already
# applied to all datasets in the normalisation cell above.
# total-count normalising the anndata object to 10,000 reads per cell, so that counts become comparable among cells
#sc.pp.normalize_total(bcell_data, target_sum=1e4)
# logarithmising the data
#sc.pp.log1p(bcell_data)

In [23]:
## B CELLS
# cells dataframe
# Work on an explicit copy of .obs so that the columns added below are never
# written back into the AnnData object and the cell can be re-run safely.
cells_df_bcell = bcell_data.obs.copy()

# adding the umap data and the cell identifier to the cells dataframe
cells_df_bcell["UMAP 1"] = np.array(bcell_data.obsm["X_umap"])[:, 0]
cells_df_bcell["UMAP 2"] = np.array(bcell_data.obsm["X_umap"])[:, 1]
cells_df_bcell["Cell ID"] = bcell_data.obs.index

# giving the metadata columns human-readable names (names missing from the
# dataframe are silently ignored by rename)
cells_df_bcell = cells_df_bcell.rename(
    columns={
        "sub_bucket": "Cell type",
        "final_analysis": "Cell state",
        "MM_scaled": "Inflammation score",
        "sample_id": "Sample ID",
    }
)

# keeping only the columns that are exported to MDV, selected by position
# NOTE(review): positional selection depends on the exact column order of the
# input .obs plus the three columns appended above — confirm the indices still
# match whenever the input file changes.
cells_df_bcell = cells_df_bcell.iloc[:, [0,1,2,3,4,6,10,33,37,38,39,40]]

# genes dataframe
# copied so that adding "gene_id" does not modify bcell_data.var
gene_table_bcell = bcell_data.var.copy()
gene_table_bcell["gene_id"] = gene_table_bcell.index

In [24]:
## B CELLS

# Cells charts
# stacked row chart of "Patient" against "Cell state" (the abundance plot)
stacked_row_chart_bcell = StackedRowChart(
    title="Abundance plot",
    params=["Patient", "Cell state"],
    position=[930, 10],
    size=[580, 450]
)
# configuring the row chart
stacked_row_chart_bcell.set_axis_properties("x", {"textSize": 13, "label": "", "tickfont": 10})
#stacked_row_chart_bcell.set_color_legend(display= True, params=[10,10])


# gene marker that is only referenced by the disabled, static-column variants
# of `params` / `set_color_by` kept in the comments below
gene_name = "HSPE1"

# dot plot of gene expression per cell state; the genes come from a
# RowsAsColsQuery on the linked "B genes" datasource (maxItems=10)
dot_plot_bcell = DotPlot(
    title="Percent of gene expression per cell state",
    #params=["Cell state", f"Gene expression|{gene_name}(Gene expression)|{gene_table_bcell.index.get_loc(gene_name)}"],
    params=["Cell state", {"linkedDsName":"B genes","maxItems":10,"type":"RowsAsColsQuery"}],
    size=[450, 300],
    position=[10, 470]
)

# configuring the dot plot
dot_plot_bcell.set_axis_properties("x", {"label": "", "textSize": 13, "tickfont": 10})
dot_plot_bcell.set_axis_properties("y", {"label": "", "textSize": 13, "tickfont": 10})
dot_plot_bcell.set_axis_properties("ry", {"label": "", "textSize": 13, "tickfont": 10})
dot_plot_bcell.set_color_scale(log_scale=False)
dot_plot_bcell.set_color_legend(True, [40, 10])
dot_plot_bcell.set_fraction_legend(True, [140, 10])


# box plot of gene expression per cell state; the gene comes from a
# RowsAsColsQuery on the linked "B genes" datasource (maxItems=1)
box_plot_bcell = BoxPlot(
    title="Gene expression per cell state",
    #params=["Cell state", f"Gene expression|{gene_name}(Gene expression)|{gene_table_bcell.index.get_loc(gene_name)}"],
    params=["Cell state", {"linkedDsName":"B genes","maxItems":1,"type":"RowsAsColsQuery"}],
    size=[650, 300],
    position=[470, 470]
)

# 2D scatter plot of the two UMAP columns, coloured by cell state
scatter_plot_bcell = ScatterPlot(
    title="B cells",
    params=["UMAP 1", "UMAP 2"],
    size=[450, 450],
    position=[10, 10],
    default_color="#377eb8",
    brush="default",
    on_filter="hide",
    radius=5,
    opacity=0.8,
)

# configuring the scatter plot
scatter_plot_bcell.set_color_by("Cell state")


# second 2D scatter plot of the same UMAP columns, coloured through a
# RowsAsColsQuery on the linked "B genes" datasource (maxItems=1)
scatter_plot2_bcell = ScatterPlot(
    title="B cells",
    params=["UMAP 1", "UMAP 2"],
    size=[450, 450],
    position=[470, 10],
    default_color="#377eb8",
    brush="default",
    on_filter="hide",
    radius=5,
    opacity=0.8,
    color_by={"linkedDsName":"B genes","maxItems":1,"type":"RowsAsColsQuery"}
)

# disabled alternatives for colouring the second scatter plot
#scatter_plot2_bcell.set_color_by(f"Gene expression|{gene_name}(Gene expression)|{gene_table_bcell.index.get_loc(gene_name)}")
#scatter_plot2_bcell.set_color_by{"linkedDsName":"genes","maxItems":1,"type":"RowsAsColsQuery"}

In [25]:
## cd8T CELLS
# NOTE(review): in the cells visible here no "CD8T genes" datasource is added to
# the project and none of these charts is placed in a view — confirm whether the
# CD8 T section is still work in progress.

# Cells charts
# stacked row chart of "Patient" against "Cell state" (the abundance plot)
stacked_row_chart_cd8t = StackedRowChart(
    title="Abundance plot",
    params=["Patient", "Cell state"],
    position=[930, 10],
    size=[580, 450]
)
# configuring the row chart
stacked_row_chart_cd8t.set_axis_properties("x", {"textSize": 13, "label": "", "tickfont": 10})
#stacked_row_chart_cd8t.set_color_legend(display= True, params=[10,10])


# gene marker that is only referenced by the disabled, static-column variants
# of `params` / `set_color_by` kept in the comments below
gene_name = "HSPE1"

# dot plot of gene expression per cell state; the genes come from a
# RowsAsColsQuery on the linked "CD8T genes" datasource (maxItems=10)
dot_plot_cd8t = DotPlot(
    title="Percent of gene expression per cell state",
    #params=["Cell state", f"Gene expression|{gene_name}(Gene expression)|{gene_table_cd8t.index.get_loc(gene_name)}"],
    params=["Cell state", {"linkedDsName":"CD8T genes","maxItems":10,"type":"RowsAsColsQuery"}],
    size=[450, 300],
    position=[10, 470]
)

# configuring the dot plot
dot_plot_cd8t.set_axis_properties("x", {"label": "", "textSize": 13, "tickfont": 10})
dot_plot_cd8t.set_axis_properties("y", {"label": "", "textSize": 13, "tickfont": 10})
dot_plot_cd8t.set_axis_properties("ry", {"label": "", "textSize": 13, "tickfont": 10})
dot_plot_cd8t.set_color_scale(log_scale=False)
dot_plot_cd8t.set_color_legend(True, [40, 10])
dot_plot_cd8t.set_fraction_legend(True, [140, 10])


# box plot of gene expression per cell state; the gene comes from a
# RowsAsColsQuery on the linked "CD8T genes" datasource (maxItems=1)
box_plot_cd8t = BoxPlot(
    title="Gene expression per cell state",
    #params=["Cell state", f"Gene expression|{gene_name}(Gene expression)|{gene_table_cd8t.index.get_loc(gene_name)}"],
    params=["Cell state", {"linkedDsName":"CD8T genes","maxItems":1,"type":"RowsAsColsQuery"}],
    size=[650, 300],
    position=[470, 470]
)

# 2D scatter plot of the two UMAP columns, coloured by cell state
scatter_plot_cd8t = ScatterPlot(
    title="CD8+ T innate T NK cells",
    params=["UMAP 1", "UMAP 2"],
    size=[450, 450],
    position=[10, 10],
    default_color="#377eb8",
    brush="default",
    on_filter="hide",
    radius=5,
    opacity=0.8,
)

# configuring the scatter plot
scatter_plot_cd8t.set_color_by("Cell state")

# second 2D scatter plot of the same UMAP columns, coloured through a
# RowsAsColsQuery on the linked "CD8T genes" datasource (maxItems=1)
scatter_plot2_cd8t = ScatterPlot(
    title="CD8+ T innate T NK cells",
    params=["UMAP 1", "UMAP 2"],
    size=[450, 450],
    position=[470, 10],
    default_color="#377eb8",
    brush="default",
    on_filter="hide",
    radius=5,
    opacity=0.8,
    color_by={"linkedDsName":"CD8T genes","maxItems":1,"type":"RowsAsColsQuery"},
)

# disabled alternative for colouring the second scatter plot
# scatter_plot2_cd8t.set_color_by(f"Gene expression|{gene_name}(Gene expression)|{gene_table_cd8t.index.get_loc(gene_name)}")

In [26]:
# creating the MDV project
# the project metadata is stored in the "taurus_new_14" folder inside ~/mdv;
# delete_existing=True is passed so the project is created from scratch
base = os.path.expanduser('~/mdv')
project_path = os.path.join(base, 'taurus_new_14')
p = MDVProject(project_path, delete_existing=True)

# registering the two B cell datasources: the cell table and the gene table
p.add_datasource("B cells", cells_df_bcell)
p.add_datasource("B genes", gene_table_bcell)

# the dataframes are no longer needed once they are stored in the project
del cells_df_bcell, gene_table_bcell
gc.collect()

starting add_datasource
is ds None? None
got passed the ds check
created h5 group without error
- adding column 'Sample ID' to datasource 'B cells'
- adding column 'Patient' to datasource 'B cells'
- adding column 'Disease' to datasource 'B cells'
- adding column 'Site' to datasource 'B cells'
- adding column 'Treatment' to datasource 'B cells'
- adding column 'Inflammation' to datasource 'B cells'
- adding column 'Inflammation_score' to datasource 'B cells'
- adding column 'Cell state' to datasource 'B cells'
- adding column 'bucket' to datasource 'B cells'
- adding column 'Remission_status' to datasource 'B cells'
- adding column 'UMAP 1' to datasource 'B cells'
- adding column 'UMAP 2' to datasource 'B cells'
 - non-dodgy columns: [{'datatype': 'text', 'name': 'Sample ID', 'field': 'Sample ID', 'values': ['CID003355-1', 'CID003386-1', 'CID004751-1', 'CID004712-1', 'CID006559-1', 'CID005708-1', 'CID005709-1', 'CID003669-1', 'CID006565-1', 'CID006563-1', 'CID004730-1', 'CID003670-1', 

4793

In [27]:
# linking the gene table to the cell table through "gene_id", so that a subset of
# genes can be selected and their expression added to the cells
p.add_rows_as_columns_link("B cells", "B genes", "gene_id", "Gene expression")

# dense version of the matrix stored in the "counts" layer
# NOTE(review): that layer was filled from bcell_data.X after normalize_total and
# log1p had been applied, so these look like log-normalised values — confirm
bcell_expression = bcell_data.layers["counts"].toarray()
p.add_rows_as_columns_subgroup("B cells", "B genes", "Gene expression", bcell_expression)

# releasing the AnnData object and the dense matrix
del bcell_data, bcell_expression
gc.collect()

691

In [28]:
# chart lists of the other datasets; they stay empty in this cell
list_charts_cells_cd4t = []
list_charts_cells_cd8t = []
list_charts_cells_col_epi = []
list_charts_cells_endo = []
list_charts_cells_fib = []
list_charts_cells_ileal = []
list_charts_cells_ilc = []
list_charts_cells_myel = []
list_charts_cells_plasma = []

# cells panel: the plot_data of every B cell chart, in display order
list_charts_cells_bcell = [
    stacked_row_chart_bcell.plot_data,
    dot_plot_bcell.plot_data,
    box_plot_bcell.plot_data,
    scatter_plot_bcell.plot_data,
    scatter_plot2_bcell.plot_data,
]

# view configuration: the charts go on the "B cells" panel, the "B genes" panel has none
view_config_bcell = {
    'initialCharts': {
        "B cells": list_charts_cells_bcell,
        "B genes": [],
    }
}

# registering the view in the project and making the project editable
p.set_view("B cells", view_config_bcell)
p.set_editable(True)

In set_view
B cells


In [29]:
# Disabled alternative: open a different project folder ('taurus_new_3') with
# delete_existing=False instead of using the project `p` created above.
# # setting up and serving the MDV project
# base = os.path.expanduser('~/mdv')
# project_path = os.path.join(base, 'taurus_new_3') # defining the location where the project metadata will be stored
# p = MDVProject(os.path.expanduser(project_path), delete_existing=False)


# serving the project
# NOTE(review): the captured output shows a Flask server on port 5050 that runs
# until CTRL+C, so this call presumably blocks the kernel while serving — confirm.
p.serve()


created Flask <Flask 'mdvtools.server'>
 * Serving Flask app 'mdvtools.server'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5050
 * Running on http://192.168.10.101:5050
Press CTRL+C to quit
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /static/js/mdv.js HTTP/1.1" 200 -


recieved request to project_index


127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /static/assets/mdv.css HTTP/1.1" 304 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /static/assets/ExpandMore--uyez2mu.js HTTP/1.1" 200 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /static/assets/datasourceWorker-BDVgcx14.js HTTP/1.1" 304 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /datasources.json HTTP/1.1" 200 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /state.json HTTP/1.1" 200 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /views.json HTTP/1.1" 200 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "POST /get_view HTTP/1.1" 200 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /static/assets/filteredIndexWorker-CEl1713S.js HTTP/1.1" 304 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /static/assets/filteredIndexWorker-CEl1713S.js HTTP/1.1" 304 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /static/img/fa-solid-900.woff2 HTTP/1.1" 304 -
127.0.0.1 - - [10/Mar/2025 15:46:59] "GET /static/img/roboto-latin-400-normal.woff2 HTTP/1.1" 304 -
127.0.0.1 - - [10/Mar/2025 15:46:59] 

Access level required: editable
In set_view
default


127.0.0.1 - - [10/Mar/2025 15:48:28] "GET /static/img/roboto-latin-500-normal.woff2 HTTP/1.1" 304 -


In [None]:
#p.convert_to_static_page("../../../../../Desktop/TAURUS_static_new_5/")