In [None]:
# Reload imported modules automatically before each cell executes,
# so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
# Add the project repository to the module search path
# (presumably where the `io_utils` module imported below lives — confirm).
repo_dir = '/home/labs/amit/noamsh/repos/MM_2023'
sys.path.append(repo_dir)

In [None]:
from pathlib import Path
from omegaconf import OmegaConf

import anndata as ad
import pandas as pd
import scanpy as sc

import pyreadr
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Notebook-wide figure size (matplotlib's default) and the project configuration.
plt.rcParams.update({'figure.figsize': [6.4, 4.8]})  # default [6.4, 4.8]
config_path = Path(repo_dir) / 'config.yaml'
conf = OmegaConf.load(config_path)

## Data loading

In [None]:
from io_utils import generate_path_in_output_dir

In [None]:
# Version and date stamp identifying which saved annotation output to load.
load_ts_iso = "2024-05-21" # "2024-05-19"
data_version = "20240519"  # "20240515"

adata = ad.read_h5ad(generate_path_in_output_dir(conf, conf.outputs.inferred_missing_annotation_file_name,
                                                 with_version=data_version, with_date_timestamp=load_ts_iso))
# Keep only CD45-gated cells. The subset is materialised with `.copy()` because later
# cells add columns to `adata.obs`; writing to an AnnData view forces an implicit copy
# and raises an ImplicitModificationWarning.
adata = adata[adata.obs['Gating'] == "CD45"].copy()
adata

In [None]:
# Load the unprocessed counts for the same data version.
raw_adata = ad.read_h5ad(generate_path_in_output_dir(conf, conf.outputs.loaded_adata_file_name, with_version=data_version))
# Keep only CD45-gated cells. `.copy()` turns the view into a real AnnData so the
# in-place QC computation below writes to an actual object rather than a view.
raw_adata = raw_adata[raw_adata.obs['Gating'] == "CD45"].copy()
# Adds per-cell QC columns (e.g. `total_counts`, used by later cells) to `raw_adata.obs`.
sc.pp.calculate_qc_metrics(raw_adata, percent_top=None, log1p=False, inplace=True)
raw_adata

In [None]:
# Names of the `adata.obs` columns used throughout the notebook.
label_col = "Populations"
super_pop = "super_Population"
sample_col = 'Hospital.Code'
time_col = "Time"
method_col = 'Method'

# Name of the per-sample identifier column: the three column names joined by "|".
p_id_col_name = "|".join([sample_col, time_col, method_col])

all_cols = [sample_col, time_col, method_col, label_col]

# Cells gated as CD45 whose super-population is also CD45; rows with any missing
# identifier or label value are dropped.
is_cd45_cell = (adata.obs["Gating"] == "CD45") & (adata.obs[super_pop] == "CD45")
comp_df = adata.obs.loc[is_cd45_cell, all_cols].dropna()
comp_df.shape


In [None]:
# Build the per-sample identifier "<sample>|<time>|<method>" for every cell.
# This is a row-wise apply, so it is slow on large tables.
comp_df[p_id_col_name] = comp_df.apply(lambda row: f"{row[sample_col]}|{row[time_col]}|{row[method_col]}", axis=1)
# Keep only the identifier and the population label.
# NOTE: this drops the source columns, so this cell cannot be re-run without first rebuilding `comp_df`.
comp_df = comp_df[[p_id_col_name, label_col]]
comp_df.head()

In [None]:
# Add the same "<sample>|<time>|<method>" identifier to the processed and the raw data,
# so both can be grouped per sample with one key.
adata.obs[p_id_col_name] = adata.obs.apply(lambda row: f"{row[sample_col]}|{row[time_col]}|{row[method_col]}", axis=1)
raw_adata.obs[p_id_col_name] = raw_adata.obs.apply(lambda row: f"{row[sample_col]}|{row[time_col]}|{row[method_col]}", axis=1)
# Number of processed cells per sample id.
adata.obs[p_id_col_name].value_counts()

In [None]:
# Count cells per (sample id, population). `reset_index` turns the cell index into
# a regular column, which is the column `count` tallies.
population_counts = comp_df.reset_index().groupby([p_id_col_name, label_col]).count()
# Pivot the population level into columns: one row per sample id.
comp_table = population_counts.unstack(level=-1)
# Drop the column level whose name is None (left over from the counted column),
# keeping only the population names as column labels.
comp_table = comp_table.droplevel(None, axis=1)
# Total number of counted cells per sample id.
comp_table["TOTAL"] = comp_table.sum(axis=1)
comp_table

## QC at sample level

In [None]:
# Sequencing batch ids "SB190" .. "SB219" that are treated as bad for SPID.
bad_SPID_plates_sb = {f"SB{i}" for i in range(190, 220)}

In [None]:
# Flag cells that were processed with SPID and sequenced in one of the bad batches.
# `isin` is vectorised and always returns a boolean Series; an element-wise `apply`
# can return a non-boolean dtype (e.g. on a categorical column), which breaks the `&`.
adata.obs['from_bad_SB'] = adata.obs['Seq.Batch.ID'].isin(bad_SPID_plates_sb) & (adata.obs['Method'] == "SPID")

In [None]:
# Per sample id, take the most frequent `from_bad_SB` flag and method ...
# NOTE(review): `pd.Series.mode` returns several values when there is a tie — presumably rare here; verify.
df = adata.obs[[p_id_col_name,'from_bad_SB', 'Method']].groupby(p_id_col_name).agg(pd.Series.mode)
# ... then count how many sample ids fall in each (flag, method) combination.
df = df.reset_index().groupby(['from_bad_SB', 'Method']).count().reset_index()
fig = px.bar(df, x='from_bad_SB',y=p_id_col_name, color='Method', height=400, title="from bad spid seq batches")
fig.show()

In [None]:
# Total number of sample ids across all (flag, method) groups.
df[p_id_col_name].sum()

In [None]:
# Store the flag as integers (0/1); a later cell averages it per sample.
adata.obs['from_bad_SB'] = adata.obs['from_bad_SB'].astype(int)
# Keep a full copy containing every method before restricting `adata` to SPID cells.
# NOTE: re-running this cell after the filter below would make the copy SPID-only as well.
adata_both_methods = adata.copy()
adata = adata[adata.obs["Method"]=="SPID"]

In [None]:
# List the population columns available in the composition table.
comp_table.columns

In [None]:
# Population columns of `comp_table` that are summed into two groups.
mye = ['Mf', 'Mo', 'Mo_CD16', 'Mo_Pro', 'DC', 'DC_IRF8']
T = ['T_Effector', 'T_Effector_GZMB', 'T_Naive', 'NK']
# Share of each sample's counted cells that belongs to each group.
total_cells = comp_table["TOTAL"]
mye_proportions = comp_table[mye].sum(axis=1).div(total_cells)
T_proportions = comp_table[T].sum(axis=1).div(total_cells)

In [None]:
# Per-sample QC summary of the cells currently in `adata`: number of plates, number of
# cells, fraction of cells from bad sequencing batches, cells recovered relative to
# plate capacity, and the two group proportions from the composition table.
cells_per_plate = 380  # NOTE(review): presumably the number of usable wells per plate — confirm
missing = float("nan")
p_data = []
for p_id, group in adata.obs[['from_bad_SB', "Amp.Batch.ID", p_id_col_name]].groupby(p_id_col_name):
    n_plates = group["Amp.Batch.ID"].nunique()
    n_cells = len(group)
    # `nunique` ignores missing batch ids, so a sample without any batch id has zero plates.
    good_cell_prop = n_cells / (n_plates * cells_per_plate) if n_plates else missing
    proporion_from_bad_plates = group['from_bad_SB'].mean()
    # The composition table is built from a stricter cell filter (and drops rows with
    # missing values), so a sample id may be absent from it: record NaN instead of
    # aborting the whole loop with a KeyError.
    mye_proportion = mye_proportions.get(p_id, missing)
    T_proportion = T_proportions.get(p_id, missing)
    p_data.append((p_id, n_plates, n_cells, proporion_from_bad_plates, good_cell_prop, mye_proportion, T_proportion))
pdf = pd.DataFrame(p_data, columns=("p_id", "n_plates", "n_cells", "proporion_from_bad_plates", "good_cell_prop", "mye_proportion", "T_proportion"))

In [None]:

# Recovered-cell proportion against each group proportion: one scatter (with an OLS
# trend line) per group, coloured by the share of cells from bad sequencing batches.
for proportion_col in ("mye_proportion", "T_proportion"):
    fig = px.scatter(
        pdf,
        x=proportion_col,
        y="good_cell_prop",
        color="proporion_from_bad_plates",
        title="SPID patients",
        hover_data=['n_plates', 'p_id'],
        trendline="ols",
        width=800,
        height=400,
    )
    fig.show()

In [None]:
# adata.obs[['from_bad_SB', p_id_col_name]].groupby(p_id_col_name).value_counts(normalize=True).value_counts()
# Histogram of the number of cells per sample id (counted through the `from_bad_SB` column).
adata.obs[['from_bad_SB', p_id_col_name]].groupby(p_id_col_name).count().hist(bins=100)

### simulation and violin

In [None]:
# R data file holding the sample ordering; later cells read the sample ids from its `value` column.
order_path = "/home/labs/amit/shuangyi/Project_MM3/Atlas/Notebooks/pct_MM3_CD45_PID_order.Rds"

# The result of `read_r` is indexed by object name; the frame in this file is stored under the key `None`.
order_result = pyreadr.read_r(order_path)
order_df = order_result[None]
order_df.head()

In [None]:
# Restrict both datasets to the sample ids listed in the ordering file.
# `isin` builds the lookup once, instead of scanning the whole id array for every cell
# as the element-wise `x in values` test did.
ordered_sample_ids = order_df['value']
adata = adata[adata.obs[p_id_col_name].isin(ordered_sample_ids)]
raw_adata = raw_adata[raw_adata.obs[p_id_col_name].isin(ordered_sample_ids)]
adata

In [None]:
from datetime import date

# Figures go to a per-day folder: <output_dir>/figures/<today's date in ISO format>.
ts_iso = date.today().isoformat()
figures_dir = Path(conf.outputs.output_dir, "figures", ts_iso)
# Create the folder up front: saving a figure into a missing directory fails.
figures_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Show where today's figures are written.
figures_dir

In [None]:
def simulate_goodcells_ratio(thresh, adata=None, group_col=None):
    """Per-sample fraction of cells whose `total_counts` is at least `thresh`.

    Parameters
    ----------
    thresh : number
        Minimum `total_counts` a cell must have to be counted as kept.
    adata : AnnData-like, optional
        Object exposing an `.obs` data frame and supporting boolean-mask indexing.
        Defaults to the notebook-level `raw_adata`.
    group_col : str, optional
        Name of the `.obs` column identifying the sample.
        Defaults to the notebook-level `p_id_col_name`.

    Returns
    -------
    pandas.Series
        Ratio per sample id, named ``good_cells_ratio_<thresh>``. Cells are counted
        through their non-null `Time` values, in both numerator and denominator.
    """
    if adata is None:
        adata = raw_adata
    if group_col is None:
        group_col = p_id_col_name

    cells_per_sample = adata.obs.groupby(group_col).Time.count()
    adata_thresh = adata[adata.obs['total_counts'] >= thresh]
    kept_per_sample = adata_thresh.obs.groupby(group_col).Time.count()
    # A sample with no cell above the threshold is missing from the filtered counts.
    # Align to the full sample list so its ratio is 0 rather than NaN.
    kept_per_sample = kept_per_sample.reindex(cells_per_sample.index, fill_value=0)

    good_cells_ratio = kept_per_sample / cells_per_sample
    return good_cells_ratio.rename(f"good_cells_ratio_{thresh}")

In [None]:
# Additional count thresholds to simulate on the raw data.
tresholds_to_simulate = [200, 300, 700]
# Ratio actually obtained: cells present in the processed `adata` over cells in `raw_adata`, per sample id.
# NOTE(review): `adata` was restricted to SPID cells earlier while `raw_adata` was not, so
# sample ids of other methods (if any remain) get NaN here — confirm this is intended.
cur_good_cells_ratio = adata.obs.groupby(p_id_col_name).Time.count() / raw_adata.obs.groupby(p_id_col_name).Time.count()
# NOTE(review): the "400" label presumably reflects the count threshold used upstream to produce `adata` — confirm.
cur_good_cells_ratio = cur_good_cells_ratio.rename("good_cells_ratio_400")
sim_ratios = [simulate_goodcells_ratio(thresh) for thresh in tresholds_to_simulate]

In [None]:
# One column per threshold, rows arranged in the order given by the ordering file,
# with the sample id moved from the index back into a regular column.
ratio_columns = [cur_good_cells_ratio, *sim_ratios]
df = (
    pd.concat(ratio_columns, axis=1)
    .reindex(list(order_df['value']))
    .reset_index()
)
df

In [None]:
# Save the current per-sample ratio, lowest first. The target is derived from `repo_dir`
# (it resolves to the same location as before) so the repository path lives in one place.
good_cells_ratio_path = Path(repo_dir, "outputs", "good_cells_ratio.csv")
df.set_index(p_id_col_name)['good_cells_ratio_400'].sort_values().to_csv(good_cells_ratio_path)

In [None]:
import matplotlib.pyplot as pyplt  # alias kept in case other cells rely on it

# Line plot of every ratio column across samples, in the ordering-file order.
# The wide size is given to this figure only, so the global rcParams default set at
# the top of the notebook is no longer overwritten for every later plot.
fig, ax = plt.subplots(figsize=(40, 6))
for col in df.columns:
    if "good_cells_ratio" in col:
        ax.plot(df[p_id_col_name], df[col], label=col)

ax.set(xlabel=p_id_col_name, ylabel='good_cells_ratio')
# Rotate the tick labels the axis already has, instead of re-assigning label text
# without fixing the tick positions.
ax.tick_params(axis="x", labelrotation=90)
ax.grid()
ax.legend()

# fig.savefig("test.png")
plt.show()


In [None]:
# Share of samples whose ratio reaches 0.6, for each threshold column.
# Missing ratios compare as False, so they count as not reaching the cut-off.
# The id column is referenced through `p_id_col_name`, like the CSV export above,
# rather than through a second hard-coded copy of its name.
cut_df = df.set_index(p_id_col_name)
(cut_df >= 0.6).mean()


In [None]:
# Violin plot of total counts per sample for the processed data, saved as a PDF.
with plt.rc_context():  # rcParams changed inside this block are reverted on exit
    sc.set_figure_params(figsize=[40, 6], dpi=300, fontsize=12)
    sc.pl.violin(
        adata,
        keys='total_counts',
        groupby=p_id_col_name,
        rotation=90,
        log=True,
        order=list(order_df['value']),
        show=False,
    )
    plt.savefig(figures_dir / "pp_data_counts_per_patient.pdf", bbox_inches="tight", format="pdf")

In [None]:
# Violin plot of total counts per sample for the raw data, saved as a PDF.
with plt.rc_context():  # rcParams changed inside this block are reverted on exit
    sc.set_figure_params(figsize=[40, 6], dpi=300, fontsize=12)
    sc.pl.violin(
        raw_adata,
        keys='total_counts',
        groupby=p_id_col_name,
        rotation=90,
        log=True,
        order=list(order_df['value']),
        show=False,
    )
    plt.savefig(figures_dir / "raw_data_counts_per_patient.pdf", bbox_inches="tight", format="pdf")

# Save organized cell type counts for analysis

In [None]:
# Cells recovered per sample relative to plate capacity (number of plates * 380),
# computed on the copy that still contains every method.
obs_for_plates = adata_both_methods.obs[['from_bad_SB', "Amp.Batch.ID", p_id_col_name]]
good_cell_prop_dict = {
    p_id: len(group) / (group["Amp.Batch.ID"].nunique() * 380)
    for p_id, group in obs_for_plates.groupby(p_id_col_name)
}

good_cell_prop_series = pd.Series(good_cell_prop_dict)
good_cell_prop_series.shape

In [None]:
# Attach the per-sample recovered-cell proportion; values are aligned on the sample id index.
comp_table["good_cell_prop"] = good_cell_prop_series
comp_table

In [None]:
# Write the composition table to the output folder, tagged with data version and load date.
comp_tbl_name = f"composition_table_data_v_{data_version}_ts_{load_ts_iso}.csv"
comp_tbl_path = Path(conf.outputs.output_dir) / comp_tbl_name
comp_table.to_csv(comp_tbl_path)

In [None]:
# Read the saved table back (presumably a sanity check that the file was written as expected).
pd.read_csv(comp_tbl_path)


In [None]:
# Show the location of the saved composition table.
comp_tbl_path