In [3]:
%load_ext autoreload
%autoreload 2
import sys

# Repository root; config.yaml is later read from here. Presumably it is also
# where the project-local modules (data_loading, io_utils) live - confirm.
repo_dir = '/home/labs/amit/noamsh/repos/MM_2023'
# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if repo_dir not in sys.path:
    sys.path.append(repo_dir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from pathlib import Path
from omegaconf import OmegaConf

import pandas as pd
import numpy as np

import anndata as ad

from data_loading.utils import load_dataframe_from_file
from io_utils import generate_path_in_output_dir

In [5]:
# Load the project configuration file located directly under repo_dir.
config_path = Path(repo_dir, 'config.yaml')
conf = OmegaConf.load(config_path)

## load anndata and hospital data

In [6]:
# Hospital clinical spreadsheet, read through the project's file loader.
new_hospital_path = Path('/home/labs/amit/noamsh/data/mm_2023/clinical_prediction/Annonymized_CRF_BP_07072024.xlsx')
new_hospital_dataset = load_dataframe_from_file(new_hospital_path)
# Display the shape as a quick check of what was loaded.
new_hospital_dataset.shape

(262, 127)

In [7]:
# The output directory is taken from the loaded configuration.
ouput_dir = Path(conf.outputs.output_dir)

# File names previously used as input here (kept for reference only):
#   raw_adata_data_v_20240619.h5ad
#   pp_adata_data_v_20240619.h5ad
#   adata_with_scvi_annot_pred_data_v_20240619_ts_2024-06-20.h5ad
#   adata_with_scvi_annot_pred_data_v_20240619_ts_2024-06-20_only_pc_annotated_filtered.h5ad
adata_file_name = 'adata_with_scvi_annot_pred_data_v_20240619_ts_2024-06-28_only_pc_annotated_filtered.h5ad'
adata_to_update_path = ouput_dir / adata_file_name
adata_to_update_path

PosixPath('/home/labs/amit/noamsh/repos/MM_2023/outputs/adata_with_scvi_annot_pred_data_v_20240619_ts_2024-06-28_only_pc_annotated_filtered.h5ad')

In [8]:
# Read the AnnData file selected above; its obs table is what the following
# cells modify.
adata = ad.read_h5ad(adata_to_update_path)
adata

AnnData object with n_obs × n_vars = 163913 × 3862
    obs: 'Amp.Batch.ID', 'Seq.Batch.ID', 'Batch.Set.ID', 'Included', 'Tissue', 'Gating', 'Total.PC', 'Total.NonPC', 'Initial.Code', 'Disease', 'Time', 'Project', 'Cohort', 'Weizmann.Code', 'Hospital.Code', 'Clinical.Data', 'Biopsy.Sequence', 'Previous.Hospital.Code', 'Method', 'cID', 'super_Population', 'Populations', 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', '_scvi_batch', '_scvi_labels', 'leiden', 'Predicted', 'count_of_PC_in_neighborhood', 'count_of_CD45_in_neighborhood', 'pc_with_tme_environment', 'number_of_diffrent_patients_in_nighborhood', 'MGUS', 'SMM', 'NDMM', 'pc_annotation', 'Healthy', 'one_pateint_in_nighborhood', 'small_pateint_in_nighborhood', 'log_total_counts', 'noisy_malignant', 'noisy_malignant_by_umi'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'Disease_colors', 'Healthy_colors', 'MGUS_colors', 'Me

In [23]:
# Working handle on the per-observation metadata.
# NOTE(review): no copy is made here, so the column assignments in later cells
# presumably modify adata.obs as well - confirm this is intended.
metadata_df = adata.obs

## change "Trans" to "trans" in transplantation Hospital.Code values

In [25]:
# Rewrite every "Trans" occurrence in Hospital.Code as "trans".
normalised_codes = metadata_df["Hospital.Code"].str.replace("Trans", "trans")
metadata_df["Hospital.Code"] = normalised_codes

# Display the codes that now start with "trans".
starts_with_trans = metadata_df["Hospital.Code"].str.startswith("trans")
metadata_df.loc[starts_with_trans, "Hospital.Code"]

index
W3435303    trans_TLV001
W3435895    trans_TLV001
W3436176    trans_TLV001
W3436355    trans_TLV001
W3436585    trans_TLV001
                ...     
W5348551    trans_TLV013
W5348584    trans_TLV013
W5348605    trans_TLV013
W5348612    trans_TLV013
W5348692    trans_TLV013
Name: Hospital.Code, Length: 4126, dtype: object

## update Disease column

In [133]:
# Name of the obs column holding the disease label; reused by the cells below.
disease_col = "Disease"

#### from clinical data

In [134]:
# Spreadsheet column with the numeric stage code recorded at biopsy time.
hospital_stage = 'Plasma cell dyscrasia at Bx time(0=NDMM, 1=RRMM, 2=SMM 3=MGUS,4=NDAL, 5=RRAL, 6=NDSPC, 7=MGRS, 8=None)'
# Numeric code -> label used in the Disease column. Both AL codes share one
# label, NDSPC and MGRS are folded into MGUS, and code 8 has no label.
hospital_stage_map = {
    0: 'NDMM',
    1: "RRMM",
    2: "SMM",
    3: "MGUS",
    4: "AL",
    5: "AL",
    6: "MGUS",
    7: "MGUS",
    8: None,
}

# One row per distinct (Code, stage) pair, indexed by patient Code.
# NOTE(review): assumes each Code ends up with a single stage after
# drop_duplicates - confirm against the spreadsheet.
stage_by_code = (
    new_hospital_dataset[["Code", hospital_stage]]
    .drop_duplicates()
    .set_index("Code")
)
stage_by_code[disease_col] = stage_by_code[hospital_stage].map(hospital_stage_map)

# Lookup consumed by update_row_disease_if_in_hospital_data: Code -> label.
hispital_disease = dict(stage_by_code[disease_col])

In [135]:
def update_row_disease_if_in_hospital_data(row):
    """Return the disease label for one metadata row, preferring the hospital record.

    Parameters
    ----------
    row : mapping-like
        Must provide ``"Hospital.Code"`` and the ``disease_col`` entry.

    Returns
    -------
    The label stored in ``hispital_disease`` for the row's Hospital.Code when
    that label exists and is not missing; otherwise the row's current label.
    """
    patient_id = row["Hospital.Code"]
    cur_disease = row[disease_col]
    hospital_label = hispital_disease.get(patient_id)
    # A patient can be listed in the hospital data without a usable label
    # (a stage mapped to None, or a stage value absent from the map, which
    # comes through as NaN). Keep the existing label in that case instead of
    # overwriting it with a missing value.
    label_is_missing = hospital_label is None or (
        isinstance(hospital_label, float) and hospital_label != hospital_label
    )
    if label_is_missing:
        return cur_disease
    return hospital_label

In [136]:
# Only the two columns read by the updater are passed to the row-wise apply.
disease_inputs = metadata_df[[disease_col, "Hospital.Code"]]
updated_labels = disease_inputs.apply(update_row_disease_if_in_hospital_data, axis=1)
new_disease_col = updated_labels.astype("category")
# Per-label row counts after the update.
new_disease_col.value_counts()

RRMM          103234
NDMM           26126
MGUS           11980
SMM             7599
AL              7300
Healthy         6466
MM_Unknown      1197
Name: count, dtype: int64

In [137]:
# Replace the Disease column with the hospital-updated labels computed above.
metadata_df[disease_col] = new_disease_col

#### change PRMM to RRMM

In [138]:
# Fold the "PRMM" label into "RRMM" with a vectorised mask instead of a
# row-wise DataFrame.apply over every column of every row.
# The cast to object lets "RRMM" be written even if the column is categorical
# and "RRMM" is not one of its categories; missing labels stay missing.
disease_labels = metadata_df[disease_col].astype(object)
metadata_df[disease_col] = disease_labels.mask(disease_labels == "PRMM", "RRMM")

In [139]:
# Display the shape after the Disease update.
metadata_df.shape

(163913, 46)

## update Time column

In [140]:
time_col = "Time"
# Rows with Hospital.Code "CSA-01-03" in Seq.Batch.ID "SB440" get Time "Post";
# every other row keeps its current value. A boolean mask replaces the
# row-wise DataFrame.apply over the whole frame.
is_relabelled = (metadata_df["Hospital.Code"] == "CSA-01-03") & (metadata_df["Seq.Batch.ID"] == "SB440")
# The cast to object lets "Post" be written even if the column is categorical
# and "Post" is not one of its categories.
metadata_df[time_col] = metadata_df[time_col].astype(object).mask(is_relabelled, "Post")

In [141]:
# Display the shape after the Time update.
metadata_df.shape

(163913, 46)

## add Clinical.Trial column

In [142]:
# Values accepted as a clinical trial by get_clinical_trail: the first list is
# checked against 'Project' for MARS rows, the second against 'Cohort' for
# SPID rows.
allowed_MARS_trails = ["CART", "KPT", "Kydar", "PPIA"]
allowed_SPID_trails =  ["CART", "BISE JnJ", "Transplantation"]

def get_clinical_trail(row):
    """Return the clinical-trial label for one metadata row, or None.

    Parameters
    ----------
    row : mapping-like
        Must provide ``'Method'``, ``'Project'`` and ``'Cohort'``.

    Returns
    -------
    For ``Method == "MARS"`` the row's Project if it is in
    ``allowed_MARS_trails``; for ``Method == "SPID"`` the row's Cohort if it
    is in ``allowed_SPID_trails``; ``None`` in every other case.
    """
    method = row['Method']
    if method == "MARS":
        candidate, allowed = row['Project'], allowed_MARS_trails
    elif method == "SPID":
        candidate, allowed = row['Cohort'], allowed_SPID_trails
    else:
        # Any other method used to leave the result unassigned and raise
        # UnboundLocalError; such rows simply have no trial label.
        return None
    return candidate if candidate in allowed else None

# Label every row, then store the result as a categorical column.
clinical_trial_labels = metadata_df.apply(get_clinical_trail, axis=1)
metadata_df['Clinical.Trial'] = clinical_trial_labels.astype('category')

In [143]:
# NOTE(review): these two statements repeat the end of the previous cell
# verbatim. get_clinical_trail reads only Method, Project and Cohort, so the
# recomputation yields the same column; this cell looks redundant.
metadata_df['Clinical.Trial'] = metadata_df.apply(get_clinical_trail, axis=1)
metadata_df['Clinical.Trial'] = metadata_df['Clinical.Trial'].astype('category')

In [144]:
# Print the shape (Clinical.Trial was added as a new column), then show the
# number of rows per trial label.
print(metadata_df.shape)
metadata_df['Clinical.Trial'].value_counts()

(163913, 47)


Clinical.Trial
Kydar              40656
BISE JnJ           25984
CART               13807
KPT                 9793
PPIA                5386
Transplantation     4126
Name: count, dtype: int64

## save updated adata

In [145]:
# Put the updated metadata table back on the AnnData object and display its shape.
adata.obs = metadata_df
adata.obs.shape

(163913, 47)

In [146]:
from datetime import date
# Today's date as an ISO string; the same expression is used as the suffix of
# the output file name in the next cell.
date.today().isoformat()

'2024-07-21'

In [147]:
# Output path: same directory and extension as the input file, with
# "_md_v_<today's ISO date>" appended to the file stem.
md_version = date.today().isoformat()
versioned_name = f"{adata_to_update_path.stem}_md_v_{md_version}" + adata_to_update_path.suffix
adata_new_path = adata_to_update_path.with_name(versioned_name)
adata_new_path

PosixPath('/home/labs/amit/noamsh/repos/MM_2023/outputs/adata_with_scvi_annot_pred_data_v_20240619_ts_2024-06-28_only_pc_annotated_filtered_md_v_2024-07-21.h5ad')

In [148]:
# Write the AnnData object, now carrying the updated obs table, to the new path.
adata.write_h5ad(adata_new_path)