In [1]:
import os, sys
import numpy as np
import pandas as pd

## Obtain ROI List

In [2]:
# Collect ROI identifiers (feature file names without the ".npy" extension)
# from the study and tonsil feature folders; study ROIs come first.
# os.listdir returns entries in arbitrary order, so each listing is sorted to
# keep the ROI order (and everything derived from it) reproducible across
# runs and machines.
roi_list = []
study_fea_dir = "../StudyProcessing/CellFeas"
study_roi_list = sorted(
    os.path.splitext(ele)[0] for ele in os.listdir(study_fea_dir) if ele.endswith(".npy")
)
roi_list.extend(study_roi_list)
print("Number of study ROI is: {}".format(len(study_roi_list)))
tonsil_fea_dir = "../TonsilProcessing/CellFeas"
tonsil_roi_list = sorted(
    os.path.splitext(ele)[0] for ele in os.listdir(tonsil_fea_dir) if ele.endswith(".npy")
)
roi_list.extend(tonsil_roi_list)
print("Number of tonsil ROI is: {}".format(len(tonsil_roi_list)))
print("Total number of ROI is: {}".format(len(roi_list)))

Number of study ROI is: 1623
Number of tonsil ROI is: 55
Total number of ROI is: 1678


## Obtain Batch ID

In [3]:
# Build the batch ID of every ROI, study ROIs first and tonsil ROIs after.
batch_list = []
study_slide_info_path = "../Metadata/StudySlide_Info.xlsx"
study_slide_info = pd.read_excel(study_slide_info_path, sheet_name="Sheet1", header=0, index_col=None)
# Slide_ID -> Staining_ID lookup taken from the slide metadata sheet.
study_batch_dict = dict(zip(study_slide_info["Slide_ID"], study_slide_info["Staining_ID"]))
# The part of a study ROI name before its last "-" is used as the Slide_ID key.
# rsplit leaves a name without any "-" intact, whereas slicing with rfind()
# would silently drop its last character (rfind() returns -1 in that case).
study_batch_list = [study_batch_dict[ele.rsplit("-", 1)[0]] for ele in study_roi_list]
batch_list.extend(study_batch_list)
# For tonsil ROIs the text before the last "-" is used directly as the batch ID.
tonsil_batch_list = [ele.rsplit("-", 1)[0] for ele in tonsil_roi_list]
batch_list.extend(tonsil_batch_list)
print("Length of batch list is: {}".format(len(batch_list)))

Length of batch list is: 1678


## Obtain ROI Location

In [4]:
# Start the combined location list and load the per-ROI metadata sheet,
# then index ROI location and ROI diagnosis by ROI_ID.
loc_list = []
study_roi_info_path = "../Metadata/StudyROI_Info.xlsx"
slide_roi_info = pd.read_excel(
    study_roi_info_path,
    sheet_name="Sheet1",
    header=0,
    index_col=None,
)
roi_ids = slide_roi_info["ROI_ID"]
name_loc_dict = dict(zip(roi_ids, slide_roi_info["ROI_Location"]))
name_diag_dict = dict(zip(roi_ids, slide_roi_info["ROI_Diag"]))

In [5]:
# Build one condition label per study ROI, in the same order as study_roi_list:
#   - normal-type locations keep their location name as the label;
#   - "Tumor" ROIs are labelled "Tumor-<ROI_Diag>".
study_loc_list = []
unknown_loc_rois = []
for ele in study_roi_list:
    roi_loc = name_loc_dict[ele]
    if roi_loc in ("AdjacentNormal", "DistantNormal", "Normal"):
        study_loc_list.append(roi_loc)
    elif roi_loc == "Tumor":
        study_loc_list.append("Tumor" + "-" + name_diag_dict[ele])
    else:
        print("Unknow location: {}".format(roi_loc))
        unknown_loc_rois.append(ele)
# Skipping an ROI would leave the label list shorter than the ROI list, so the
# labels could no longer be paired with the ROIs. Stop here, after every
# unknown location has been reported, instead of failing later on.
if unknown_loc_rois:
    raise ValueError(
        "{} study ROI(s) have an unrecognized location: {}".format(
            len(unknown_loc_rois), unknown_loc_rois
        )
    )
loc_list.extend(study_loc_list)

In [6]:
# Report how many study ROIs carry each condition label.
print("Total number of study list is: {}".format(len(study_loc_list)))
unique, counts = np.unique(study_loc_list, return_counts=True)
for loc_label, roi_count in zip(unique, counts):
    print("{} has {} ROIs".format(loc_label, roi_count))

Total number of study list is: 1623
AdjacentNormal has 94 ROIs
DistantNormal has 67 ROIs
Normal has 50 ROIs
Tumor-AAH has 226 ROIs
Tumor-ADC has 563 ROIs
Tumor-AIS has 314 ROIs
Tumor-MIA has 309 ROIs


In [7]:
# Every tonsil ROI receives the label "Normal"; these labels are appended to
# the combined location list.
tonsil_loc_list = ["Normal"] * len(tonsil_roi_list)
loc_list.extend(tonsil_loc_list)
print("Total number of loc is: {}".format(len(loc_list)))

Total number of loc is: 1678


## Obtain Replicate Information

In [8]:
# Anchor labels: each study ROI gets its own "nonrep<k>" label (k counts from 1
# in study_roi_list order), while all tonsil ROIs share the label "Replicates".
study_anchor_list = ["nonrep{}".format(k) for k in range(1, len(study_roi_list) + 1)]
tonsil_anchor_list = ["Replicates"] * len(tonsil_roi_list)
anchor_list = []
anchor_list.extend(study_anchor_list)
anchor_list.extend(tonsil_anchor_list)
print("total number of anchor is: {}".format(len(anchor_list)))

total number of anchor is: 1678


## Save With Control Meta Information

In [9]:
# Assemble one metadata row per ROI (file name, batch, condition, anchor)
# and write the table to CSV without the index column.
withcontrol_meta_df = pd.DataFrame({
    "Filename": roi_list,
    "Batch": batch_list,
    "Condition": loc_list,
    "Anchor": anchor_list
})
withcontrol_meta_dir = "../BatchCorrection/Metadata"
# to_csv does not create missing parent folders, so make sure the target
# folder exists before writing.
os.makedirs(withcontrol_meta_dir, exist_ok=True)
withcontrol_meta_path = os.path.join(withcontrol_meta_dir, "WithControlMetaIMC.csv")
withcontrol_meta_df.to_csv(withcontrol_meta_path, index=False)