# Format Functional marker input
This notebook is for formatting raw QuPath measurements so that Xgboost can be trained.  
For each marker, the labels will be saved as a separate file. The numerical data will also be saved at the end.  

In [1]:
import json
import re

import numpy as np
import pandas as pd

In [10]:
# Prefix inserted into the name of every file this notebook writes.
batch_name = "UKcohort_nonSIMS_1"

In [3]:
# Directory the output files are written into. File names are appended by plain
# string concatenation, so the trailing "/" must stay.
folder = "/Users/yokote.k/Desktop/MIBI/xgboost/resources/original/UK cohort/"

In [5]:
# Raw per-cell measurement table. The path is built from `folder` so that the
# configuration cell is the single place where the data location is set.
whole_data_fm_df = pd.read_csv(folder + "UKcohort_nonSIMS_measurements.csv")

## Training or applying to unlabelled data
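Two flags in the next cell control which label files are written:

- `is_for_functional_marker_training`: set to True to build and save the per-marker labels.
- `is_for_training`: set to True to save the encoded `Name` labels.

If you are preprocessing the data to train an xgboost model, please leave these as True.

If you are preprocessing the data to apply to an already trained model, please change them to False.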

In [6]:
# When True, a 0/1 label column is built for every entry of `functional_markers`
# and each column is written to its own gzip file.
is_for_functional_marker_training = False
# When True, the "Name" column is mapped through `encode` and written as the label file.
is_for_training = False

In [7]:
# Labels of the previously formatted data set.
old_labels = pd.read_csv("/Users/yokote.k/Desktop/MIBI/xgboost/resources/formatted/old_data/labels_ver6.csv.gz")

# The decoder file maps class codes to class names. JSON object keys are always
# strings, so the codes are converted back to int below.
with open('/Users/yokote.k/Desktop/MIBI/xgboost/resources/formatted/old_data/decoder.json') as json_file:
    decode = json.load(json_file)

# decode_: int code -> name, encode: name -> int code (the inverse mapping).
decode_ = {int(code): label for code, label in decode.items()}
encode = {label: int(code) for code, label in decode.items()}
decode = decode_

## All the extra markers 

In [None]:
# Alternative marker lists, currently disabled:
# functional_markers = ["CTLA4", "GrzB", "ICOS", "IFNG", "Ki67", "LAG3", "OX40", "PD1", "Tim3"]
# other_phenotypic_markers = ["CD103","CD14", "CD16", "CD163", "CD206", "CD45RA", "CD45RO", "CD49a", "CD69", r"MHC I (HLA Class1)", r"MHC II (HLA-DR)"]

# Markers that each get their own 0/1 label column (and label file) when
# is_for_functional_marker_training is True.
functional_markers = ["CTLA4", "GrzB", "ICOS", "Ki67", "LAG3", "OX40", "CD45RA"]

## Read the QuPath data

## Format the functional marker labels
Using a one-hot encoding scheme: one 0/1 column per functional marker.

In [8]:
# Work on an independent copy of the two identifying columns so that the edits
# below never write back into whole_data_fm_df.
functional_marker_labels = whole_data_fm_df.loc[:, ["Image", "Name"]].copy()
functional_marker_labels["Name"] = functional_marker_labels["Name"].str.replace("_", " ", regex=False)
if is_for_functional_marker_training:
    # A "Name" is a ": "-separated list of tags; split it once for all markers.
    # Just be careful. If you have astir results then might need ":"
    name_parts = functional_marker_labels["Name"].str.split(": ", expand=True)

    for fm in functional_markers:
        # Compare against the literal tag "<marker>+". The tag must NOT be passed
        # through re.escape: `==` is a plain string comparison, and the escaped
        # form ("CTLA4\+") never equals the tag found in the data ("CTLA4+").
        is_positive = (name_parts == fm + "+").any(axis=1)
        functional_marker_labels[fm] = 0
        functional_marker_labels.loc[is_positive, fm] = 1

In [None]:
# Write one gzip-compressed label file per functional marker.
if is_for_functional_marker_training:
    for marker in functional_markers:
        marker_labels = functional_marker_labels.loc[:, marker]
        filename = folder + "{}_{}_labels.gz".format(batch_name, marker)
        marker_labels.to_csv(filename, index=False, compression="gzip")

In [None]:
# Map the "Name" column through `encode` and store the result as the training labels.
if is_for_training:
    labels = whole_data_fm_df.loc[:, ["Name"]].replace({"Name": encode})
    filename = folder + "{}_labels.gz".format(batch_name)
    labels.to_csv(filename, index=False, compression="gzip")

## Save the image that each cell belongs to

In [11]:
# One row per cell, holding only the name of the image the cell came from.
images_path = folder + "{}_images.csv".format(batch_name)
image_per_cell = whole_data_fm_df.loc[:, "Image"]
image_per_cell.to_csv(images_path, index=False)

## Format the numerical data

This is to remove columns which will not be used for the training. 

We only want the columns which are marker measurements. This starts from column number 20. We are not inputting any of the shape measurements. The last measurement will be 2300. If you are unsure, please check the data frame. The very first measurement after removing all of the unwanted columns should be "Beta-Tubulin: Cell: Mean" and the last measurements should be "Vimentin: Cell: Percentile: 99.9"

In [12]:
# Report how many rows (cells) each image contributes, in order of first appearance.
image_column = whole_data_fm_df.loc[:, "Image"]
for image in image_column.unique():
    num_cells = int((image_column == image).sum())
    print("Num cells in {}: {}".format(image, num_cells))

Num cells in Mar21_LLL-2.tiff: 6363
Num cells in Jun20_LT-5.tiff: 2399
Num cells in Dec20_LING-3.tiff: 1168
Num cells in Jan20_LLL-2.tiff: 1630
Num cells in Mar21_LT-1.tiff: 601
Num cells in Jan20_LLL-3.tiff: 1275


In [13]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so the
# column renames and null fills done through new_whole_data_fm_df in the following
# cells also change whole_data_fm_df.
new_whole_data_fm_df = whole_data_fm_df

In [15]:
# Remove every "Target:" occurrence from the column names, then turn underscores into spaces.
renamed_columns = new_whole_data_fm_df.columns.str.replace("Target:", "").str.replace("_", " ")
new_whole_data_fm_df.columns = renamed_columns

In [17]:
# For every column that has missing values and whose name contains "Cytoplasm",
# fill the missing rows from the column named with the first "Cytoplasm" replaced
# by "Membrane". `col` and `null_arr` keep their names because a later cell reads
# the values they hold after the loop ends.
for col in new_whole_data_fm_df.columns:
    null_arr = new_whole_data_fm_df.loc[:, col].isnull()
    has_missing = null_arr.values.any()
    if not has_missing or "Cytoplasm" not in col:
        continue
    new_col = col.replace("Cytoplasm", "Membrane", 1)
    missing_rows = null_arr.values
    new_whole_data_fm_df.loc[missing_rows, col] = new_whole_data_fm_df.loc[missing_rows, new_col]

In [18]:
# Inspection only: `null_arr` and `col` are whatever the null-filling loop left behind
# after its final iteration, so this shows the rows of the last column only that were
# null when the loop looked at it.
new_whole_data_fm_df.loc[null_arr.values, col]


Series([], Name: Vimentin: Cell: Percentile: 99.9, dtype: float64)

In [19]:
# Keep only the first occurrence of every column name.
is_first_occurrence = ~new_whole_data_fm_df.columns.duplicated()
new_whole_data_fm_df = new_whole_data_fm_df.loc[:, is_first_occurrence]

In [20]:
# List the names of the columns that still contain at least one missing value.
new_whole_data_fm_df.columns[new_whole_data_fm_df.isna().any()].values

array(['Class'], dtype=object)

In [21]:
# Display the de-duplicated table for a visual check.
new_whole_data_fm_df

Unnamed: 0,Image,Name,Class,Parent,ROI,Centroid X px,Centroid Y px,Nucleus: Area px^2,Nucleus: Length px,Nucleus: Circularity,...,Vimentin: Membrane: Percentile: 98.0,Vimentin: Membrane: Percentile: 99.0,Vimentin: Membrane: Percentile: 99.5,Vimentin: Membrane: Percentile: 99.9,Vimentin: Cell: Percentile: 96.0,Vimentin: Cell: Percentile: 97.0,Vimentin: Cell: Percentile: 98.0,Vimentin: Cell: Percentile: 99.0,Vimentin: Cell: Percentile: 99.5,Vimentin: Cell: Percentile: 99.9
0,Mar21_LLL-2.tiff,PathCellObject,,PathAnnotationObject,Polygon,1380.10,49.80,315.7316,84.6482,0.5537,...,0.00,0.0,0,0,0.00,0.00,2.28,4.00,5.000,7.0
1,Mar21_LLL-2.tiff,PathCellObject,,PathAnnotationObject,Polygon,1370.40,64.52,127.5231,52.6173,0.5788,...,3.00,3.0,3,3,1.00,2.00,2.40,3.00,3.000,3.0
2,Mar21_LLL-2.tiff,PathCellObject,,PathAnnotationObject,Polygon,1387.80,66.11,55.9859,33.8894,0.6126,...,2.00,2.0,2,2,0.00,0.00,1.76,2.00,2.000,2.0
3,Mar21_LLL-2.tiff,PathCellObject,,PathAnnotationObject,Polygon,1456.10,70.34,377.7602,91.4318,0.5678,...,0.00,0.0,0,0,0.00,0.00,0.00,0.00,1.755,4.0
4,Mar21_LLL-2.tiff,PathCellObject,,PathAnnotationObject,Polygon,1326.40,74.08,225.9946,67.9238,0.6155,...,0.00,0.0,0,0,0.00,0.00,0.00,1.00,2.000,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13431,Jan20_LLL-3.tiff,PathCellObject,,PathAnnotationObject,Polygon,1119.90,1006.90,141.0000,56.0000,0.5650,...,4.56,5.0,5,5,2.00,2.00,2.66,3.33,4.165,5.0
13432,Jan20_LLL-3.tiff,PathCellObject,,PathAnnotationObject,Polygon,1196.10,1009.30,78.0000,42.0000,0.5557,...,4.98,5.0,5,5,4.00,4.00,4.00,5.00,5.740,6.0
13433,Jan20_LLL-3.tiff,PathCellObject,,PathAnnotationObject,Polygon,1071.60,1011.10,75.0000,40.0000,0.5890,...,4.80,5.0,5,5,3.00,3.62,4.00,4.00,5.270,6.0
13434,Jan20_LLL-3.tiff,PathCellObject,,PathAnnotationObject,Polygon,1217.20,1015.00,93.9998,43.9883,0.6105,...,5.90,6.0,6,6,4.92,5.00,5.96,6.98,7.000,7.0


In [None]:
# First 20 columns of the measurement table. An explicit copy is taken because
# later cells assign into cleaned_df; assigning into a bare slice triggers
# SettingWithCopyWarning and leaves it unclear whether whole_data_fm_df is modified too.
cleaned_df = whole_data_fm_df.iloc[:, :20].copy()

In [None]:
# Factor the pixel centroid coordinates are multiplied by to obtain the µm columns.
CENTROID_UM_PER_PX = 0.3906


def _centroid_px_to_um(cleaned, source, axis, um_per_px=CENTROID_UM_PER_PX):
    """Return a copy of ``cleaned`` whose "Centroid <axis> µm" column is filled.

    Parameters
    ----------
    cleaned : DataFrame that receives the µm column.
    source : DataFrame providing the "Centroid <axis> px" column.
    axis : "X" or "Y"; inserted into both column names.
    um_per_px : factor applied to the pixel coordinate.

    Behaviour
    ---------
    * µm column absent: it is created from the px column and the px column is
      dropped from the result.
    * µm column present with missing values: only the missing rows are filled
      from the px column, then the px column is dropped.
    * µm column present and complete: the frame is returned unchanged.

    Raises
    ------
    KeyError
        If the px column that is needed is missing from ``source`` or from
        ``cleaned``. Only the "µm column absent" case is treated as expected;
        every other error now surfaces instead of being swallowed.
    """
    um_col = "Centroid {} µm".format(axis)
    px_col = "Centroid {} px".format(axis)
    cleaned = cleaned.copy()  # never write into the caller's frame or a slice of another frame

    if um_col not in cleaned.columns:
        cleaned[um_col] = source[px_col] * um_per_px
        return cleaned.drop(columns=[px_col])

    missing = cleaned[um_col].isnull().values
    if missing.any():
        cleaned.loc[missing, um_col] = source.loc[missing, px_col] * um_per_px
        # The result of drop() must be kept; it does not modify the frame in place.
        cleaned = cleaned.drop(columns=[px_col])
    return cleaned


cleaned_df = _centroid_px_to_um(cleaned_df, whole_data_fm_df, "X")
cleaned_df = _centroid_px_to_um(cleaned_df, whole_data_fm_df, "Y")

In [None]:
# Append all columns of new_whole_data_fm_df to the right of cleaned_df.
# NOTE(review): cleaned_df was cut from the first 20 columns of the same table that
# new_whole_data_fm_df was derived from, so column names present in both frames end
# up twice in the result. Confirm whether only the measurement columns were meant
# to be appended.
cleaned_df = pd.concat([cleaned_df, new_whole_data_fm_df], axis=1)

In [None]:
# Store the combined table next to the other outputs of this batch.
cleaned_measurements_path = folder + "{}_measurements_cleaned.csv".format(batch_name)
cleaned_df.to_csv(cleaned_measurements_path, index=False)

### Remove certain markers
We will remove the Beta-Tubulin, dsDNA, CD39, IgG4, TCF1, and Tantalum measurements, as either the staining is bad or the marker is not informative.

In [None]:
# Drop every column whose name contains one of the excluded marker names.
excluded_markers = ("Beta-Tubulin", "dsDNA", "CD39", "IgG4", "TCF1", "Tantalum")
cols_to_remove = [
    column
    for column in new_whole_data_fm_df.columns
    if any(marker in column for marker in excluded_markers)
]
new_whole_data_fm_df = new_whole_data_fm_df.drop(columns=cols_to_remove)

In [None]:
# Select the statistic columns grouped by kind: all "Percentile" columns first,
# then all "Mean" columns, then all "Std.Dev." columns.
all_means_all_percentiles_col = [
    col
    for keyword in ("Percentile", "Mean", "Std.Dev.")
    for col in new_whole_data_fm_df.columns
    if keyword in col
]
all_means_all_percentiles_df = new_whole_data_fm_df.loc[:, all_means_all_percentiles_col]

In [None]:
# Display the selected percentile / mean / standard-deviation columns for a visual check.
all_means_all_percentiles_df

In [None]:
# Write the selected statistic columns as the numerical input table of this batch.
numerical_data_path = folder + "{}_am_ap_astd_data.csv".format(batch_name)
all_means_all_percentiles_df.to_csv(numerical_data_path, index=False)

## 1680 columns