In [None]:
# import libraries
from xgboost import XGBClassifier
import xgboost as xgb
import anndata as ad
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

## Get the data and annotations

In [None]:
# Load the annotated AnnData file and keep an independent copy of its
# per-cell observation table (adata.obs), which carries the phenotype labels.
annotated_h5ad_path = '../cells_annotated.h5ad'
adata = ad.read_h5ad(annotated_h5ad_path)
df = adata.obs.copy()

# Display the annotation table as the cell output.
df

Unnamed: 0,X_centroid,Y_centroid,Area,MajorAxisLength,MinorAxisLength,Eccentricity,Solidity,Extent,Orientation,CellID,imageid,leiden,leiden_phenotype
exemplar-001--unmicst_cell_1,1767.692308,257.290598,117,12.402944,12.006487,0.250814,0.959016,0.812500,-1.146733,1,exemplar-001--unmicst_cell,1,Tumor
exemplar-001--unmicst_cell_2,1107.173913,665.869565,92,11.874070,9.982065,0.541562,0.948454,0.696970,-0.435290,2,exemplar-001--unmicst_cell,5,Tumor
exemplar-001--unmicst_cell_3,1116.413793,671.068966,58,10.113305,7.629922,0.656364,0.878788,0.585859,1.221658,3,exemplar-001--unmicst_cell,5,Tumor
exemplar-001--unmicst_cell_4,982.728625,677.029740,269,25.433196,15.183300,0.802251,0.835404,0.531621,-0.705293,4,exemplar-001--unmicst_cell,0,Immune
exemplar-001--unmicst_cell_5,1141.071078,680.125000,408,26.604670,19.759781,0.669604,0.937931,0.739130,-0.711002,5,exemplar-001--unmicst_cell,1,Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...
exemplar-001--unmicst_cell_11197,1270.593750,3131.731250,160,19.414487,11.039993,0.822582,0.893855,0.701754,-1.364872,11197,exemplar-001--unmicst_cell,0,Immune
exemplar-001--unmicst_cell_11198,1177.349057,3130.839623,106,14.080819,10.062622,0.699499,0.876033,0.706667,1.478579,11198,exemplar-001--unmicst_cell,0,Immune
exemplar-001--unmicst_cell_11199,1255.904762,3131.285714,105,15.623503,9.143181,0.810875,0.882353,0.596591,-1.065479,11199,exemplar-001--unmicst_cell,0,Immune
exemplar-001--unmicst_cell_11200,1354.448276,3131.810345,58,9.779089,7.836216,0.598231,0.878788,0.725000,-1.072712,11200,exemplar-001--unmicst_cell,0,Immune


In [None]:
# Read the feature (quantification) table; the first CSV column is used as the index.
feature_table_path = ["../scimapExampleData/quantification/exemplar-001--unmicst_cell.csv"]
feature_table = pd.read_csv(feature_table_path[0], index_col=0)

# The labels are attached by POSITION below (`.values` discards the index), so
# both tables must list the same cells in the same order. Verify that up front:
# a reordered or filtered annotation table would otherwise silently assign
# phenotypes to the wrong cells.
if len(df) != len(feature_table):
    raise ValueError(
        f"Annotation table has {len(df)} rows but feature table has "
        f"{len(feature_table)} rows; cannot attach labels by position."
    )
# Compare as strings so an int-vs-categorical/str dtype difference between the
# two sources does not cause a spurious mismatch.
annotation_cell_ids = df['CellID'].astype(str).to_numpy()
feature_cell_ids = feature_table.index.astype(str).to_numpy()
if not (annotation_cell_ids == feature_cell_ids).all():
    raise ValueError(
        "CellID order differs between the annotation table and the feature "
        "table; labels would be assigned to the wrong cells."
    )

# Create a new column in the feature table for the cell type
feature_table['cell_type'] = df['leiden_phenotype'].values

# sanity check
feature_table

Unnamed: 0_level_0,DNA_6,ELANE,CD57,CD45,DNA_7,CD11B,SMA,CD16,DNA_8,ECAD,...,X_centroid,Y_centroid,Area,MajorAxisLength,MinorAxisLength,Eccentricity,Solidity,Extent,Orientation,cell_type
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,17449.786325,1179.529915,161.410256,708.111111,5956.128205,378.692308,246.256410,332.769231,6960.008547,707.367521,...,1767.692308,257.290598,117,12.402944,12.006487,0.250814,0.959016,0.812500,-1.146733,Tumor
2,18053.913043,1124.728261,177.217391,790.271739,11803.586957,910.434783,423.445652,649.989130,13424.717391,1027.836957,...,1107.173913,665.869565,92,11.874070,9.982065,0.541562,0.948454,0.696970,-0.435290,Tumor
3,8418.017241,1249.500000,156.637931,791.189655,5159.948276,1034.827586,477.362069,599.482759,5636.793103,1334.327586,...,1116.413793,671.068966,58,10.113305,7.629922,0.656364,0.878788,0.585859,1.221658,Tumor
4,22444.189591,1159.676580,187.431227,853.814126,13773.092937,816.914498,2221.639405,375.884758,15566.044610,1030.483271,...,982.728625,677.029740,269,25.433196,15.183300,0.802251,0.835404,0.531621,-0.705293,Immune
5,8076.987745,1568.526961,162.904412,318.497549,4991.470588,827.286765,362.482843,366.365196,5639.095588,1325.740196,...,1141.071078,680.125000,408,26.604670,19.759781,0.669604,0.937931,0.739130,-0.711002,Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11197,21648.418750,1200.631250,225.137500,1134.343750,12591.818750,772.681250,1073.600000,371.543750,15254.543750,960.762500,...,1270.593750,3131.731250,160,19.414487,11.039993,0.822582,0.893855,0.701754,-1.364872,Immune
11198,20916.537736,1141.783019,201.990566,1270.433962,11992.084906,693.405660,2030.113208,358.471698,14429.518868,868.452830,...,1177.349057,3130.839623,106,14.080819,10.062622,0.699499,0.876033,0.706667,1.478579,Immune
11199,10625.914286,1184.076190,202.657143,792.704762,5659.580952,902.647619,1226.819048,374.428571,7477.866667,985.828571,...,1255.904762,3131.285714,105,15.623503,9.143181,0.810875,0.882353,0.596591,-1.065479,Immune
11200,22182.327586,993.827586,202.568966,551.258621,13602.982759,622.362069,367.689655,330.293103,15571.603448,839.431034,...,1354.448276,3131.810345,58,9.779089,7.836216,0.598231,0.878788,0.725000,-1.072712,Immune


## Train an XGBoost classifier on the data

In [None]:
# Build the feature matrix X and the target vector y.
# Only the listed column names that actually exist in the table are dropped;
# every remaining column is used as a model feature.
cols_to_drop = [col for col in ['cell_type', 'cell_id', 'image_id'] if col in feature_table.columns]
X = feature_table.drop(columns=cols_to_drop)
y = feature_table['cell_type']

# Manually encode string labels to integers (XGBoost expects integer class ids).
# `label_encoder.classes_` keeps the mapping back to the phenotype names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Random 80/20 hold-out split of the cells; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=42)

# train the XGBoost classifier
# `use_label_encoder` is intentionally not passed: labels are already encoded
# above, and current XGBoost ignores the argument and emits a
# "Parameters: { "use_label_encoder" } are not used." warning.
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# evaluate the model on the held-out cells
y_pred = xgb_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Report per-class metrics under the original phenotype names rather than the
# encoded integers. Passing `labels` explicitly keeps the names aligned even if
# a rare class happens to be absent from the test split.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=class_names, zero_division=0))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       0.98      0.97      0.97       968
           2       0.95      0.94      0.95       355
           3       0.96      0.96      0.96        83
           4       0.96      0.98      0.97       650
           5       0.96      0.97      0.96       164

    accuracy                           0.97      2241
   macro avg       0.96      0.97      0.97      2241
weighted avg       0.97      0.97      0.97      2241



- Why do you think the performance of this model is so high? Is the way we currently split the data into training and test sets representative of a real-world scenario?

### Estimate the least amount of data you would need to train a classifier that still has decent performance (F1 > 0.8)