In [None]:
# import libraries
from xgboost import XGBClassifier
import xgboost as xgb
import anndata as ad
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

## Get the data and annotations

In [None]:
# Load the annotated AnnData object, then take an independent copy of its
# per-cell observation table (`.obs`) so that later edits to `df` cannot
# write back into `adata`.
adata = ad.read_h5ad('../cells_annotated.h5ad')
df = adata.obs.copy(deep=True)
df

In [None]:
# Load the per-cell quantification table; the first CSV column becomes the index.
feature_table_path = ["../scimapExampleData/quantification/exemplar-001--unmicst_cell.csv"]
feature_table = pd.read_csv(feature_table_path[0], index_col=0)

# Attach each cell's annotated phenotype as a new `cell_type` column.
# The labels are copied by position (`.values`), not aligned on the index.
# NOTE(review): assumes the rows of `df` are in the same order as the CSV rows — confirm.
feature_table = feature_table.assign(cell_type=df['leiden_phenotype'].values)

# sanity check
feature_table

## Train an XGBoost classifier on the data

In [None]:
# Build the feature matrix X and the target vector y.
# The label / identifier columns are dropped only if they are present in the table.
cols_to_drop = [col for col in ['cell_type', 'cell_id', 'image_id'] if col in feature_table.columns]
X = feature_table.drop(columns=cols_to_drop)
y = feature_table['cell_type']

# Manually encode string labels to integers; `label_encoder.classes_` keeps the
# mapping back to the original cell-type names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Random 80/20 split of the cells into training and test sets (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=42)

# train the XGBoost classifier
# `use_label_encoder` is deprecated in XGBoost and is not needed here because the
# labels were already integer-encoded above, so it is no longer passed.
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# evaluate the model
y_pred = xgb_model.predict(X_test)

# Report per-class metrics under the original cell-type names instead of the
# opaque integer codes. Passing `labels` explicitly keeps `target_names` aligned
# even when a rare class does not occur in the test split; such classes are
# reported with zeros (`zero_division=0`) instead of triggering a warning.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]
print(classification_report(y_test, y_pred,
                            labels=class_ids,
                            target_names=class_names,
                            zero_division=0))


- Why do you think the performance is so high for this model? Is the way we currently split the data into train and test sets similar to a real-world scenario?

### Estimate the least amount of data you would need to train a classifier that still has decent performance (F1 > 0.8)