# Identify Single Cell Anomalies
Here we train an Anomalyze model (a scikit-learn isolation forest) on one pre-computed, sampled plate dataset per run.

### Import Libraries

In [None]:
import pathlib
import sys

import joblib
import pandas as pd
import pyarrow.parquet as pq
from sklearn.ensemble import IsolationForest

## Define paths

### Inputs

In [1]:
# Command-line arguments:
#   argv[1] - path to the plate data; only its final path component is kept
#             and used as the plate's name.
#   argv[2] - location (path prefix) of the sampled plate parquet files.
plate_data_name = pathlib.Path(sys.argv[1]).name
sampled_plate_jump_data_path = sys.argv[2]

# The sampled data for a plate is read from "<argv[2]>/<plate name>.parquet".
sampled_plate_parquet = f"{sampled_plate_jump_data_path}/{plate_data_name}.parquet"
sampled_platedf = pd.read_parquet(sampled_plate_parquet)

--------------------------------------------------------------
FileNotFoundError            Traceback (most recent call last)
Cell In[7], line 4
      1 big_drive_path = f"{root_dir}/big_drive"
      3 # Plate morphology data
----> 4 plate_paths = pathlib.Path(sys.argv[1]).resolve(strict=True)
      6 # Boolean flag for if the data is single-cell
      7 is_sc = sys.argv[2].lower() == [output truncated]

### Outputs

In [None]:
# Directory that collects the trained models; created on demand
# (including any missing parents) and reused if it already exists.
isoforest_dir = pathlib.Path("isolation_forest_models")
isoforest_dir.mkdir(parents=True, exist_ok=True)

# Destination file for this plate's trained model.
isoforest_path = isoforest_dir / f"{plate_data_name}_isolation_forest.joblib"

## Train Anomalyze Models

In [None]:
# Keep only feature columns: every column whose name contains "Metadata" is
# excluded, then any remaining column with at least one missing value is dropped.
meta_cols = list(filter(lambda col: "Metadata" in col, sampled_platedf.columns))
featdf = sampled_platedf.drop(columns=meta_cols)
featdf = featdf.dropna(axis=1, how="any")

# max_samples is left at scikit-learn's default ("auto", i.e. min(256, n_samples)),
# so each tree is fit on at most 256 sampled rows and the 1_600 trees together
# draw roughly 1_600 * 256 = 409_600 rows across the whole forest.
# For some of the plate data, this number of samples can barely fit in memory.
# We also want as many trees as possible so the forest learns many patterns
# for identifying anomalies.
# 256 is empirically the largest number of samples per tree that allowed
# outliers to be isolated better.
# random_state=0 fixes the seed; n_jobs=-1 uses all available processors.
isofor = IsolationForest(n_estimators=1_600, random_state=0, n_jobs=-1).fit(featdf)

# Persist the fitted model to the path defined in the outputs section.
joblib.dump(isofor, isoforest_path)