In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

from thera_panacea.etl.extract import extract_to_df


# Locations of the training images and of their label file, relative to the
# working directory of the notebook.
data_root_dir = Path("data")
train_img_dir = data_root_dir / "train_img"
label_file = data_root_dir / "label_train.txt"

df = extract_to_df(train_img_dir, label_file)

# Copy of the original frame used to collect the out-of-sample predictions
# and class probabilities. -1 marks rows that have not been predicted yet.
# The probability columns are created as floats on purpose: they later
# receive float probabilities, and writing floats into an integer column
# forces a dtype change that recent pandas versions warn about.
oos_probs_df = df.copy()
oos_probs_df["pred"] = -1
oos_probs_df["prob_0"] = -1.0
oos_probs_df["prob_1"] = -1.0

# Inputs (image paths) and targets (labels) used to build the CV splits.
X = df["path"]
y = df["label"]

df.head()

In [None]:
import torch


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from scipy.special import softmax
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

from thera_panacea.dataset.dataset import TherasDS
from thera_panacea.model.baseline_model import BaselineModel, preprocess
from thera_panacea.trainer.trainer import Trainer
from thera_panacea.utils import get_class_weights


# Cross-validation setup: stratified folds over the labels in `y`.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits)
splits = skf.split(X, y)

n_epochs = 25
batch_size = 2048

# Columns of `oos_probs_df` that receive the out-of-sample results.
result_columns = ("pred", "prob_0", "prob_1")


for i, (train_idx, test_idx) in enumerate(splits):
    print(30*"#", f"split {i+1} / {n_splits}", 30*"#")

    # Data
    # The splitter yields positional indices, so rows are selected by
    # position (`iloc`) rather than by index label.
    train_df = df.iloc[train_idx]
    test_df = df.iloc[test_idx]

    train_ds = TherasDS(train_df, device, preprocess)
    test_ds = TherasDS(test_df, device, preprocess)

    # Training batches are reshuffled every epoch. The evaluation loader must
    # keep the row order so that predictions line up with `test_idx`.
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    # Model: a fresh one for every fold
    model = BaselineModel().to(device)

    # Trainer
    loss_function = CrossEntropyLoss(
        weight=get_class_weights(train_df, device),
        label_smoothing=0.1
    )
    opt = Adam(model.parameters())
    trainer = Trainer(model, opt, loss_function)

    # Train
    trainer.train(train_dl, test_dl, n_epochs)

    # Predict
    # Gradients are not needed for inference; disabling them avoids building
    # the autograd graph for every batch.
    model.eval()
    batch_probs = []
    with torch.no_grad():
        for imgs, _ in tqdm(test_dl):
            logits = model(imgs).cpu().numpy()
            batch_probs.append(softmax(logits, axis=-1))

    # Concatenate once instead of re-copying the accumulated array per batch.
    probs = np.concatenate(batch_probs, axis=0)
    preds = np.argmax(probs, axis=-1)

    # Feed results
    # Written by position as well, so the rows match `test_idx` whatever
    # index the frame carries.
    fold_results = (preds, probs[:, 0], probs[:, 1])
    for column, values in zip(result_columns, fold_results):
        oos_probs_df.iloc[test_idx, oos_probs_df.columns.get_loc(column)] = values

In [None]:
import pickle


# Persist the out-of-sample predictions/probabilities frame to disk.
oos_probs_path = Path("oos_probs_df.pkl")
with oos_probs_path.open("wb") as out_file:
    pickle.dump(oos_probs_df, out_file)

In [None]:
import pickle
from thera_panacea.utils import get_results, display_results


# Compare the out-of-sample predictions with the labels of the data set.
preds = oos_probs_df["pred"]
labels = oos_probs_df["label"]

results = get_results(preds, labels)
display_results(results)

# Store the results object on disk.
results_path = Path("raw_data_results.pkl")
with results_path.open("wb") as results_file:
    pickle.dump(results, results_file)

In [None]:
import numpy as np


# Assemble the (n_rows, 2) float matrix of out-of-sample class
# probabilities: column 0 holds "prob_0", column 1 holds "prob_1".
probs = np.column_stack(
    (
        oos_probs_df["prob_0"].to_numpy(),
        oos_probs_df["prob_1"].to_numpy(),
    )
).astype(np.float64)

In [None]:
from cleanlab.filter import find_label_issues

# Ask cleanlab which examples look mislabelled given the out-of-sample
# probabilities, with the returned indices ranked by "self_confidence".
ranked_label_issues = find_label_issues(
    labels=oos_probs_df["label"],
    pred_probs=probs,
    return_indices_ranked_by="self_confidence",
)

# Number of examples flagged as label issues.
len(ranked_label_issues)

In [None]:
# cleanlab returns the flagged examples as positional indices, whereas
# DataFrame.drop expects index labels. Translate positions to labels first so
# the intended rows are removed whatever index the frame carries.
cleaned_df = oos_probs_df.drop(index=oos_probs_df.index[ranked_label_issues])

# Persist the frame without the rows flagged as label issues.
with open(Path("data/cleaned_df.pkl"), "wb") as f:
    pickle.dump(cleaned_df, f)