# Fit and evaluate model for automatically labelling images in the collection

This notebook shows how the classification model was trained. Running it requires the precomputed image embeddings, which are not included in this GitHub repository.

## Imports and type aliases

In [1]:
import os

# Set the OMP_NUM_THREADS environment variable in its own cell, ahead of the numerical imports.
# NOTE(review): presumably this has to run before numpy/scikit-learn are imported to take effect — confirm.
os.environ["OMP_NUM_THREADS"] = "32"

In [2]:
import itertools
import json
import warnings
from collections.abc import Sequence
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import sklearn.metrics as metrics

from IPython.display import Markdown, display, HTML
from pandas.io.formats.style import Styler
from scipy.interpolate import interp1d
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from tqdm.auto import tqdm

### Type aliases

In [3]:
EmbeddingType = str
URL = str
Label = int
EvaluationDict = list[dict[str, float | int | EmbeddingType | list[Label] | list[URL]]]

## Load image embeddings

In [4]:
# Root of the precomputed embedding data (lives outside this repository)
data_dir = Path("../../image-embeddings/data")
all_images_dir = data_dir / "vectors/all_images"

# One dataframe per embedding type, built by concatenating all of its parquet shards
embedding_dataframes = {}
for embedding_type in ("ViT", "CLIP", "SigLIP"):
    print(f"Loading {embedding_type}", flush=True)
    shard_paths = sorted((all_images_dir / embedding_type).glob("*.parquet"))
    shards = [pd.read_parquet(shard_path) for shard_path in shard_paths]
    embedding_dataframes[embedding_type] = pd.concat(shards, axis="index").reset_index(drop=True)

Loading ViT
Loading CLIP
Loading SigLIP


## Load results from manual labelling

In [5]:
# Read the manual labels and turn the index of the JSON table into a regular "url" column
label_df = (
    pd.read_json("../data/labelled_data.json")
    .rename_axis("url")
    .reset_index()
)

## Merge training dataset with embeddings

In [6]:
# Attach the embedding of every labelled URL; one merged dataframe per embedding type
train_dfs = {
    embedding: pd.merge(label_df, df, on="url", how="inner")
    for embedding, df in embedding_dataframes.items()
}

# The feature matrices are built from `train_dfs`, whereas the labels and URLs used for
# training are read from `label_df`. Both must therefore contain exactly the same URLs in
# the same order; fail loudly if an embedding is missing or a URL is duplicated.
for embedding, train_df in train_dfs.items():
    assert train_df["url"].tolist() == label_df["url"].tolist(), (
        f"Rows of the merged {embedding} dataframe are not aligned with label_df "
        "(missing or duplicated URLs among the embeddings?)"
    )

In [7]:
# Stack the per-image embedding vectors into one array per embedding type (first axis = image).
# No `.squeeze()` here: on a 1-D column it only has an effect for a single-row frame, where it
# would produce a 0-d array that `np.stack` cannot iterate over.
embeddings = {
    embedding: np.stack(df["embedding"].to_numpy(), axis=0)
    for embedding, df in train_dfs.items()
}

## Functions to fit and evaluate models

In [8]:
def evaluate(
    classifier: LogisticRegression,
    embedding_type: EmbeddingType,
    Xs_train: dict[EmbeddingType, np.ndarray],
    y_train: np.ndarray,
    Xs_test: dict[EmbeddingType, np.ndarray],
    y_test: np.ndarray,
    urls: Sequence[URL]
) -> EvaluationDict:
    """Score a fitted classifier on the train and test part of one outer fold.

    Parameters
    ----------
    classifier
        Already fitted model; its ``C`` and ``n_iter_`` attributes are reported.
    embedding_type
        Key that selects which feature matrix in `Xs_train` / `Xs_test` to use.
    Xs_train, Xs_test
        Feature matrices per embedding type for the train and test rows.
    y_train, y_test
        Class indices for the train and test rows.
    urls
        URLs of the test rows; stored unchanged in the result.

    Returns
    -------
    A dict with the selected embedding type, ``C``, the number of solver
    iterations, the macro-averaged F1 score on the train and test rows, and
    the test predictions, test labels and test URLs.
    """
    X_train, X_test = Xs_train[embedding_type], Xs_test[embedding_type]

    # Use the same averaging for train and test so that the two scores are comparable
    y_hat_train = classifier.predict(X_train)
    f1_train = metrics.f1_score(y_train, y_hat_train, average="macro")

    y_hat_test = classifier.predict(X_test)
    f1_test = metrics.f1_score(y_test, y_hat_test, average="macro")

    # Assemble results
    return {
        "Embedding type": embedding_type,
        "C": classifier.C,
        "Number of iterations": classifier.n_iter_.item(),

        "F1 (train)": f1_train,
        "F1 (test)": f1_test,

        "y_hat_test": y_hat_test,
        "y_test": y_test,
        "urls": urls,
    }


def run_cross_validation_model_selection(
    Xs: dict[EmbeddingType, np.ndarray],
    y: np.ndarray,
    folds: KFold,
    max_iter: int = 1000,
) -> tuple[LogisticRegression, EmbeddingType]:
    """Pick the regularisation strength and embedding type by cross-validation.

    Every combination of ``C`` (10 log-spaced values between 1e-2 and 1e4) and
    embedding type in `Xs` is fitted on each training split of `folds` and scored
    with the micro-averaged F1 on the matching validation split. The combination
    with the highest mean score is refitted on all of `Xs[embedding_type]` and `y`.

    Parameters
    ----------
    Xs
        Feature matrix per embedding type; rows must line up with `y`.
    y
        Class index per row.
    folds
        Splitter that defines the cross-validation splits.
    max_iter
        Iteration limit of the solver, used both for the candidate fits and
        for the final refit so that the selected model is trained the same way
        as it was evaluated.

    Returns
    -------
    The refitted classifier and the name of the selected embedding type.
    """
    Cs = np.geomspace(1e-2, 1e4, 10)
    performance: dict[tuple[float, EmbeddingType], list[float]] = defaultdict(list)

    for train_idx, test_idx in tqdm(folds.split(y), total=folds.n_splits):
        # The labels are the same for every candidate, so slice them once per fold
        y_train, y_test = y[train_idx], y[test_idx]

        for C, embedding_type in itertools.product(Cs, Xs.keys()):
            X = Xs[embedding_type]

            # random_state should not affect the fit: the regularised logistic
            # regression objective is convex, so its optimum is unique
            model = LogisticRegression(C=C, random_state=2, max_iter=max_iter)
            model.fit(X[train_idx], y_train)

            f1 = metrics.f1_score(y_test, model.predict(X[test_idx]), average="micro")
            performance[C, embedding_type].append(f1)

    avg_performance = {k: np.mean(all_f1) for k, all_f1 in performance.items()}
    C, embedding_type = pd.Series(avg_performance).idxmax()
    model = LogisticRegression(C=C, random_state=2, max_iter=max_iter).fit(Xs[embedding_type], y)

    return model, embedding_type


def run_nested_crossvalidated_logistic_regression(
    Xs: dict[EmbeddingType, np.ndarray],
    y: np.ndarray,
    urls: Sequence[URL],
    n_outer_splits: int = 20,
    n_inner_splits: int = 10,
) -> list[EvaluationDict]:
    """Estimate the classification performance with nested cross-validation.

    For every outer fold, the hyperparameters (``C`` and embedding type) are
    selected with an inner cross-validation on the outer training rows only, and
    the resulting model is then scored on the held-out outer test rows.

    Parameters
    ----------
    Xs
        Feature matrix per embedding type; rows must line up with `y` and `urls`.
    y
        Class index per row.
    urls
        URL per row; the URLs of each test fold are stored in its result.
    n_outer_splits, n_inner_splits
        Number of folds in the outer (evaluation) and inner (model selection) loop.

    Returns
    -------
    One result dict (see `evaluate`) per outer fold.

    Raises
    ------
    ValueError
        If `urls` or any feature matrix does not have one row per entry of `y`.
    """
    # The fold indices are integer arrays, which plain sequences cannot be indexed with
    y = np.asarray(y)
    urls = np.asarray(urls)

    if len(urls) != len(y):
        raise ValueError(f"Got {len(urls)} URLs for {len(y)} labels")
    for embedding, X in Xs.items():
        if len(X) != len(y):
            raise ValueError(f"Got {len(X)} {embedding} embeddings for {len(y)} labels")

    outer_folds = KFold(n_splits=n_outer_splits, shuffle=True, random_state=0)
    inner_folds = KFold(n_splits=n_inner_splits, shuffle=True, random_state=1)

    results = []
    for train_idx, test_idx in tqdm(outer_folds.split(y), total=n_outer_splits):
        Xs_train = {embedding: X[train_idx] for embedding, X in Xs.items()}
        y_train = y[train_idx]
        Xs_test = {embedding: X[test_idx] for embedding, X in Xs.items()}
        y_test = y[test_idx]
        urls_test = urls[test_idx]

        # Model selection only ever sees the outer training rows
        classifier, embedding_type = run_cross_validation_model_selection(Xs_train, y_train, folds=inner_folds)
        res = evaluate(
            classifier,
            embedding_type,
            Xs_train,
            y_train,
            Xs_test,
            y_test,
            urls_test,
        )

        results.append(res)

    return results

In [9]:
with warnings.catch_warnings():
    # Silence every warning raised while the nested cross-validation runs
    warnings.simplefilter("ignore")

    # Class index per image: position of the largest entry among the label columns
    label_matrix = label_df.drop("url", axis=1).values
    results = run_nested_crossvalidated_logistic_regression(
        Xs=embeddings,
        y=np.argmax(label_matrix, axis=1),
        urls=label_df["url"].values,
    )


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

## Look at hyperparameters for the different models selected in the outer loops

### Embedding types:

In [10]:
# How often each embedding type was selected across the outer folds
selected_embedding_types = pd.Series([res["Embedding type"] for res in results], name="Embedding type")
selected_embedding_types.value_counts()

Embedding type
SigLIP    20
Name: count, dtype: int64

### Complexity parameter:

In [11]:
# How often each value of C was selected across the outer folds
selected_Cs = pd.Series([res["C"] for res in results], name="C")
selected_Cs.value_counts()

C
0.215443        11
0.046416         5
1.000000         2
10000.000000     1
464.158883       1
Name: count, dtype: int64

## Store a CSV file with the true and estimated value for each URL

In [12]:
# Use ALL outer folds (no `.head()`), so that every labelled URL ends up in the file:
# each URL is part of exactly one outer test fold.
res_df = pd.DataFrame(results)
label_names = label_df.drop("url", axis=1).columns

# Flatten the per-fold arrays and translate class indices back into label names
estimated = [label_names[i] for fold_predictions in res_df["y_hat_test"] for i in fold_predictions]
true = [label_names[i] for fold_labels in res_df["y_test"] for i in fold_labels]
urls = [url for fold_urls in res_df["urls"] for url in fold_urls]

eval_df = pd.DataFrame({"url": urls, "true_label": true, "estimated_label": estimated})
eval_df.to_csv("../data/test_fold_estimated_labels.csv")

## Look at cross-validated evaluation metrics

### F1 score

In [13]:
# Mean and standard deviation of the test F1 score over the outer folds
f1_test_scores = pd.DataFrame(results)["F1 (test)"]
f1_test_scores.mean(), f1_test_scores.std()

(0.9164543944213788, 0.05101429983736076)

### Confusion matrix

In [14]:
cols = label_df.drop("url", axis=1).columns

# Accumulate integer counts over all outer test folds. Rows are indexed by the true
# class and columns by the predicted class, both in the order of the label columns.
cm = np.zeros((len(cols), len(cols)), dtype=int)
for res in results:
    np.add.at(cm, (res["y_test"], res["y_hat_test"]), 1)

# Order in which the classes are presented
classes = [
    "Segmentation anomaly",
    "Blank page",
    "Graphical element",
    "Illustration or photograph",
    "Musical notation",
    "Map",
    "Mathematical chart"
]
cm = pd.DataFrame(cm, index=cols.copy(), columns=cols.copy())
cm.index.name = "True class"
cm.columns.name = "Predicted class"
# Reorder the classes and transpose, so that rows show the predicted class
# and columns the true class
cm = cm.loc[classes, classes].T
cm

True class,Segmentation anomaly,Blank page,Graphical element,Illustration or photograph,Musical notation,Map,Mathematical chart
Predicted class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Segmentation anomaly,496.0,5.0,28.0,8.0,2.0,1.0,2.0
Blank page,11.0,339.0,8.0,1.0,0.0,0.0,0.0
Graphical element,14.0,2.0,278.0,15.0,1.0,0.0,2.0
Illustration or photograph,1.0,3.0,16.0,558.0,1.0,2.0,5.0
Musical notation,1.0,0.0,0.0,0.0,109.0,0.0,0.0
Map,1.0,0.0,0.0,2.0,0.0,41.0,0.0
Mathematical chart,0.0,0.0,0.0,8.0,0.0,0.0,39.0


## Format matrix as a LaTeX table

In [15]:
# Template for a header cell that is rotated by 90 degrees; `{}` receives the cell text
rotated_cell = "\\rotatebox{{90}}{{\\makecell{{{}}}}}"

# Column headers, with a LaTeX line break (\\) where the header should wrap
latex_headers = [
    r'Segmentation\\anomaly',
    r'Blank\\page',
    r'Graphical\\element',
    r'Illustration or\\photograph',
    r'Musical\\notation',
    r'Map',
    r'Mathematical\\chart',
]
axis_title = cm.columns.name.replace(' ', r'\\')

cm_latex = cm.copy()
cm_latex.columns = pd.Index(
    [rotated_cell.format(header) for header in latex_headers],
    name=rotated_cell.format(axis_title),
)

# Shade the cells from white (0) to dark green (largest count in the matrix)
styler = Styler(cm_latex, precision=0).background_gradient(
    cmap="Greens", vmin=0, vmax=cm.max().max()
)

print(styler.to_latex(convert_css=True, hrules=True, column_format="@{}lrrrrrrr"))

\begin{tabular}{@{}lrrrrrrr}
\toprule
\rotatebox{90}{\makecell{True\\class}} & \rotatebox{90}{\makecell{Segmentation\\anomaly}} & \rotatebox{90}{\makecell{Blank\\page}} & \rotatebox{90}{\makecell{Graphical\\element}} & \rotatebox{90}{\makecell{Illustration or\\photograph}} & \rotatebox{90}{\makecell{Musical\\notation}} & \rotatebox{90}{\makecell{Map}} & \rotatebox{90}{\makecell{Mathematical\\chart}} \\
Predicted class &  &  &  &  &  &  &  \\
\midrule
Segmentation anomaly & {\cellcolor[HTML]{00682A}} \color[HTML]{F1F1F1} 496 & {\cellcolor[HTML]{F6FCF4}} \color[HTML]{000000} 5 & {\cellcolor[HTML]{F0F9ED}} \color[HTML]{000000} 28 & {\cellcolor[HTML]{F5FBF3}} \color[HTML]{000000} 8 & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 2 & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 1 & {\cellcolor[HTML]{F7FCF5}} \color[HTML]{000000} 2 \\
Blank page & {\cellcolor[HTML]{F4FBF2}} \color[HTML]{000000} 11 & {\cellcolor[HTML]{48AE60}} \color[HTML]{F1F1F1} 339 & {\cellcolor[HTML]{F5FBF3}} \color[

## Train final classifier and estimate label distribution on full dataset

In [16]:
# Select and fit the final model on the complete labelled set, with a 10-fold split
# configured like the inner folds of the nested cross-validation
train_labels = label_df.drop("url", axis=1)
y_all = np.argmax(train_labels.values, axis=1)
folds = KFold(n_splits=10, shuffle=True, random_state=1)
classifier, embedding_type = run_cross_validation_model_selection(embeddings, y_all, folds=folds)

  0%|          | 0/10 [00:00<?, ?it/s]

In [17]:
# Label columns only (everything except the URL)
train_labels = label_df.drop(columns="url")

In [18]:
# Predict with the embedding type that the model selection returned rather than a
# hard-coded name, so the features always match the ones `classifier` was fitted on
selected_embeddings = np.stack(embedding_dataframes[embedding_type]["embedding"].values)
urls = embedding_dataframes[embedding_type]["url"]
classes = classifier.predict(selected_embeddings)

In [19]:
# Translate the predicted class indices back into label names (columns of `train_labels`)
predicted_labels = [train_labels.columns[class_idx] for class_idx in classes]
result_df = pd.DataFrame({'url': urls.values, 'label': predicted_labels})

# NOTE(review): this writes JSON to a path ending in ".csv" — confirm which format the readers of this file expect
result_df.to_json("../data/estimated_labels.csv")

## Inspect predicted class on subset of all images in the collection

In [20]:
def format_image(url):
    """Return an HTML ``<img>`` tag that displays the image at `url`, 100px wide.

    The URL is HTML-escaped, so characters such as ``&`` or ``"`` cannot
    terminate the ``src`` attribute or inject additional markup.
    """
    from html import escape  # function-scope import keeps this cell self-contained

    return f'<img src="{escape(str(url), quote=True)}" width="100px" />'
# Show a reproducible random sample of 100 predictions, rendering each URL as a thumbnail
preview_df = result_df.sample(100, random_state=0)
HTML(preview_df.to_html(formatters={"url": format_image}, escape=False, index=False))

url,label
,Illustration or photograph
,Graphical element
,Blank page
,Segmentation anomaly
,Graphical element
,Illustration or photograph
,Illustration or photograph
,Segmentation anomaly
,Blank page
,Musical notation
