This script computes the share of multicut vs. color encoding in the image,
once in general and once per class of the dataset.

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import mlcv_py
import batch
from pathlib import Path
import cv2
from tqdm import tqdm
from random import sample

In [2]:
from mlcv_py import PARTITION_CODEC as pc
from mlcv_py import MULTICUT_CODEC as mc
from mlcv_py import OPTIMIZER as opt

In [3]:

def get_distribution(multi, part, opti, lvl, N=None) -> pd.DataFrame:
    
    root = Path("../data/images")
    
    imgs_by_category = dict()
    for dir in root.glob("*"):
        if not dir.is_dir(): continue

        paths = list(dir.glob("*.png"))
        if N is not None:
            paths = sample(paths, k=min(N, len(paths)))

        
        imgs = [cv2.imread(img_p, cv2.IMREAD_COLOR) for img_p in paths]
        imgs_by_category[dir.name] = imgs

    res_dict = dict()

    for name, imgs in tqdm(imgs_by_category.items()):

        inputs_masks = [(img, multi, part, opti, lvl) for img in imgs]

        res = batch.batch_execute("make_mask_with_size", inputs_masks, cpu_count=2)
        masks = [t[0] for t in res]
        mask_sizes = [t[1] for t in res]

        inputs_encode = [(img, mask, multi, part, False) for img, mask in zip(imgs, masks)]
        res = batch.batch_execute("encode_mask_with_size", inputs_encode)
        total_sizes = [t[1] for t in res]

        res_dict[name] = [(ms, ts - ms) for ms, ts in zip(mask_sizes, total_sizes)]

    return res_dict


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def make_plot(title, filename, multi, part, opti, lvl, N=None):

    res = get_distribution(multi, part, opti, lvl, N)

    names = list(res.keys())
    data = {
        "category": names,
        "multicut size": [sum(t[0] for t in res[name]) for name in names],
        "partition size": [sum(t[1] for t in res[name]) for name in names]
    }

    df = pd.DataFrame(data)

    # Add total row
    total_values = df[['multicut size', 'partition size']].sum()
    total_row = pd.DataFrame({'category': ['Total'], 'multicut size': [total_values['multicut size']], 'partition size': [total_values['partition size']]})
    df = pd.concat([df, total_row], ignore_index=True)

    # Normalize values so each bar has the same total height
    df[['multicut size', 'partition size']] = df[['multicut size', 'partition size']].div(df[['multicut size', 'partition size']].sum(axis=1), axis=0)

    # Stacked Bar Chart
    fig, ax = plt.subplots(figsize=(8, 5))
    df.set_index('category').plot(kind='bar', stacked=True, ax=ax, color=['#8B0000', '#1E3A8A'])  # Dark Red and Dark Blue

    plt.title(title)
    plt.legend(title="Size Type")

    # Save the figure with 600dpi and thin margins
    plt.savefig(filename, dpi=600, bbox_inches='tight')
    plt.show()


In [5]:
N = 25
make_plot(
    "Verhätnis Multicut zu Partitionskodierung: alter Algorithmus, niedrige Qualität",
    "ratio/legacy-low-quality.png",
    mc.HUFFMAN,
    pc.SIMPLE,
    opt.GREEDY_GRID,
    25.0,
    N=N
)





In [6]:
make_plot(
    "Verhätnis Multicut zu Partitionskodierung: alter Algorithmus, hohe Qualität",
    "ratio/legacy-high-quality.png",
    mc.HUFFMAN,
    pc.SIMPLE,
    opt.GREEDY_GRID,
    1.0,
    N=N
)



