# Imports

In [None]:
# Notebook extensions
%load_ext autoreload

In [None]:
# Std Python Lib
import itertools
from collections import defaultdict
from collections import Counter
import time
from pprint import pprint

# Requirements
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.lines import Line2D
from scipy.stats import wilcoxon
from scipy.stats import linregress
from scipy.stats import beta
from adjustText import adjust_text
from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression

# Project
import distances
import compressions
import s_curves

from corpus import brunet
from corpus import oxquarry
from corpus import st_jean
from corpus import pan16


from rank_list_fusion import fusion_z_score
from rank_list_fusion import fusion_regression_training
from rank_list_fusion import fusion_regression_trainings
from rank_list_fusion import fusion_regression

from evaluate import evaluate_linking
from evaluate import evaluate_clustering

from linking import compute_links
from linking import most_frequent_word

from clustering import dist_thresh_logistic_regression
from clustering import dist_thresh_two_beta
from clustering import clustering_at_dist_thresh
from clustering import silhouette_based_clustering
from clustering import clustering_at_every_n_clusters
from clustering import agglomerative_clustering
from clustering import best_clustering

from misc import sign_test
from misc import first_letters_cut
from misc import word_n_grams
from misc import last_letters_cut
from misc import sigmoid
from misc import sigmoid_r
from misc import compute_r
from misc import normalize
from misc import rank_list_distance
from misc import dataset_infos
from misc import sort_Y_and_distance_matrix
from misc import subset_Y_and_distance_matrix
from misc import distances_matrix_from_rank_list
from misc import fit_beta
from misc import find_two_beta_same_area
from misc import features_from_rank_list
from misc import labels_from_rank_list

%autoreload 2

## Settings

In [None]:
# Print NumPy arrays with two fixed decimals and without scientific notation.
np.set_printoptions(precision=2, floatmode="fixed", suppress=True)
# Use a white figure background for every Matplotlib figure.
mpl.rcParams["figure.facecolor"] = "w"

# Corpus

## Load

In [None]:
# Parse each corpus once and keep the results in notebook-level variables.
# Judging by the names, each tuple holds corpus info, one or more text
# representations (POS / lemma / token) and the labels Y -- confirm against
# the `corpus` package.
print("Loading Oxquarry")
info_oxquarry, X_token_oxquarry, Y_oxquarry = oxquarry.parse()
print("Loading Brunet")
info_brunet, X_lemma_brunet, X_token_brunet, Y_brunet = brunet.parse()
print("Loading St-Jean A")
info_st_jean_A, X_pos_st_jean_A, X_lemma_st_jean_A, X_token_st_jean_A, Y_st_jean_A = st_jean.parse_A()
print("Loading St-Jean B")
info_st_jean_B, X_pos_st_jean_B, X_lemma_st_jean_B, X_token_st_jean_B, Y_st_jean_B = st_jean.parse_B()
print("Loading St-Jean")
info_st_jean, X_pos_st_jean, X_lemma_st_jean, X_token_st_jean, Y_st_jean = st_jean.parse()

## Information

In [None]:
# Print one summary row per corpus; the header names the printed columns.
print("Name, Language, authors, texts, r, true_links, links, true_links_ratio, mean_length")
# The first element returned by each parser is not used here. It is bound to
# `_` (instead of `id`) so the builtin `id` is not shadowed for the rest of
# the notebook session.
_, x, y = oxquarry.parse()
print("Oxquarry EN", *dataset_infos(x, y))
_, x_lemma, x, y = brunet.parse()
print("Brunet FR", *dataset_infos(x, y))
_, x_pos, x_lemma, x, y = st_jean.parse()
print("St-Jean FR", *dataset_infos(x, y))
# St-Jean A and B are reported as the first 100 and the remaining texts.
print("St-Jean A 001-100 FR", *dataset_infos(x[:100], y[:100]))
print("St-Jean B 101-200 FR", *dataset_infos(x[100:], y[100:]))

# PAN16 problems: one row per training problem, then one per test problem.
problems = pan16.parse_train()
for (info, _, x, y) in problems:
    print(f"PAN16 {info['language']} {info['folder']}", *dataset_infos(x, y))
problems = pan16.parse_test()
for (info, _, x, y) in problems:
    print(f"PAN16 {info['language']} {info['folder']}", *dataset_infos(x, y))

# Individual methods

## MF Token and Lemma

In [None]:
def print_mfw(mfw, n):
    """Print the first ``n`` entries of ``mfw`` as numbered LaTeX table rows.

    Entries are taken in the mapping's iteration order and numbered from 1.
    Each printed row is ``rank & key & value`` followed by a LaTeX line break.
    """
    leading_entries = list(mfw.items())[:n]
    for rank, (word, value) in enumerate(leading_entries, start=1):
        print(f"{rank} & {word} & {value} \\\\")


# Most frequent items of the St-Jean token representation; print the first 40
# entries of the returned `mfw` mapping as LaTeX rows.
features, mfw = most_frequent_word(X_token_st_jean, 500, z_score=False, lidstone_lambda=0.1, remove_hapax=True)
print_mfw(mfw, 40)

# Same listing for the lemma representation.
features, mfw = most_frequent_word(X_lemma_st_jean, 500, z_score=False, lidstone_lambda=0.1, remove_hapax=True)
print_mfw(mfw, 40)

### Evaluation

In [None]:
mfs = np.arange(250, 2000 + 1, 250)
distances_ = distances.vector_distances


def evaluate(X, Y):
    """Evaluate every (n-MF, distance) combination on one corpus.

    Reads the notebook-level ``mfs`` and ``distances_``.  For each value of
    ``mfs`` and each ``(zscore, distance)`` pair, a rank list is computed with
    ``compute_links`` and scored with ``evaluate_linking``.

    Returns an array reshaped to ``(-1, len(distances_), 3)``: first axis
    follows ``mfs``, second follows ``distances_``, last holds the three
    values returned by ``evaluate_linking``.
    """
    scores = []
    for mf in mfs:
        for zscore, distance in distances_:
            print(mf, zscore, distance.__name__)
            rank_list = compute_links([X, 0, mf, zscore, 1e-1, distance])
            scores.append(evaluate_linking(rank_list, Y))

    return np.array(scores).reshape(-1, len(distances_), 3)


M_tokens = evaluate(X_token_st_jean, Y_st_jean)
M_lemmas = evaluate(X_lemma_st_jean, Y_st_jean)

In [None]:
def plot(M, f):
    """Plot index 0 of the last axis of ``M`` against ``mfs``, one curve per distance.

    ``M`` is indexed as ``M[:, i, 0]`` for the i-th entry of the notebook-level
    ``distances_``.  The figure is saved to ``img/mf_{f}.png``.
    """
    distance_ids = range(len(distances_))

    plt.figure(figsize=(6, 8), dpi=200)
    for i in distance_ids:
        plt.plot(mfs, M[:, i, 0], c=f"C{i}")
    plt.xlabel("$n$-MF")
    plt.ylabel("Average Precision (AP)")

    # Legend built from proxy lines so the colours match the curves above.
    legend_handles = [Line2D([0], [0], color=f"C{i}", lw=2) for i in distance_ids]
    legend_names = [distance.__name__ for _, distance in distances_]
    plt.legend(legend_handles, legend_names, loc="lower center", ncol=2)

    plt.tight_layout()
    plt.grid()
    plt.savefig(f"img/mf_{f}.png")


plot(M_tokens, "tokens")
plot(M_lemmas, "lemmas")

In [None]:
def table(M):
    """Print the distance names, the ``mfs`` values and ``M[:, :, 0]`` transposed.

    After the transpose, rows follow the notebook-level ``distances_`` and
    columns follow ``mfs``.
    """
    names = [entry[-1].__name__ for entry in distances_]
    sizes_as_text = mfs.astype(str)
    print("\n".join(names))
    print("\n".join(sizes_as_text))
    print(M[:, :, 0].T)


table(M_tokens)
table(M_lemmas)

### Importance of the text size in stylometry

In [None]:
# Linking metrics obtained for each truncated text size.
M = []

# Text sizes to evaluate: 9000 down to 250 in steps of 250.
sizes = np.arange(9000, 0, -250, dtype=int)

for i in sizes:
    # Limit the data size: keep only the first i items of every text.
    Xi = [x[:i] for x in X_token_st_jean]
    rl = compute_links([Xi, 0, 750, True, 0.1, distances.cosine_distance])
    m = evaluate_linking(rl, Y_st_jean)
    print(i, m)
    M.append(m)

# One row per size, in the same order as `sizes`.
M = np.array(M)

In [None]:
print(sizes)
print(M)

# Columns 0 and 1 of M share the left axis; column 2 uses a secondary right
# axis created with twinx().
fig, ax1 = plt.subplots(figsize=(6, 4), dpi=200)
ax2 = ax1.twinx()
ax1.plot(sizes, M[:, 0], c="C0", ls="solid",
         label="Average Precision (AP)")
ax1.plot(sizes, M[:, 1], c="C0", ls="dashed", label="RPrec")
# Legend label spelled "HPrec" to match the axis label below and the other
# figures of this notebook (it was "HPRec").
ax2.plot(sizes, M[:, 2], c="C0", ls="dotted", label="HPrec")
ax1.set_xlabel("#Tokens per texts")
# Sizes are evaluated in decreasing order, so show the x axis decreasing too.
plt.gca().invert_xaxis()
ax1.set_ylabel("AP/RPrec")
ax2.set_ylabel("HPrec")
plt.xticks(np.arange(9000, -1, -1000, dtype=int))
fig.legend()
plt.tight_layout()
plt.savefig("img/degradation.png")

### Frequent errors

In [None]:
# NOTE(review): `rls_st_jean` is defined in a later cell ("Retained text
# representations"); that cell must be run first.
rls = rls_st_jean[0:4]
Y = Y_st_jean
X = X_token_st_jean

# Number of best-ranked false links inspected per rank list, and number of
# most frequent ones reported.
top_n = 20
keep = 2

# How many rank lists place each false link among their first `top_n` false links.
incorrectly_ranked = Counter()

for rl in rls:
    m = evaluate_linking(rl, Y)
    print(m)
    # False links of this rank list, in rank order.
    false_links = ((a, b) for (a, b), s in rl if Y[a] != Y[b])
    # Take exactly `top_n` of them (the previous counter-based loop recorded
    # top_n + 1 links before breaking).
    for link in itertools.islice(false_links, top_n):
        incorrectly_ranked[link] += 1

top_errors = incorrectly_ranked.most_common(keep)
print(top_errors)

features, mfw = most_frequent_word(X, 750, lidstone_lambda=1e0)

In [None]:
def plot(features, a, b, filename):
    """Overlay rows ``a`` and ``b`` of ``features`` as two bar charts.

    Both rows are reordered by decreasing mean of the two rows and drawn on a
    logarithmic y axis.  Legend entries use the notebook-level ``Y`` for the
    label and show the 1-based row index.  The figure is saved to ``filename``.
    """
    row_a = features[a, :]
    row_b = features[b, :]
    # Shared ordering: positions sorted by decreasing mean of both rows.
    pair_mean = np.mean(np.array([row_a, row_b]), axis=0)
    ranking = np.argsort(pair_mean)[::-1]
    positions = range(len(row_a))

    plt.figure(figsize=(4, 3), dpi=200)
    plt.yscale("log")
    for row, idx in ((row_a, a), (row_b, b)):
        plt.bar(positions, row[ranking], width=1, label=f"{Y[idx]} ({idx+1})", alpha=0.5)
    plt.legend()
    plt.xticks([], [])
    plt.xlabel("750-MF tokens vector")
    plt.ylabel("Relative word frequency")
    plt.tight_layout()
    plt.savefig(filename)

# One figure per most frequent false link.
for i, ((a, b), errors) in enumerate(top_errors):
    plot(features, a, b, f"img/mf_vector_error_{i}.png")

rl = rls[0]
# Recompute the metrics for this rank list: the `m` left over from the
# previous cell belongs to the last rank list of its loop, not to rls[0].
m = evaluate_linking(rl, Y)

# Best-ranked link.
(a, b), score = rl[0]
plot(features, a, b, "img/mf_vector_first_rl.png")

# Link at rank m[-1] (1-based), i.e. index m[-1] - 1.
(a, b), score = rl[int(m[-1] - 1)]
plot(features, a, b, "img/mf_vector_first_last_rl.png")

# Worst-ranked link.
(a, b), score = rl[-1]
plot(features, a, b, "img/mf_vector_last_rl.png")

## MF Letters/In-word $n$-grams

### Letters $n$-grams

In [None]:
# Corpus selection: uncomment exactly one of the following lines.
# X, Y = X_token_oxquarry, Y_oxquarry
X, Y = X_token_brunet, Y_brunet
# X, Y = X_token_st_jean, Y_st_jean

# Maps each n-gram type to the list of metrics obtained for every size in `mfws`.
M = defaultdict(list)

mfws = np.arange(500, 15000 + 1, 500)

# An int selects a single n-gram size; a tuple presumably combines several
# sizes (handled inside compute_links -- confirm there).
ngrams_types = [3, 4, 5, (2, 3), (3, 4), (4, 5)]
for ngrams_type in ngrams_types:
    print(ngrams_type)
    for mfw in mfws:
        rep = [X, ngrams_type, mfw, True, 1e-1, distances.cosine_distance]
        rl = compute_links(rep)
        m = evaluate_linking(rl, Y)
        M[ngrams_type].append(m)
        print(mfw, m)

# Freeze into a plain dict so a missing key raises instead of creating an entry.
M = dict(M)

In [None]:
# One curve per n-gram type: first metric of each evaluation against `mfws`.
plt.figure(figsize=(6, 4), dpi=200)
for ngrams_type in ngrams_types:
    # NOTE: X and Y are rebound here to plot data; they no longer hold the corpus.
    X = mfws
    Y = [i[0] for i in M[ngrams_type]]
    plt.plot(X, Y, label=f"Letters {str(ngrams_type)}-grams")
plt.legend()
plt.xlabel("$n$-MF letters $n$-grams")
plt.ylabel("Average Precision (AP)")
plt.tight_layout()
plt.savefig("img/letter_ngrams.png")

---

In [None]:
# Corpus selection: uncomment exactly one of the following lines.
# X, Y = X_token_oxquarry, Y_oxquarry
# X, Y = X_token_brunet, Y_brunet
X, Y = X_token_st_jean, Y_st_jean

# (n-gram size, number of most frequent n-grams) pairs to evaluate.
configurations = [
    (3, 3000),
    (4, 8000),
]

# Evaluate each configuration with every (zscore, distance) pair.
for n_grams_type, mfw in configurations:
    for zscore, distance in distances.vector_distances:
        rep = [X, n_grams_type, mfw, zscore, 1e-1, distance]
        rl = compute_links(rep)
        m = evaluate_linking(rl, Y)
        print(n_grams_type, mfw, distance.__name__, m)

### In-word $n$-grams, $n$-First letters, $n$-Last letters

In [None]:
# Corpus selection: uncomment exactly one of the following lines.
# X, Y = X_token_oxquarry, Y_oxquarry
# X, Y = X_token_brunet, Y_brunet
X, Y = X_token_st_jean, Y_st_jean

plt.figure(figsize=(6, 4), dpi=200)

# One colour per n; for each n, three derived representations are compared.
for n, c in zip([3, 4, 5], ["C0", "C1", "C2"]):
    print(n)
    word_begin_X = first_letters_cut(X, n)
    word_ngrams_X = word_n_grams(X, n)
    word_end_X = last_letters_cut(X, n)

    # First metric of evaluate_linking for each value of `mfs`.
    M_ngrams = []
    M_first = []
    M_last = []

    # NOTE: this rebinds the notebook-level `mfs` on every iteration.
    mfs = np.arange(200, 4000 + 1, 100)

    for mf in mfs:
        print(mf)
        rep = [word_ngrams_X, 0, mf, True, 1e-1, distances.cosine_distance]
        rl = compute_links(rep)
        m = evaluate_linking(rl, Y)
        M_ngrams.append(m[0])
        rep = [word_begin_X, 0, mf, True, 1e-1, distances.cosine_distance]
        rl = compute_links(rep)
        m = evaluate_linking(rl, Y)
        M_first.append(m[0])
        rep = [word_end_X, 0, mf, True, 1e-1, distances.cosine_distance]
        rl = compute_links(rep)
        m = evaluate_linking(rl, Y)
        M_last.append(m[0])

    # Line style encodes the representation, colour encodes n.
    plt.plot(mfs, M_ngrams, c=c, ls="solid")
    plt.plot(mfs, M_first, c=c, ls="dotted")
    plt.plot(mfs, M_last, c=c, ls="dashed")

# Proxy lines for the legend: three colours (n) then three line styles.
custom_lines = [
    Line2D([0], [0], color="C0", lw=2),
    Line2D([0], [0], color="C1", lw=2),
    Line2D([0], [0], color="C2", lw=2),
    Line2D([0], [0], color="k", lw=2, ls="solid"),
    Line2D([0], [0], color="k", lw=2, ls="dotted"),
    Line2D([0], [0], color="k", lw=2, ls="dashed"),
]

plt.legend(custom_lines, ["n = 3", "n = 4", "n = 5", "In-word $n$-grams",
                          "$n$-First", "$n$-Last"], loc="lower right")
plt.xlabel("$n$-MF")
plt.ylabel("Average Precision (AP)")
plt.tight_layout()
plt.savefig("img/first_last_letters_ngrams.png")

## MF POS $n$-grams

In [None]:
X, Y = X_pos_st_jean, Y_st_jean

# Maps each POS n-gram type to the metrics obtained for every evaluated size.
M = defaultdict(list)

mfws = np.arange(100, 2000 + 1, 100)
# The loop below and the plotting cell that follows both read `mfs`.  Bind it
# to the sizes declared for this experiment; previously `mfs` silently kept
# the 200..4000 range left over from the in-word n-grams cell and `mfws` was
# never used.
mfs = mfws

ngrams_types = [2, 3, 4, (2, 3)]
for ngrams_type in ngrams_types:
    print(ngrams_type)
    for mf in mfs:
        rep = [X, ngrams_type, mf, True, 1e-1, distances.cosine_distance]
        rl = compute_links(rep)
        m = evaluate_linking(rl, Y)
        M[ngrams_type].append(m)
        print(mf, m)

# Freeze into a plain dict so a missing key raises instead of creating an entry.
M = dict(M)

In [None]:
# One curve per POS n-gram type: first metric of each evaluation against `mfs`.
plt.figure(figsize=(6, 4), dpi=200)
for ngrams_type in ngrams_types:
    # NOTE: X and Y are rebound here to plot data; they no longer hold the corpus.
    X = mfs
    Y = [i[0] for i in M[ngrams_type]]
    plt.plot(X, Y, label=f"{str(ngrams_type)}-POS")
plt.legend()
plt.xlabel("$n$-MF")
plt.ylabel("Average Precision (AP)")
plt.tight_layout()
plt.savefig("img/n_pos.png")

---

In [None]:
X, Y = X_pos_st_jean, Y_st_jean

# NOTE: M is reset here but not filled by this cell.
M = defaultdict(list)

# (POS n-gram size, number of most frequent n-grams) pairs to evaluate.
configurations = [
    (2, 250),
    (3, 1000),
]

# Evaluate each configuration with every (zscore, distance) pair.
for ngrams_type, mf in configurations:
    for zscore, distance in distances.vector_distances:
        rep = [X, ngrams_type, mf, zscore, 1e-1, distance]
        rl = compute_links(rep)
        print(ngrams_type, mf, distance.__name__, evaluate_linking(rl, Y))

## All tokens

In [None]:
params = [
    distances.vector_distances,
    [
        (X_token_oxquarry, Y_oxquarry),
        (X_token_brunet, Y_brunet),
        (X_token_st_jean, Y_st_jean),
    ],
]


def do(mfw, remove_hapax):
    """Evaluate every (distance, corpus) combination of the notebook-level ``params``.

    ``mfw`` and ``remove_hapax`` are forwarded to ``compute_links`` as the
    third and seventh element of the representation list.

    Returns the first value of ``evaluate_linking`` for each combination, as an
    array shaped ``[len(p) for p in params]`` (distances x corpora).
    """
    combinations = list(itertools.product(*params))
    first_metric = []
    for (zscore, dist), (X, Y) in tqdm(combinations):
        rl = compute_links([X, 0, mfw, zscore, 1e-1, dist, remove_hapax])
        m = evaluate_linking(rl, Y)
        print(m, zscore, dist)
        first_metric.append(m[0])

    grid_shape = [len(p) for p in params]
    return np.array(first_metric).reshape(grid_shape)


M_750 = do(750, True)
M_without_hapax = do(np.inf, True)
M_with_hapax = do(np.inf, False)

In [None]:
def print_M(M):
    """Print a 2-D array row by row with row means, then the column means.

    Each row is followed by its mean (two decimals).  A final line starting
    with ``--`` shows the per-column means and the overall mean, followed by
    an empty line.
    """
    row_means = M.mean(axis=1)
    for row_index, row in enumerate(M):
        print(row, f"{row_means[row_index]:.2f}")
    column_means = M.mean(axis=0)
    overall_mean = M.mean()
    print(f"-- {column_means} {overall_mean:.2f}")
    print()
    
# Row labels: the rows of the printed matrices follow distances.vector_distances.
for d in distances.vector_distances:
    print(d[-1].__name__.capitalize())

# Results with hapax kept, then the differences of the two other settings
# relative to it.
print_M(M_with_hapax)
print_M(M_without_hapax - M_with_hapax)
print_M(M_750 - M_with_hapax)

## Compression based distances

In [None]:
# Corpus selection: uncomment exactly one of the following lines.
# X, Y = X_token_oxquarry, Y_oxquarry
X, Y = X_token_brunet, Y_brunet
# X, Y = X_token_st_jean, Y_st_jean

compression_methods = [
    compressions.lzma,
    compressions.bz2,
    compressions.gzip,
]
distance_funcs = [
    distances.ncd,
    distances.cbc,
]
# Every (compression method, distance function) combination.
distances_compressions = list(itertools.product(
    compression_methods, distance_funcs))

# Metrics and elapsed seconds of every run, in execution order.
M = []
T = []

# Each combination is run several times so the timings can be averaged.
n_runs = 3

for _ in range(n_runs):
    for compression_method, distance_func in distances_compressions:
        print(compression_method.__name__, distance_func.__name__)
        # perf_counter is a monotonic, high-resolution clock; unlike
        # time.time() it is not affected by system clock adjustments.
        t0 = time.perf_counter()
        rep = (X, compression_method, distance_func)
        rl = compute_links(rep)
        t = time.perf_counter() - t0
        m = evaluate_linking(rl, Y)
        M.append(m)
        T.append(t)
        print(m, t)

# Shape (runs, combinations, 3 metrics) and (runs, combinations), then
# average over the runs.
M = np.array(M).reshape(-1, len(distances_compressions), 3)
T = np.array(T).reshape(-1, len(distances_compressions))
M = M.mean(axis=0)
T = T.mean(axis=0)

In [None]:
print(M)
print(T)

# Scatter plot of the averaged metrics: column 1 on x, column 0 on y and
# column 2 as colour (see the axis and colour-bar labels below).
plt.figure(figsize=(6, 4), dpi=200)
x, y, c = M[:, 1], M[:, 0], M[:, 2]
plt.scatter(x, y, c=c, marker=".")
texts = []
for i, (compression_method, distance_func) in enumerate(distances_compressions):
    text = f"({compression_method.__name__}, {distance_func.__name__})"
    xy = (x[i], y[i])
    texts.append(plt.annotate(text, xy))
# Let adjustText reposition the annotations.
adjust_text(texts)
cbar = plt.colorbar()
plt.xlabel("RPrec")
plt.ylabel("Average precision (AP)")
cbar.set_label("HPrec")
plt.tight_layout()
plt.savefig("img/compression_evaluation.png")

## Individual methods summary

### Retained text representations

In [None]:
def tr9(X_token, X_pos):
    """Return the nine retained text representations.

    The first seven are exactly those of ``tr7(X_token)`` (reused so the two
    lists cannot drift apart); the last two are built on ``X_pos``.  Each
    entry is an argument for ``compute_links``.
    """
    return tr7(X_token) + [
        [X_pos, 2, 250, True, 1e-1, distances.cosine_distance],
        [X_pos, 3, 1000, True, 1e-1, distances.manhattan],
    ]


def tr7(X_token):
    """Return the seven retained text representations built on ``X_token``.

    Each entry is an argument for ``compute_links``: six list-shaped
    configurations followed by one ``(texts, compression, distance)`` tuple.
    """
    return [
        [X_token, 0, 750, True, 1e-1, distances.cosine_distance],
        [X_token, 0, 750, False, 1e-1, distances.clark],
        [X_token, 0, 750, True, 1e-1, distances.manhattan],
        [X_token, 0, 750, False, 1e-1, distances.tanimoto],
        [X_token, 3, 3000, True, 1e-1, distances.cosine_distance],
        [X_token, 4, 8000, True, 1e-1, distances.cosine_distance],
        (X_token, compressions.bz2, distances.cbc),
    ]


def tr(*X):
    """Return the retained text representations for the given inputs.

    With exactly two positional arguments they are forwarded to ``tr9``;
    otherwise only the first argument is used, with ``tr7``.
    """
    if len(X) != 2:
        return tr7(X[0])
    X_token, X_pos = X
    return tr9(X_token, X_pos)

In [None]:
# One rank list per retained representation (token-only, so tr() uses tr7),
# then one row of linking metrics per rank list.
rls_oxquarry = [compute_links(t) for t in tqdm(tr(X_token_oxquarry))]
print(np.array([evaluate_linking(rl, Y_oxquarry) for rl in rls_oxquarry]))

In [None]:
# One rank list per retained representation (token-only, so tr() uses tr7),
# then one row of linking metrics per rank list.
rls_brunet = [compute_links(t) for t in tqdm(tr(X_token_brunet))]
print(np.array([evaluate_linking(rl, Y_brunet) for rl in rls_brunet]))

In [None]:
# Token and POS inputs are both given, so tr() uses the nine representations of tr9.
rls_st_jean_A = [compute_links(t) for t in tqdm(tr(X_token_st_jean_A, X_pos_st_jean_A))]
print(np.array([evaluate_linking(rl, Y_st_jean_A) for rl in rls_st_jean_A]))

In [None]:
# Token and POS inputs are both given, so tr() uses the nine representations of tr9.
rls_st_jean_B = [compute_links(t) for t in tqdm(tr(X_token_st_jean_B, X_pos_st_jean_B))]
print(np.array([evaluate_linking(rl, Y_st_jean_B) for rl in rls_st_jean_B]))

In [None]:
# (name, rank lists, labels) triples iterated by the evaluation cells below.
# The full St-Jean corpus is not part of this list.
datasets_rls = [
    ("Oxquarry", rls_oxquarry, Y_oxquarry),
    ("Brunet", rls_brunet, Y_brunet),
    ("St-Jean A", rls_st_jean_A, Y_st_jean_A),
    ("St-Jean B", rls_st_jean_B, Y_st_jean_B),
]

In [None]:
# Rank lists for the full St-Jean corpus (nine representations of tr9).
rls_st_jean = [compute_links(t) for t in tqdm(tr(X_token_st_jean, X_pos_st_jean))]
print(np.array([evaluate_linking(rl, Y_st_jean) for rl in rls_st_jean]))

### Publication date differences analysis

In [None]:
# NOTE(review): `rl_z_score_st_jean` is defined in the "Fusion" section
# further down; that cell must be run first.
rl = rl_z_score_st_jean
Y = Y_st_jean
info = info_st_jean

Y = np.array(Y)
# Histogram bin width (same unit as the dates).
s = 5

# The date is read from the last element of each info entry.
dates = [int(i[-1]) for i in info]
plt.figure(figsize=(4, 3), dpi=200)
# np.arange excludes its stop value, so the edges are extended by one bin
# width: the last edge is then >= max(dates).  Without this, values above the
# last edge are ignored by hist and the latest texts are dropped.
plt.hist(dates, bins=np.arange(np.min(dates), np.max(dates) + s, s), density=True, alpha=0.7)
plt.xlabel("Date")
plt.ylabel("Density")
plt.tight_layout()
plt.savefig("img/dates_distribution.png")

print(evaluate_linking(rl, Y))

In [None]:
# Index pairs of the rank list, in rank order (scores are not needed here).
links = np.array([link for link, s in rl])

# Absolute date difference of the two texts of every link.
date_diffs = np.array([np.abs(dates[a] - dates[b]) for (a, b) in links])

# Boolean mask: True where both texts of the link carry the same label.
true_links = np.array([Y[a] == Y[b] for (a, b) in links])

r = compute_r(Y)
all_links_date_diff = date_diffs
true_links_date_diff = date_diffs[true_links]
false_links_date_diff = date_diffs[~true_links]

# First r true / false links in rank order.
top_r_true_links_date_diff = true_links_date_diff[0:r]
top_r_false_links_date_diff = false_links_date_diff[0:r]

print("all_links", all_links_date_diff.mean(), all_links_date_diff.std())
print("true_links", true_links_date_diff.mean(), true_links_date_diff.std())
print("false_links", false_links_date_diff.mean(), false_links_date_diff.std())


# Text indices are printed 1-based (+1).
print("largest true link date diff", true_links_date_diff.max(), links[true_links][true_links_date_diff.argmax()]+1)
# The second count uses `<= 5`; the printed label now says so (it read ">= 5").
print("true links = 0 : ", np.sum(true_links_date_diff == 0), "true links <= 5 : ", np.sum(true_links_date_diff <= 5), "true links total : ", np.sum(true_links))

print("Common date diffs for top-r false links > 35", Counter(top_r_false_links_date_diff[top_r_false_links_date_diff > 35]).most_common())

# Top-r false links whose date difference is exactly 62.
# NOTE(review): 62 is hard-coded, presumably read from the previous print.
common_false_link = links[~true_links][0:r][top_r_false_links_date_diff == 62]
for a, b in common_false_link:
    print(a+1, Y[a], b+1, Y[b])

# Bin edges for the histograms of the next cells.  np.arange excludes its
# stop value, so extend by one bin width to keep the largest difference
# inside the last bin (values beyond the last edge are ignored by hist).
bins = np.arange(0, np.max(all_links_date_diff) + s, s)
ticks = np.arange(date_diffs.min(), date_diffs.max(), 10)

In [None]:
data = [true_links_date_diff, false_links_date_diff]
# Named `link_colors` so the `matplotlib.colors` module imported at the top of
# the notebook is not overwritten.
link_colors = ["C2", "C3"]

plt.figure(figsize=(4, 3), dpi=200)

# Named `hist_params` so the notebook-level `params` list read by `do()` is
# not overwritten.
hist_params = {
    "bins" : bins,
    "color" : link_colors,
    "alpha" : 0.7,
    "density" : True,
    "stacked": True,
}
n, bins, patches = plt.hist(data, **hist_params)
for d, c in zip(data, link_colors):
    mean, std = d.mean(), d.std()
    # Dashed vertical line at the mean.
    plt.axvline(mean, c=c, linestyle="dashed")
    # Solid horizontal segment centred on the mean.
    # NOTE(review): `std // 2` is a floor division; `std / 2` may have been intended.
    plt.hlines(
        y=(n.max() - n.min()) / 2,
        xmin=mean - std // 2,
        xmax=mean + std // 2,
        color=c,
        linestyle="solid",
    )


plt.xlabel("Date difference")
plt.ylabel("Density")
plt.xticks(ticks)

# Proxy artists for the legend.
legend_elements = [
    Line2D([0], [0], color="k", lw=1, ls="dashed", label="Mean"),
    Line2D([0], [0], color="k", lw=1, ls="solid", label="Std"),
    Line2D([0], [0], color="C2", alpha=0.7, lw=4, label="True links"),
    Line2D([0], [0], color="C3", alpha=0.7, lw=4, label="False links"),
]

plt.legend(handles=legend_elements)
plt.tight_layout()
plt.savefig("img/dates_differences_true_false.png")

In [None]:
plt.figure(figsize=(4, 3), dpi=200)
data = top_r_false_links_date_diff
color = "C1"

# Named `hist_params` so the notebook-level `params` list read by `do()` is
# not overwritten.
hist_params = {
    "bins" : bins,
    "color" : color,
    "alpha" : 0.7,
    "density" : True,
}
n, bins, patches = plt.hist(data, **hist_params)
mean, std = data.mean(), data.std()
# Dashed vertical line at the mean.
plt.axvline(mean, c=color, linestyle="dashed")
# Solid horizontal segment centred on the mean.
# NOTE(review): `std // 2` is a floor division; `std / 2` may have been intended.
plt.hlines(
    y=(n.max() - n.min()) / 2,
    xmin=mean - std // 2,
    xmax=mean + std // 2,
    color=color,
    linestyle="solid",
)

plt.xlabel("Date difference")
plt.ylabel("Density")
plt.xticks(ticks)

# Proxy artists for the legend.
legend_elements = [
    Line2D([0], [0], color="k", lw=1, ls="dashed", label="Mean"),
    Line2D([0], [0], color="k", lw=1, ls="solid", label="Std"),
    Line2D([0], [0], color="C1", alpha=0.7, lw=4, label="top-r False links"),
]

plt.legend(handles=legend_elements)
plt.tight_layout()
plt.savefig("img/dates_differences_r_false.png")

### Distances matrix visualization

In [None]:
# Rank list and labels to visualise.
rl = rls_oxquarry[1]
Y = Y_oxquarry
# rl = rls_oxquarry[3]
# Y = Y_oxquarry

distances_matrix = distances_matrix_from_rank_list(rl)

# Optional: restrict the matrix to a subset of labels.
# subset = ["Sand", "Stael", "Gautier", "Regnier"]
# Y, distances_matrix = subset_Y_and_distance_matrix(Y, distances_matrix, subset)
Y, distances_matrix = sort_Y_and_distance_matrix(Y, distances_matrix)

# distances_matrix = np.zeros(distances_matrix.shape)

# One tick at the first index of every run of identical consecutive labels.
ticks = []
prev = None
for i, label in enumerate(Y):
    if prev != label:
        ticks.append(i)
    prev = label
# NOTE(review): np.unique returns sorted unique labels; this matches `ticks`
# only if the sorted Y groups each label in one run, in the same order -- confirm.
labels = np.unique(Y)

plt.figure(figsize=(4,3), dpi=200)
plt.imshow(distances_matrix, cmap="Blues")
plt.xticks(ticks, labels, rotation="vertical", fontsize="xx-small")
plt.yticks(ticks, labels, rotation="horizontal", fontsize="xx-small")
plt.tight_layout()
plt.savefig("img/distance_matrix.png")

# Clustering

## Clustering example

In [None]:
# Two synthetic 2-D point clouds of 100 points each, centred on (1, 1) and (-1, -1).
# NOTE: no random seed is set, so the figures differ between runs.
A = np.random.normal(loc=1, scale=0.5, size=(100, 2))
B = np.random.normal(loc=-1, scale=0.5, size=(100, 2))

# All 200 points in a single array.
AB = np.array(list(A) + list(B))

# Figure 1: every point in the same colour.
plt.figure(figsize=(4,3), dpi=200)
plt.scatter(AB[:,0], AB[:,1], c="k")
plt.axis("off")
plt.tight_layout()
plt.savefig("img/clustering_example_1.png")

# Figure 2: one colour per cloud.
plt.figure(figsize=(4,3), dpi=200)
plt.scatter(A[:,0], A[:,1])
plt.scatter(B[:,0], B[:,1])
plt.axis("off")
plt.tight_layout()
plt.savefig("img/clustering_example_2.png")

## Hierarchical clustering

In [None]:
# Mc: one [single, average, complete] row of best_clustering results per rank list.
# Ml: the linking metrics of the same rank lists, in the same order.
Mc = []
Ml = []

for name, rls, Y in datasets_rls:
    for (i, rl) in enumerate(rls):
        f1_a = best_clustering(rl, "single", Y)
        f1_b = best_clustering(rl, "average", Y)
        f1_c = best_clustering(rl, "complete", Y)
        Mc.append([f1_a, f1_b, f1_c])
        Ml.append(evaluate_linking(rl, Y))

        # LaTeX-style row (without line terminator).
        print(f"{i} & {name} & {f1_a:.2f} & {f1_b:.2f} & {f1_c:.2f}")

In [None]:
Mc = np.array(Mc)
Ml = np.array(Ml)

# Mc rows were appended dataset after dataset, so each dataset owns a
# contiguous slice of len(rls) rows; print the per-dataset column means.
i = 0
for name, rls, Y in datasets_rls:
    l = len(rls)
    print(np.mean(Mc[i:i+l, :], axis=0))
    i += l

# Column means over every rank list of every dataset.
print(np.mean(Mc, axis=0))

### Correlation AP/B3F1

In [None]:
def plot_linear_regression(X, Y):
    """Fit ``Y = slope * X + intercept`` with ``linregress`` and draw the line.

    Prints the slope and intercept, then the r-value, p-value and standard
    error as a LaTeX row fragment.  The fitted line is drawn on the current
    Matplotlib figure between ``min(X)`` and ``max(X)``.
    """
    slope, intercept, r_value, p_value, std_err = linregress(X, Y)
    print(slope, intercept)
    print(f"{r_value:.2f} & {p_value:.2e} & {std_err:.2e}")
    # Two end points are enough to draw a straight line.
    x_ends = np.array([np.min(X), np.max(X)])
    y_ends = slope * x_ends + intercept
    plt.plot(x_ends, y_ends, label="Linear regression")

# One figure per column of Mc (appended in the order single, average,
# complete): first linking metric against the clustering score.
for i in range(3):
    plt.figure(figsize=(5,4), dpi=200)
    plt.scatter(Ml[:, 0], Mc[:, i], label="Rank list / resulting clustering")
    plot_linear_regression(Ml[:, 0], Mc[:, i])
    plt.xlabel("AP")
    plt.ylabel("$B^3_{F_1}$")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"img/correlation_average_precision_b3f1_{i}.png")

## Silhouette-based Clustering

### Example

In [None]:
def plot(rl, Y, linkage, f):
    """Plot clustering metrics for every number of clusters of one rank list.

    Runs ``silhouette_based_clustering`` and ``clustering_at_every_n_clusters``
    on ``rl`` with the given ``linkage``, evaluates each clustering against
    ``Y`` and draws the four evaluation columns plus the silhouette curve.
    Vertical dashed lines mark the number of clusters found and the actual
    number of distinct labels.  The figure is saved to ``f``.
    """
    labels, silhouette_scores = silhouette_based_clustering(rl, linkage=linkage)
    ns, labels_list = clustering_at_every_n_clusters(rl, linkage=linkage)
    evaluations = np.array([evaluate_clustering(Y, labels) for labels in labels_list])

    n_clusters_found = len(np.unique(labels))
    n_clusters_actual = len(np.unique(Y))

    plt.figure(figsize=(6, 4), dpi=200)
    plt.plot(ns, evaluations[:, 1], label="$B^3_{precision}$")
    plt.plot(ns, evaluations[:, 2], label="$B^3_{recall}$")
    plt.plot(ns, evaluations[:, 0], label="$B^3_{F_1}$")
    plt.plot(ns, evaluations[:, 3], label="$r_{diff}$")
    plt.plot(*silhouette_scores, label="Mean Silhouette")
    plt.axvline(n_clusters_found, 0, 1, ls="dashed", c="C4", label="Maximal Mean Silhouette")
    plt.axvline(n_clusters_actual, 0, 1, ls="dashed", c="C2", label="Actual #Clusters")
    xmin, xmax, ymin, ymax = plt.axis()
    # Vertical centre of the axes.  The former `ymax / 2 - ymin / 2` is half
    # the axis height, which only equals the centre when ymin == -ymin, i.e.
    # it drifts upwards as soon as the lower limit is negative.
    ypos = (ymin + ymax) / 2
    plt.text(n_clusters_found, ypos, f"{n_clusters_found}", c="C4", rotation="vertical")
    plt.text(n_clusters_actual, ypos, f"{n_clusters_actual}", c="C2", rotation="vertical")
    plt.legend(loc="upper right")
    plt.xlabel("#Clusters")
    plt.ylabel("Metric")
    plt.grid()
    plt.tight_layout()
    plt.savefig(f)
    
# NOTE(review): the rl_z_score_* rank lists are defined in the "Fusion"
# section further down; those cells must be run first.
plot(rl_z_score_st_jean_A, Y_st_jean_A, "average", "img/silhouette_based_clustering_st_jean_A_average.png")
plot(rl_z_score_st_jean_B, Y_st_jean_B, "average", "img/silhouette_based_clustering_st_jean_B_average.png")

### Evaluation

In [None]:
# One [single, average, complete] row of clustering evaluations per rank list.
Mc = []
# alpha = 0
alpha = -0.2

for name, rls, Y in datasets_rls:
    for (i, rl) in enumerate(rls):
        labels_a, _ = silhouette_based_clustering(rl, "single", alpha)
        m_a = evaluate_clustering(Y, labels_a)

        labels_b, _ = silhouette_based_clustering(rl, "average", alpha)
        m_b = evaluate_clustering(Y, labels_b)

        labels_c, _ = silhouette_based_clustering(rl, "complete", alpha)
        m_c = evaluate_clustering(Y, labels_c)

        Mc.append([m_a, m_b, m_c])

        # Only the first and last evaluation values are printed.
        print(f"{i} & {name} & {m_a[[0,-1]]} & {m_b[[0,-1]]} & {m_c[[0,-1]]}")

In [None]:
# Absolute values are taken before averaging so that signed values do not
# cancel out.
# NOTE: this also rebinds Ml to its absolute values.
Mc = np.abs(np.array(Mc))
Ml = np.abs(np.array(Ml))

# Mc rows were appended dataset after dataset; print, for each dataset, the
# mean first and last evaluation value of every linkage.
i = 0
for name, rls, Y in datasets_rls:
    l = len(rls)
    print(np.mean(Mc[i:i+l, :], axis=0)[:,[0,-1]])
    i += l

# Same means over every rank list of every dataset.
print(np.mean(Mc, axis=0)[:,[0,-1]])

## Distribution-based Clustering

### Distribution-based model example

In [None]:
def plot(rl, Y, xlabel, f):
    """Plot the score densities of true and false links with fitted beta curves.

    Prints the linking metrics of ``rl``, draws one density histogram per link
    class (same label in ``Y`` or not), overlays the pdf returned by
    ``fit_beta`` for each class and marks the value returned by
    ``find_two_beta_same_area``.  The figure is saved to ``f``.
    """
    print(evaluate_linking(rl, Y))
    unzipped = list(zip(*rl))
    links = np.array(unzipped[0])
    scores = np.array(unzipped[1])
    is_true_link = np.array([Y[a] == Y[b] for a, b in links])

    # Scores outside [0, 1] are not probabilities: rescale them to that range.
    if np.max(scores) > 1 or np.min(scores) < 0:
        scores = normalize(scores, 0, 1)

    true_scores = scores[is_true_link]
    false_scores = scores[~is_true_link]

    plt.figure(figsize=(4,3), dpi=200)
    n_bins = 20
    plt.hist(true_scores, bins=n_bins, density=True, label="True links", alpha=0.5)
    plt.hist(false_scores, bins=n_bins, density=True, label="False links", alpha=0.5)

    def fitted_curve(values):
        # Fit a beta model and sample its pdf slightly inside the value range.
        model = fit_beta(values)
        xs = np.linspace(np.min(values)+1e-2, np.max(values)-1e-2, 200)
        return model, xs, model.pdf(xs)

    beta_true, x_true, y_true = fitted_curve(true_scores)
    beta_false, x_false, y_false = fitted_curve(false_scores)

    plt.plot(x_true, y_true, c="C0")
    plt.plot(x_false, y_false, c="C1")

    threshold = find_two_beta_same_area(beta_true, beta_false)
    plt.axvline(threshold, color="k", ls="dashed", label="Equiprobable")

    plt.ylabel("Density")
    plt.xlabel(xlabel)
    plt.legend()
    plt.tight_layout()
    plt.savefig(f)

# Example on the first rank list of St-Jean B.
rl = rls_st_jean_B[0]
Y = Y_st_jean_B

plot(rl, Y, "Normalized distance", "img/links_score_density.png")

### Evaluation

In [None]:
# One entry per (threshold source, evaluated dataset) pair.
Mc = []

# dts[k] holds one distance threshold per rank list of the k-th dataset.
dts = []

for name, rls, Y in datasets_rls:
    dts.append([dist_thresh_two_beta(rl, Y) for rl in rls])

# Outer factor: dataset the thresholds come from; inner factor: dataset evaluated.
for (training_id, dts_), (name, rls, Y) in itertools.product(enumerate(dts), datasets_rls):

    Mi = []

    # zip stops at the shorter sequence when the two datasets do not have the
    # same number of rank lists.
    for (i, rl), dt in zip(enumerate(rls), dts_):
        labels_a = clustering_at_dist_thresh(rl, "single", dt)
        m_a = evaluate_clustering(Y, labels_a)

        labels_b = clustering_at_dist_thresh(rl, "average", dt)
        m_b = evaluate_clustering(Y, labels_b)

        labels_c = clustering_at_dist_thresh(rl, "complete", dt)
        m_c = evaluate_clustering(Y, labels_c)

        Mi.append([m_a, m_b, m_c])

        print(f"{i} & {name} & {m_a[[0,-1]]} & {m_b[[0,-1]]} & {m_c[[0,-1]]}")

    # Mean absolute evaluation over the rank lists of this pair.
    Mi = np.array(Mi)
    Mc.append(np.mean(np.abs(Mi), axis=(0)))

Mc = np.array(Mc)

In [None]:
# Axes after the reshape: (threshold source, evaluated dataset, linkage, metric).
# Linkage index: 0 single, 1 average, 2 complete.  Only the first and last
# metric are reported.
Mc = Mc.reshape((-1, len(datasets_rls), 3, 4))
print(Mc.shape)

# For each linkage: full table, mean over sources, mean over evaluated
# datasets, overall mean.
print("Single")
print(Mc[:,:,0,[0, -1]])
print(np.mean(Mc[:,:,0,[0, -1]],axis=(0)))
print(np.mean(Mc[:,:,0,[0, -1]],axis=(1)))
print(np.mean(Mc[:,:,0,[0, -1]],axis=(0,1)))

print("Average")
print(Mc[:,:,1,[0, -1]])
print(np.mean(Mc[:,:,1,[0, -1]],axis=(0)))
print(np.mean(Mc[:,:,1,[0, -1]],axis=(1)))
print(np.mean(Mc[:,:,1,[0, -1]],axis=(0,1)))

print("Complete")
print(Mc[:,:,2,[0, -1]])
print(np.mean(Mc[:,:,2,[0, -1]],axis=(0)))
print(np.mean(Mc[:,:,2,[0, -1]],axis=(1)))
print(np.mean(Mc[:,:,2,[0, -1]],axis=(0,1)))

## Regression-based clustering

### Regression-based model example

In [None]:
rl = rl_z_score_st_jean_B
Y = Y_st_jean_B

# Features and 0/1 labels derived from the rank list.
X_rl = features_from_rank_list(rl)
Y_rl = labels_from_rank_list(rl, Y)

model = LogisticRegression(random_state=0).fit(X_rl, Y_rl)

X_rl = np.array(X_rl)
Y_rl = np.array(Y_rl)

X_true = X_rl[Y_rl == 1]
X_false = X_rl[Y_rl == 0]

# Bounding box of the two features.
min_ = X_rl.min(axis=0)
max_ = X_rl.max(axis=0)

print(min_, max_)

# Regular n x n grid covering the bounding box, flattened to (n * n, 2).
n = 250
x = np.linspace(min_[0], max_[0], n)
y = np.linspace(min_[1], max_[1], n)
xv, yv = np.meshgrid(x, y)

X_grid = np.array([xv, yv])
X_grid = X_grid.swapaxes(0, 2).reshape((-1, 2))

p = model.predict_proba(X_grid)

plt.figure(figsize=(6,5), dpi=150)

# Background: one pixel-sized marker per grid point, coloured by
# 1 - P(first class).
plt.scatter(X_grid[:, 0], X_grid[:, 1], c=1-p[:, 0], marker=",", alpha=1, s=1, cmap="RdYlGn")

cbar = plt.colorbar()
cbar.set_label("True link model probability")

plt.scatter(X_false[:, 0], X_false[:, 1], alpha=0.5, s=6, c="C1", label="Actual false links")
plt.scatter(X_true[:, 0], X_true[:, 1], alpha=0.5, s=6, c="C0", label="Actual true links")

# Raw string: "\l" is not a valid escape sequence, and recent Python versions
# warn about it in ordinary string literals.  The resulting text is unchanged.
plt.xlabel(r"$\log rank/|L|$")
plt.ylabel("Score")
plt.legend()
plt.tight_layout()
plt.savefig("img/logistic_example.png")

### Evaluation

In [None]:
# One entry per (training dataset, testing dataset) pair, training-major.
Mc = []

for (name_training, rls_training, Y_training), (name_testing, rls_testing, Y_testing) in itertools.product(datasets_rls, datasets_rls):

    Mi = []

    # Rank lists are paired by position; zip stops at the shorter dataset.
    for i, (rl_training, rl_testing) in enumerate(zip(rls_training, rls_testing)):
        dt = dist_thresh_logistic_regression(rl_training, Y_training, rl_testing)

        labels_a = clustering_at_dist_thresh(rl_testing, "single", dt)
        m_a = evaluate_clustering(Y_testing, labels_a)

        labels_b = clustering_at_dist_thresh(rl_testing, "average", dt)
        m_b = evaluate_clustering(Y_testing, labels_b)

        labels_c = clustering_at_dist_thresh(rl_testing, "complete", dt)
        m_c = evaluate_clustering(Y_testing, labels_c)

        Mi.append([m_a, m_b, m_c])

        print(f"{i} {name_training} & {name_testing} & {m_a[[0,-1]]} & {m_b[[0,-1]]} & {m_c[[0,-1]]}")

    # Mean absolute evaluation over the rank lists of this pair.
    Mi = np.array(Mi)
    Mc.append(np.mean(np.abs(Mi), axis=(0)))

Mc = np.array(Mc)

In [None]:
# Axes after the reshape: (training dataset, testing dataset, linkage, metric).
# Linkage index: 0 single, 1 average, 2 complete.  Only the first and last
# metric are reported.
Mc = Mc.reshape((-1, len(datasets_rls), 3, 4))
print(Mc.shape)

# For each linkage: full table, mean over training datasets, mean over
# testing datasets, overall mean.
print("Single")
print(Mc[:,:,0,[0, -1]])
print(np.mean(Mc[:,:,0,[0, -1]],axis=(0)))
print(np.mean(Mc[:,:,0,[0, -1]],axis=(1)))
print(np.mean(Mc[:,:,0,[0, -1]],axis=(0,1)))

print("Average")
print(Mc[:,:,1,[0, -1]])
print(np.mean(Mc[:,:,1,[0, -1]],axis=(0)))
print(np.mean(Mc[:,:,1,[0, -1]],axis=(1)))
print(np.mean(Mc[:,:,1,[0, -1]],axis=(0,1)))

print("Complete")
print(Mc[:,:,2,[0, -1]])
print(np.mean(Mc[:,:,2,[0, -1]],axis=(0)))
print(np.mean(Mc[:,:,2,[0, -1]],axis=(1)))
print(np.mean(Mc[:,:,2,[0, -1]],axis=(0,1)))

# Fusion

In [None]:
# Fuse the Oxquarry rank lists with fusion_z_score.
rl_z_score_oxquarry = fusion_z_score(rls_oxquarry)

In [None]:
# Fuse the Brunet rank lists with fusion_z_score.
rl_z_score_brunet = fusion_z_score(rls_brunet)

In [None]:
# Fuse the St-Jean A rank lists with fusion_z_score.
rl_z_score_st_jean_A = fusion_z_score(rls_st_jean_A)

In [None]:
# Fuse the St-Jean B rank lists with fusion_z_score.
rl_z_score_st_jean_B = fusion_z_score(rls_st_jean_B)

In [None]:
# (name, fused rank list, labels) triples, in the same dataset order as datasets_rls.
datasets_zscores = [
    ("Oxquarry", rl_z_score_oxquarry, Y_oxquarry),
    ("Brunet", rl_z_score_brunet, Y_brunet),
    ("St-Jean A", rl_z_score_st_jean_A, Y_st_jean_A),
    ("St-Jean B", rl_z_score_st_jean_B, Y_st_jean_B),
]

In [None]:
# Fuse the rank lists of the full St-Jean corpus with fusion_z_score.
rl_z_score_st_jean = fusion_z_score(rls_st_jean)

In [None]:
M_zscores = []
# Linking metrics of every z-score fused rank list, appended to M_zscores
# (initialised on the line above).
for name, rl, Y in datasets_zscores:
    m = evaluate_linking(rl, Y)
    print(name, m)
    M_zscores.append(m)

In [None]:
# Mean of each linking metric over the datasets.
M_zscores = np.array(M_zscores)
print(np.mean(M_zscores, axis=(0)))

In [None]:
# rls_regression[i][j] / M_regression[i][j]: fused rank list / linking metrics
# for testing dataset i fused with the models trained on dataset j.
rls_regression = []
M_regression = []

# The models depend only on the training dataset, so train them once per
# dataset instead of once per (testing, training) pair.
models_by_training = [
    [fusion_regression_training(rl, Y_training)[0] for rl in rls_training]
    for name_training, rls_training, Y_training in datasets_rls
]

for name_testing, rls_testing, Y_testing in datasets_rls:
    rls = []
    ms = []
    for (name_training, rls_training, Y_training), models in zip(datasets_rls, models_by_training):
        rl = fusion_regression(models, rls_testing)
        m = evaluate_linking(rl, Y_testing)
        rls.append(rl)
        ms.append(m)
        print(name_testing, name_training, m)
    rls_regression.append(rls)
    M_regression.append(ms)

In [None]:
# Axes: (testing dataset, training dataset, metric).
M_regression = np.array(M_regression)

# Mean over testing datasets, over training datasets, and over both.
print(np.mean(M_regression, axis=(0)))
print(np.mean(M_regression, axis=(1)))
print(np.mean(M_regression, axis=(0,1)))

In [None]:
# Baseline: mean linking metrics of the individual rank lists of each dataset.
Ml = []

for name, rls, Y in datasets_rls:
    Mi = []
    for rl in rls:
        m = evaluate_linking(rl, Y)
        Mi.append(m)
    Ml.append(np.mean(Mi, axis=0))

# One row per dataset; print the mean over datasets.
Ml = np.array(Ml)
print(np.mean(Ml, axis=0))

## Rank list fusion evaluation

In [None]:
rls_training, Y_training = rls_st_jean_B, Y_st_jean_B
rls_testing, Y_testing = rls_st_jean_A, Y_st_jean_A

# rls_training, Y_training = rls_st_jean_A, Y_st_jean_A
# rls_testing, Y_testing = rls_st_jean_B, Y_st_jean_B

# Number of rank lists combined in each fusion.
fusion_size = 4

# models[i] is trained on the i-th training rank list.
models = []
print("Training rank lists")
for i, rl in enumerate(rls_training):
    model, rmse = fusion_regression_training(rl, Y_training)
    models.append(model)
    mesures = evaluate_linking(rl, Y_training)
    print(i, mesures, np.array([rmse]))

# Linking metrics of every individual testing rank list.
M_single = []
rank_lists = []
print("Testing rank lists")
for i, rl in enumerate(rls_testing):
    rank_lists.append(rl)
    mesures = evaluate_linking(rl, Y_testing)
    M_single.append(mesures)
    print(i, mesures)

M_single = np.array(M_single)

M_single_max = []
M_single_mean = []
M_fusion_z_score = []
M_fusion_regression = []

# Every combination of `fusion_size` rank-list indices.  The same indices are
# used for the testing rank lists and the trained models, which assumes both
# datasets have the same number of rank lists.
tr_ids = np.array(
    list(itertools.combinations(range(len(rls_training)), fusion_size)))

for tr_id in tr_ids:
    rls = [rank_lists[i] for i in tr_id]
    # Models matching the selected rank lists.  Passing the full `models`
    # list with a 4-list subset would presumably pair the rank lists with the
    # first models by position rather than with their own models -- confirm
    # against fusion_regression.
    models_subset = [models[i] for i in tr_id]

    # Baselines: best and average single rank list of the combination.
    m_single_max = np.max(M_single[tr_id, :], axis=0)
    M_single_max.append(m_single_max)

    m_single_mean = np.mean(M_single[tr_id, :], axis=0)
    M_single_mean.append(m_single_mean)

    rl_z_score = fusion_z_score(rls)
    m_z_score = evaluate_linking(rl_z_score, Y_testing)
    M_fusion_z_score.append(m_z_score)

    rl_regression = fusion_regression(models_subset, rls)
    m_regression = evaluate_linking(rl_regression, Y_testing)
    M_fusion_regression.append(m_regression)

# One row per combination, in the order of tr_ids.
M_single_max = np.array(M_single_max)
M_single_mean = np.array(M_single_mean)
M_fusion_z_score = np.array(M_fusion_z_score)
M_fusion_regression = np.array(M_fusion_regression)

In [None]:
print("Plot")
# Scatter plot of the linking metrics: column 1 on x, column 0 on y and
# column 2 as colour, with one marker style per method.
plt.figure(figsize=(6, 4), dpi=200)
x, y, c = M_single[:, 1], M_single[:, 0], M_single[:, 2]
plt.scatter(x, y, c=c, marker="o", label="Single rank list", alpha=0.8)
x, y, c = M_fusion_regression[:,
                              1], M_fusion_regression[:, 0], M_fusion_regression[:, 2]
plt.scatter(x, y, c=c, marker="x",
            label=f"Regression fusions ({fusion_size} lists)", alpha=0.5)
x, y, c = M_fusion_z_score[:,
                           1], M_fusion_z_score[:, 0], M_fusion_z_score[:, 2]
plt.scatter(x, y, c=c, marker="+",
            label=f"Z-score fusions ({fusion_size} lists)", alpha=0.5)
cbar = plt.colorbar()
plt.xlabel("RPrec")
plt.ylabel("Average precision (AP)")
cbar.set_label("HPrec")
plt.legend()
plt.tight_layout()
plt.savefig("img/fusion_evaluation.png")

In [None]:
# Banner for the statistics printed by this cell.
print("Fusion Statistics")

def print_statistics_latex(M_list):
    values = [f"${v:.2f}$" for v in M_list.min(axis=0)]
    print("Min &", " & ".join(values), r"\\")
    values = [f"${v1:.2f}\pm{v2:.2f}$" for v1, v2 in zip(M_list.mean(axis=0), M_list.std(axis=0))]
    print("Avg$\pm$Std &", " & ".join(values), r"\\")
    values = [f"${v:.2f}$" for v in M_list.max(axis=0)]
    print("Max &", " & ".join(values), r"\\")
    argmin = tr_ids[np.argmin(M_list, axis=0)]
    print("Argmin &", " & ".join(
        [np.array2string(a, separator=",") for a in argmin]), r"\\")
    argmax = tr_ids[np.argmax(M_list, axis=0)]
    print("Argmax &", " & ".join(
        [np.array2string(a, separator=",") for a in argmax]), r"\\")

# LaTeX statistics table for each family of results.
_tables = (
    ("Single mean", M_single_mean),
    ("Single max", M_single_max),
    ("Z-score", M_fusion_z_score),
    ("Regression", M_fusion_regression),
)
for _title, _scores in _tables:
    print(_title)
    print_statistics_latex(_scores)

# Sign tests of each fusion against both single-list baselines.
print("Fusion sign tests")
_comparisons = (
    ("Z-score/T/Single-mean", M_fusion_z_score, M_single_mean),
    ("Z-score/T/Single-max", M_fusion_z_score, M_single_max),
    ("Regression/T/Single-mean", M_fusion_regression, M_single_mean),
    ("Regression/T/Single-max", M_fusion_regression, M_single_max),
)
for _title, _fused, _baseline in _comparisons:
    print(_title)
    print(*sign_test(_fused, _baseline))

## Veto evaluation

In [None]:
def _method(threshold, value):
    """Build a veto function that overwrites scores below ``threshold``.

    The returned callable modifies its argument in place, setting every
    entry strictly smaller than ``threshold`` to ``value``, and returns that
    same object.
    """
    def _apply_veto(scores):
        below_threshold = scores < threshold
        scores[below_threshold] = value
        return scores

    return _apply_veto

def _veto_fusions(rls_training, rls_testing, Y_training, Y_testing, value,
                  thresholds=None):
    """Return the AP change caused by the veto at each threshold.

    One regression model is trained per training rank list. The testing rank
    lists are then fused once without veto (the baseline) and once per
    threshold with ``_method(threshold, value)`` passed as ``alter_scores``.

    Parameters
    ----------
    rls_training, rls_testing
        Rank lists used to train the models and to evaluate the fusion.
    Y_training, Y_testing
        Labels matching the training and testing rank lists.
    value
        Replacement given to scores below the threshold.
    thresholds : iterable, optional
        Thresholds to evaluate. Defaults to the notebook-level ``x`` so that
        existing five-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        First metric of ``evaluate_linking`` with veto minus the same metric
        without veto, one entry per threshold.
    """
    if thresholds is None:
        thresholds = x  # legacy behaviour: read the notebook global

    models = [fusion_regression_training(rl, Y_training)[0]
              for rl in rls_training]
    rl_no_veto = fusion_regression(models, rls_testing)
    baseline = evaluate_linking(rl_no_veto, Y_testing)[0]

    scores = []
    for threshold in thresholds:
        rl_veto = fusion_regression(
            models, rls_testing, alter_scores=_method(threshold, value))
        scores.append(evaluate_linking(rl_veto, Y_testing)[0])
    return np.array(scores) - baseline

def _plot(rls_A, rls_B, value, c, x):
    """Plot the veto gain over thresholds ``x`` in both train/test directions.

    The dotted curve trains on A and tests on B, the dashed curve does the
    opposite. Labels ``Y_A`` and ``Y_B`` are read from the notebook scope.
    A LaTeX-style summary row with the best gain and its threshold for each
    direction is printed.
    """
    # Thresholds are passed explicitly so both functions use the same values.
    y1 = _veto_fusions(rls_A, rls_B, Y_A, Y_B, value, thresholds=x)
    plt.plot(x, y1, ls="dotted", c=c, alpha=0.5)
    y2 = _veto_fusions(rls_B, rls_A, Y_B, Y_A, value, thresholds=x)
    plt.plot(x, y2, ls="dashed", c=c, alpha=0.5)
    print(f"Set to {value} & {np.max(y1):.2e}/{x[np.argmax(y1)]:.2f} & {np.max(y2):.2e}/{x[np.argmax(y2)]:.2f}")
    
rls_A, rls_B = rls_st_jean_A, rls_st_jean_B

# Thresholds to test and the replacement values for vetoed scores.
x = np.linspace(0.01, 0.25, 25)
values = [0, -1, -len(rls_A), -np.inf]

# Legend: line style encodes the train/test direction, colour the value.
_direction_lines = [
    Line2D([0], [0], color="k", lw=2, ls=_style)
    for _style in ("dotted", "dashed")
]
_value_lines = [
    Line2D([0], [0], color=f"C{_k}", lw=2) for _k, _ in enumerate(values)
]
custom_lines = _direction_lines + _value_lines
labels = ["Train A / Test B", "Train B / Test A"]
labels += [f"Set {_v}" for _v in values]

plt.figure(figsize=(6, 4), dpi=200)
for i, value in enumerate(values):
    _plot(rls_A, rls_B, value, f"C{i}", x)
plt.xlabel("Threshold")
plt.ylabel("AP gain")
plt.legend(custom_lines, labels)
plt.tight_layout()
plt.savefig("img/veto.png")

## Distance over rank



In [None]:
# Build one rank list and show how the distance (last field of each entry)
# evolves with the rank.
_link_args = [X_token_brunet, 0, 500, True, 1e-1, distances.manhattan]
rank_list = compute_links(_link_args)
print(len(rank_list))
print(rank_list[0:10])

_ranks = range(len(rank_list))
_dists = [_link[-1] for _link in rank_list]

plt.figure(figsize=(4, 3), dpi=200)
plt.scatter(_ranks, _dists, s=1, marker=",")
plt.xlabel("Rank")
plt.ylabel("Distance")
plt.tight_layout()
plt.savefig("img/distance_over_rank.png")

## S-Curve

In [None]:
# Plot the S-curve for a sweep of the `c` parameter (with r fixed to 0.5),
# one thin line per value, coloured along the hsv colormap.
scale = 1000
plt.figure(figsize=(4, 3), dpi=200)

min_ = 0
max_ = 5
zoom_factors = np.arange(min_, max_, 0.01)

# One colour per curve: the axes are created by the first plt.plot call and
# pick up this cycle at that point.
plt.rcParams["axes.prop_cycle"] = plt.cycler(
    "color", plt.cm.hsv(np.linspace(0, 1, len(zoom_factors))))

try:
    for i in zoom_factors:
        y = s_curves.sigmoid_reciprocal(c=i, r=0.5)(scale)
        plt.plot(y, linewidth=0.2)

    # The mappable is not attached to any Axes, so the Axes to take space
    # from must be given explicitly (required by recent matplotlib versions).
    cbar = plt.colorbar(
        plt.cm.ScalarMappable(norm=colors.Normalize(min_, max_), cmap="hsv"),
        ax=plt.gca())
    cbar.set_label("c")
    plt.xlabel("Corresponding rank")
    plt.ylabel("S-Curve weighting")
    plt.tight_layout()
    plt.savefig("img/s_curve_c.png")
finally:
    # Restore the default colour cycle even if plotting failed.
    plt.rcParams['axes.prop_cycle'] = plt.rcParamsDefault['axes.prop_cycle']

---

In [None]:
# Plot the S-curve for a sweep of the `r` parameter (other parameters left at
# their defaults), one thin line per value, coloured along the hsv colormap.
scale = 1000
plt.figure(figsize=(4, 3), dpi=200)

min_ = 0.1
max_ = 0.9
rs = np.arange(min_, max_, 0.001)

# One colour per curve: the axes are created by the first plt.plot call and
# pick up this cycle at that point.
plt.rcParams["axes.prop_cycle"] = plt.cycler(
    "color", plt.cm.hsv(np.linspace(0, 1, len(rs))))

try:
    for ri in rs:
        y = s_curves.sigmoid_reciprocal(r=ri)(scale)
        plt.plot(y, linewidth=0.2)

    # The mappable is not attached to any Axes, so the Axes to take space
    # from must be given explicitly (required by recent matplotlib versions).
    cbar = plt.colorbar(
        plt.cm.ScalarMappable(norm=colors.Normalize(min_, max_), cmap="hsv"),
        ax=plt.gca())
    cbar.set_label("r")
    plt.xlabel("Corresponding rank")
    plt.ylabel("S-Curve weighting")
    plt.tight_layout()
    plt.savefig("img/s_curve_r.png")
finally:
    # Restore the default colour cycle even if plotting failed.
    plt.rcParams['axes.prop_cycle'] = plt.rcParamsDefault['axes.prop_cycle']

---

In [None]:
# Single example S-curve (r=0.85, c=4) evaluated at `scale`.
scale = 1000
plt.figure(figsize=(4, 3), dpi=200)
y = s_curves.sigmoid_reciprocal(r=0.85, c=4)(scale)
plt.plot(y)
plt.xlabel("Corresponding rank")  # label typo fixed ("Corrresponding")
plt.ylabel("S-Curve weighting")
plt.tight_layout()
plt.savefig("img/s_curve_example.png")

## Soft-veto evaluation

In [None]:
# Corpus under study. Alternatives: (rls_brunet, Y_brunet) or
# (rls_st_jean, Y_st_jean).
rls, Y = rls_oxquarry, Y_oxquarry

# Reference score: plain z-score fusion without any soft veto.
M_vanilla = evaluate_linking(fusion_z_score(rls), Y)[0]

# Grid of S-curve parameters to explore.
resolution = 21
cs = np.linspace(1e-6, 20, resolution)
rs = np.linspace(0.1, 0.9, resolution)
print(cs)
print(rs)

# All (c, r) pairs, c varying slowest; row k of the reshaped score matrix
# therefore corresponds to cs[k].
c_r = np.array(list(itertools.product(cs, rs)))

M_softveto = []
for c_i, r_i in tqdm(c_r):
    s_curve = s_curves.sigmoid_reciprocal(c=c_i, r=r_i)
    rls_veto = [s_curves.soft_veto(single_rl, s_curve) for single_rl in rls]
    rl = fusion_z_score(rls_veto)
    M_softveto.append(evaluate_linking(rl, Y)[0])
M_softveto = np.array(M_softveto).reshape((resolution, -1))

M_gain = M_softveto - M_vanilla

# Symmetric colour limits so that a zero gain sits at the colormap centre.
vmax = np.abs(M_gain).max()

_best = np.argmax(M_gain)
print(f"{M_gain.max():.2e} / {c_r[_best]}")

plt.figure(figsize=(4,3), dpi=200)
plt.scatter(x=c_r[:, 0], y=c_r[:, 1], c=M_gain, cmap="RdYlGn", marker="s",vmin=-vmax, vmax=vmax)
cbar = plt.colorbar()
cbar.set_label("AP gain")
plt.xlabel("c")
plt.ylabel("r")
plt.xticks(np.linspace(cs.min(), cs.max(), 5))
plt.yticks(np.linspace(rs.min(), rs.max(), 5))
plt.tight_layout()
plt.savefig("img/soft_veto.png")

## Relation between the average-precision fusion gain and rank-list diversity

In [None]:
%autoreload 2

def compute_method_rank_list(rls):
    """Return the square matrix of pairwise distances between rank lists.

    Entry ``[i, j]`` is ``rank_list_distance(rls[i], rls[j])``; every ordered
    pair is evaluated, including each list against itself.
    """
    n = len(rls)
    pairwise = [
        rank_list_distance(rls[row], rls[col])
        for row, col in itertools.product(range(n), repeat=2)
    ]
    return np.array(pairwise).reshape((n, n))

def get_ranks(d, rls, Y, aggregation):
    """Relate the distance between two rank lists to their fusion gain.

    For every unordered pair of rank lists, the two lists are fused with the
    z-score method and the first metric of the fusion is compared with the
    aggregated (e.g. mean or max) first metric of the two single lists.

    Parameters
    ----------
    d : numpy.ndarray
        Square matrix of pairwise rank-list distances.
    rls : sequence
        Rank lists, indexed consistently with ``d``.
    Y
        Labels passed to ``evaluate_linking``.
    aggregation : callable
        Called as ``aggregation(metrics, axis=0)`` to combine the metrics of
        the two single lists.

    Returns
    -------
    tuple of numpy.ndarray
        Distances ``d[i, j]`` and the matching gains, one entry per pair.
    """
    pair_dists, gains = [], []
    for pair in itertools.combinations(range(d.shape[0]), 2):
        members = [rls[k] for k in pair]

        single_metrics = [evaluate_linking(member, Y) for member in members]
        aggregated = aggregation(single_metrics, axis=0)
        fused = evaluate_linking(fusion_z_score(members), Y)

        # Only the first metric of the difference is kept as the gain.
        gains.append((fused - aggregated)[0])
        pair_dists.append(d[pair])

    return np.array(pair_dists), np.array(gains)


# Corpus under study. Alternatives: (rls_brunet, Y_brunet) or
# (rls_st_jean, Y_st_jean).
rls, Y = rls_oxquarry, Y_oxquarry

# Pairwise distances between the rank lists of the selected corpus.
d = compute_method_rank_list(rls)
print(d)

def plot(aggregation):
    """Scatter fusion gain against rank-list distance with a regression line.

    Uses the notebook-level ``d``, ``rls`` and ``Y``. The slope and intercept
    are printed, followed by a LaTeX-style row with the r-value, the p-value
    and the standard error. The figure is saved under a name derived from
    ``aggregation.__name__``.
    """
    plt.figure(figsize=(6,4), dpi=200)
    dist_values, ap_gain = get_ranks(d, rls, Y, aggregation)

    fit = linregress(dist_values, ap_gain)
    print(fit.slope, fit.intercept)
    print(f"{fit.rvalue:.2f} & {fit.pvalue:.2e} & {fit.stderr:.2e}")

    # The fitted line only needs its two end points.
    line_x = np.array([dist_values.min(), dist_values.max()])
    line_y = fit.slope * line_x + fit.intercept

    plt.scatter(dist_values, ap_gain, label="2 rank lists z-score fusion")
    plt.plot(line_x, line_y, label="Linear regression")
    plt.xlabel(r"Kendall-$\tau$ coefficient")
    plt.ylabel("AP gain")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"img/rank_list_correlation_{aggregation.__name__}.png", facecolor="w")

for _aggregation in (np.mean, np.max):
    plot(_aggregation)

# Clustering with fused rank lists

## Upper bound

In [None]:
# Result of best_clustering on each dataset for the three linkage criteria.
Mc = []

for name, rl, Y in datasets_zscores:
    f1_a, f1_b, f1_c = (
        best_clustering(rl, _linkage, Y)
        for _linkage in ("single", "average", "complete")
    )
    Mc.append([f1_a, f1_b, f1_c])

    # LaTeX table row: dataset name followed by the three scores.
    print(f"{name} & {f1_a:.2f} & {f1_b:.2f} & {f1_c:.2f}")


In [None]:
# Average over datasets (rows), giving one value per linkage criterion.
Mc = np.array(Mc)
print(np.shape(Mc))

print(Mc.mean(axis=0))

## Silhouette-based clustering evaluation

In [None]:
linkages = ["single", "average", "complete"]

# alpha = 0
alpha = -0.2

# Silhouette-based clustering for every (dataset, linkage) combination,
# datasets varying slowest.
M = []
for (_, rl, Y), linkage in itertools.product(datasets_zscores, linkages):
    labels, silhouette_scores = silhouette_based_clustering(
        rl, linkage=linkage, alpha=alpha)
    m = evaluate_clustering(Y, labels)
    M.append(m)

# Axes: dataset, linkage, metric.
M = np.array(M).reshape(len(datasets_zscores), len(linkages), -1)

# Only the first and the last metric are reported.
_first_last = [0, -1]
print(M[:, :, _first_last])
print(np.abs(M).mean(axis=0)[:, _first_last])
# print(np.abs(M).mean(axis=(1))[:,(0,-1)])
# print(np.abs(M).mean(axis=(0,1))[[0, -1]])

## Distribution-based clustering evaluation

In [None]:
Mc = []

# One distance threshold per dataset, estimated by dist_thresh_two_beta.
dts = [dist_thresh_two_beta(rl, Y) for name, rl, Y in datasets_zscores]

print(dts)

# Apply every threshold to every dataset, for the three linkage criteria.
for (training_id, dt), (name, rl, Y) in itertools.product(enumerate(dts), datasets_zscores):
    m_a, m_b, m_c = (
        evaluate_clustering(Y, clustering_at_dist_thresh(rl, _linkage, dt))
        for _linkage in ("single", "average", "complete")
    )

    # LaTeX table row showing the first and last metric for each linkage.
    print(f"{training_id} & {name} & {m_a[[0,-1]]} & {m_b[[0,-1]]} & {m_c[[0,-1]]}")

    Mc.append([m_a, m_b, m_c])

Mc = np.array(Mc)

In [None]:
# Mc was filled by iterating over `datasets_zscores`, so that collection (not
# `datasets_rls`) gives the size of the dataset axis.
# Axes after reshape: threshold source, evaluated dataset, linkage, metric.
# The sizes 3 (linkages) and 4 (values per evaluation) are kept as hard-coded.
Mc = Mc.reshape((-1, len(datasets_zscores), 3, 4))
print(Mc.shape)

for _linkage_id, _title in enumerate(("Single", "Average", "Complete")):
    # First and last metric for this linkage.
    _scores = Mc[:, :, _linkage_id, [0, -1]]
    print(_title)
    print(_scores)
    print(np.mean(_scores, axis=0))       # averaged over the first axis
    print(np.mean(_scores, axis=1))       # averaged over the second axis
    print(np.mean(_scores, axis=(0, 1)))  # averaged over both
    if _title != "Complete":
        print()

## Regression-based clustering evaluation

In [None]:
Mc = []

# Every (training dataset, testing dataset) pair, including a dataset with
# itself: the threshold comes from dist_thresh_logistic_regression and is
# applied to the testing rank list with the three linkage criteria.
_dataset_pairs = itertools.product(datasets_zscores, repeat=2)
for (name_training, rl_training, Y_training), (name_testing, rl_testing, Y_testing) in _dataset_pairs:
    dt = dist_thresh_logistic_regression(rl_training, Y_training, rl_testing)

    Mi = [
        evaluate_clustering(
            Y_testing, clustering_at_dist_thresh(rl_testing, _linkage, dt))
        for _linkage in ("single", "average", "complete")
    ]
    m_a, m_b, m_c = Mi

    # LaTeX table row showing the first and last metric for each linkage.
    print(f"{name_training} & {name_testing} & {m_a[[0,-1]]} & {m_b[[0,-1]]} & {m_c[[0,-1]]}")

    Mc.append(Mi)

Mc = np.array(Mc)

In [None]:
# Mc was filled by iterating over `datasets_zscores`, so that collection (not
# `datasets_rls`) gives the size of the dataset axis.
# Axes after reshape: training dataset, testing dataset, linkage, metric.
# The sizes 3 (linkages) and 4 (values per evaluation) are kept as hard-coded.
Mc = Mc.reshape((-1, len(datasets_zscores), 3, 4))
print(Mc.shape)

for _linkage_id, _title in enumerate(("Single", "Average", "Complete")):
    # First and last metric for this linkage.
    _scores = Mc[:, :, _linkage_id, [0, -1]]
    print(_title)
    print(_scores)
    print(np.mean(_scores, axis=0))       # averaged over the first axis
    print(np.mean(_scores, axis=1))       # averaged over the second axis
    print(np.mean(_scores, axis=(0, 1)))  # averaged over both