In [None]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import pylab
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
from scipy import stats

from Project.AutoSimilarityCacheConfiguration.DataAccess import DataAccess
from Project.Utils.Misc.OriginContainer import OriginContainer
from Project.AutoSimilarityCache.Caching.SimilarityRankDatabase import SimilarityRankDatabase
from Project.Utils.FilterCache.FilterCache import FilterCache
from Project.Matching_and_Similarity_Tasks.Semantic_Path import generate_path
from Project.Utils.Misc.Misc import cosine_distance
from Project.Utils.Misc.Nlp import NLP
from Project.Utils.TextTagProcessing.GermanWordSplitter import GermanWordSplitter
from Project.Utils.IconclassCache.IconclassCache import IconclassCache

# Fetch the shared FilterCache and register a filtering rule on it before asking for identifiers.
# NOTE(review): presumably this keeps only entries with at least 2 tags of the combined 'Icon' origin —
# confirm against FilterCache.set_rule_min_nr_of_tags_of_combined_origin.
fc: FilterCache = FilterCache.instance
fc.set_rule_min_nr_of_tags_of_combined_origin(2, 'Icon')


# Note: in this configuration it makes no difference whether 'Obj' is considered in the OriginContainer or not.
identifiers = fc.get_filtered_identifiers(origin_container=OriginContainer(('Title', 'NotIcon', 'Obj')))

# Number of ordered (first, second) identifier pairs drawn when no cached sample exists.
N_SAMPLES = 500

# Reuse a previously drawn sample when 'pairs.pkl' exists, so re-runs work on the same pairs.
# NOTE(review): pickle.load can execute arbitrary code; only load a 'pairs.pkl' produced by this notebook.
if os.path.exists('pairs.pkl'):
    with open('pairs.pkl', 'rb') as f:
        pairs = pickle.load(f)
else:
    pairs = []
    da: DataAccess = DataAccess.instance  # not used further in this cell

    for p in tqdm(range(N_SAMPLES)):
        # Rejection sampling: redraw until the two indices differ and the ordered pair is not yet in `pairs`.
        # The reversed pair (second, first) is treated as a different pair by this check.
        while True:
            first_index = random.randint(0, len(identifiers) - 1)
            second_index = random.randint(0, len(identifiers) - 1)
            while second_index == first_index:
                second_index = random.randint(0, len(identifiers) - 1)
            if (identifiers[first_index], identifiers[second_index]) not in pairs:
                break
        pairs.append((identifiers[first_index], identifiers[second_index]))

    # Persist the sample so later runs take the cached branch above.
    with open('pairs.pkl', 'wb+') as f:
        pickle.dump(pairs, f)

In [None]:
# Sanity check: number of distinct ordered pairs in the sample.
unique_pair_count = len(set(pairs))
print(unique_pair_count)

In [None]:
# Gather every identifier that appears as the first or second element of a sampled pair.
unique_identifiers = set()
da: DataAccess = DataAccess.instance

for pair in pairs:
    unique_identifiers.update((pair[0], pair[1]))

print(len(unique_identifiers))

In [None]:
# Run the generation only when the marker file 'generated.pkl' is absent; the file is written at the end
# of this cell, so a completed run is skipped on later executions.
if not os.path.exists('generated.pkl'):
    srd: SimilarityRankDatabase = SimilarityRankDatabase.instance
    # Keep only the identifiers that are not among the entries reported for 'Title&NotIcon&Obj'.
    # Note: get_already_generated_entries() is called once per candidate identifier inside the generator.
    identifiers_to_generate = set(u for u in unique_identifiers if u not in srd.get_already_generated_entries('Title&NotIcon&Obj'))

    for ind, i in tqdm(enumerate(identifiers_to_generate), total=len(identifiers_to_generate)):
        # The return value is discarded on purpose.
        # NOTE(review): presumably the call computes and caches the similarities for `i` inside the
        # database — confirm in SimilarityRankDatabase.get_similarities_for_id.
        _ = srd.get_similarities_for_id(i, origin_container=OriginContainer(('Title', 'NotIcon', 'Obj')))

    print('Saving, do not interrupt!')
    srd.save_generated()
    # The pickled payload is None: only the existence of the file is checked by the guard above.
    with open('generated.pkl', 'wb+') as f:
        pickle.dump(None, f)
    print('DONE')

In [None]:
@functools.lru_cache()
def category_to_vector(category):
    """Return the NLP vector of a category text after replacing vector-less words.

    The text is split on single spaces. A word whose vector components sum to 0 is
    replaced by the parts returned by ``GermanWordSplitter.split_german_word`` (and
    dropped entirely when that call returns something falsy); every other word is kept
    unchanged. The surviving pieces are joined, each followed by one space, and the
    ``.vector`` of the NLP pipeline applied to that text is returned.

    Results are memoised per ``category`` by ``functools.lru_cache``.
    """
    splitter: GermanWordSplitter = GermanWordSplitter()
    nlp: NLP = NLP.instance

    kept_pieces = []
    for token in category.split(" "):
        if sum(nlp.nlp(token).vector) != 0:
            # The pipeline knows this word: keep it as is.
            kept_pieces.append(token)
            continue
        parts = splitter.split_german_word(token)
        if parts:
            kept_pieces.extend(parts)

    # Every piece is followed by a single space, including the last one.
    rebuilt_text = ''.join(piece + ' ' for piece in kept_pieces)
    return nlp.nlp(rebuilt_text).vector


@functools.lru_cache()
def get_word_vector_of_average_top_most_iconclass_category(identifier):
    """Return the component-wise mean vector of an identifier's distinct reduced Iconclass categories.

    Each Iconclass tag of ``identifier`` is converted to a category, reduced with
    ``get_category_at_level_or_higher(category, 1)`` and converted back to text. The
    distinct texts are vectorised with ``category_to_vector`` and averaged position by
    position. The result is a tuple with as many entries as the first category vector.

    Raises:
        AssertionError: if the identifier has no Iconclass tags.
    """
    data_access: DataAccess = DataAccess.instance
    icc: IconclassCache = IconclassCache.instance

    iconclass_tags = data_access.get_iconclass_tags_from_identifier(identifier)
    assert len(iconclass_tags) > 0

    # A set removes duplicate categories so each one contributes once to the mean.
    top_most_categories = {
        icc.category_to_text(icc.get_category_at_level_or_higher(icc.text_to_category(label), 1))
        for label in iconclass_tags
    }
    category_vectors = [category_to_vector(category_text) for category_text in top_most_categories]

    vector_count = len(category_vectors)
    return tuple(
        sum(vector[axis] for vector in category_vectors) / vector_count
        for axis in range(len(category_vectors[0]))
    )


def get_distances(identifier, start, end):
    """Return ``(distance_to_start, distance_to_end)`` for ``identifier``.

    Both values are ``cosine_distance`` between the averaged category vector of
    ``identifier`` and the averaged category vector of ``start`` / ``end``.

    Raises:
        AssertionError: if ``start`` and ``end`` are equal.
    """
    assert start != end
    own_vector = get_word_vector_of_average_top_most_iconclass_category(identifier)

    def distance_to(anchor):
        # Tuples are converted to arrays right before each comparison.
        return cosine_distance(np.array(own_vector),
                               np.array(get_word_vector_of_average_top_most_iconclass_category(anchor)))

    return distance_to(start), distance_to(end)


def get_paths(steps, obj_tags):
    """Generate one path per entry of the module-level ``pairs`` list.

    Each path is the result of ``generate_path`` called with the pair's first element as
    ``start``, its second element as ``end``, ``steps`` as ``intermediate_steps`` and the
    given ``obj_tags``. The returned list follows the order of ``pairs``.
    """
    progress = tqdm(pairs, desc=f'Generating paths with {steps} intermediate steps')
    return [
        generate_path(start=start_end[0], end=start_end[1], intermediate_steps=steps, obj_tags=obj_tags)
        for start_end in progress
    ]


def get_distances_of_path(paths):
    """Compute, for every path, the distances of each element to the path's first and last element.

    Returns a list with one entry per path; each entry is the list of ``get_distances``
    results for the path's elements, in path order (the first and last element included).
    """
    all_distances = []
    for path in tqdm(paths, desc='Calculating distances of paths'):
        first, last = path[0], path[len(path) - 1]
        all_distances.append([get_distances(member, first, last) for member in path])
    return all_distances


def get_orders(distances):
    """Rank the positions of every path by their start/end distance ratio.

    For each path, the ratio ``distance[0] / distance[1]`` is computed per position and
    the positions are returned sorted by that ratio in descending order. Positions with
    equal ratios keep their original relative order.

    NOTE(review): a second distance of 0 makes the division raise ZeroDivisionError for
    plain Python numbers; whether that can occur depends on what ``cosine_distance`` returns.
    """
    orders = []
    for path_distances in tqdm(distances, 'Generating orders'):
        ratios = [entry[0] / entry[1] for entry in path_distances]
        ranked_positions = sorted(range(len(ratios)), key=ratios.__getitem__, reverse=True)
        orders.append(ranked_positions)

    return orders


def get_random_paths(length, samples):
    """Draw ``samples`` random paths of ``length + 2`` distinct identifiers each.

    Identifiers are taken from the module-level ``identifiers`` sequence. Every path is a
    list containing ``length`` intermediate elements plus a start and an end element, with
    no identifier value repeated inside one path.

    Args:
        length: number of intermediate steps per path.
        samples: number of paths to draw.

    Returns:
        A list of ``samples`` lists of identifiers.

    Raises:
        ValueError: if a path is requested that needs more distinct identifiers than are
            available (or a negative number of them). The previous hand-written rejection
            loop never terminated in that situation.
    """
    # De-duplicate once (order-preserving) so that sampling distinct positions also
    # guarantees distinct values, as the original "not already in the path" check did.
    population = list(dict.fromkeys(identifiers))
    path_size = length + 2

    random_paths = []
    for _ in tqdm(range(samples), desc=f'Generating random path with {length} intermediate steps'):
        # random.sample draws without replacement and raises ValueError when impossible.
        random_paths.append(random.sample(population, path_size))
    return random_paths

# Alphabet used to encode a position index as a single character:
# index 0-9 map to the digits, index 10-35 map to 'a'-'z'.
char_map = '0123456789' 'abcdefghijklmnopqrstuvwxyz'

def distance_to_string(distance):
    """Return the character of ``char_map`` stored at index ``distance``."""
    symbol = char_map[distance]
    return symbol

def distances_to_string(distances):
    """Encode a sequence of integer indices as a string, one ``char_map`` character per index.

    Raises:
        AssertionError: if an element is not an ``int``.
    """
    symbols = []
    for value in distances:
        assert isinstance(value, int)
        symbols.append(distance_to_string(value))
    return ''.join(symbols)


def get_number_of_necessary_adjacent_swaps(compare: str):
    """Count the adjacent swaps needed to sort ``compare`` into ``char_map`` order.

    ``compare`` must be a permutation of the first ``len(compare)`` characters of
    ``char_map`` (as produced by ``distances_to_string`` for a ranking of positions).
    The result is the number of swaps of neighbouring characters required to turn it
    into the sorted string, i.e. the number of inversions of the permutation.

    The symbols are placed from the smallest to the largest. When a symbol is placed,
    all smaller symbols are already in front, so it only ever has to move to the left,
    past exactly the not-yet-placed symbols standing before it. The former
    "swap to the right" branch could therefore never run and has been removed, together
    with the per-swap string rebuilding.

    Raises:
        AssertionError: if ``compare`` is not shorter than ``char_map`` or is not a
            permutation of the expected characters.
    """
    assert len(compare) < len(char_map)

    expected_symbols = [distance_to_string(position) for position in range(len(compare))]
    # Equal length plus "every expected symbol occurs" means `compare` is a permutation.
    for symbol in expected_symbols:
        assert symbol in compare

    remaining = list(compare)
    swaps = 0
    for symbol in expected_symbols:
        # Symbols still in front of `symbol` are exactly the ones it has to be swapped past.
        position = remaining.index(symbol)
        swaps += position
        del remaining[position]
    return swaps

def calculate_experiment_results(steps, experiment_name, obj_tags):
    """Generate paths for all pairs, score them and pickle the outcome.

    The result dictionary holds the generated paths under ``'path'`` and, under
    ``'dl_distances'``, one adjacent-swap count per path (computed from the path's
    position ranking). It is written to ``'<experiment_name>_<steps>.pkl'`` in the
    current working directory. Nothing is returned.
    """
    file_name = f'{experiment_name}_{steps}.pkl'

    generated_paths = get_paths(steps, obj_tags)
    orders = get_orders(get_distances_of_path(generated_paths))
    swap_counts = [
        get_number_of_necessary_adjacent_swaps(distances_to_string(order))
        for order in tqdm(orders, desc=f'Evaluating generated paths for {file_name}')
    ]

    result = {'path': generated_paths, 'dl_distances': swap_counts}
    with open(file_name, 'wb+') as experiment_f:
        pickle.dump(result, experiment_f)

def calculate_random_experiment_results(steps, samples=1000):
    """Score randomly drawn paths as a baseline and pickle the outcome.

    Args:
        steps: number of intermediate steps of every random path.
        samples: number of random paths to draw. Defaults to 1000, the value that was
            previously hard-coded, so existing callers are unaffected.

    The result dictionary holds the random paths under ``'random_path'`` and, under
    ``'random_dl_distances'``, one adjacent-swap count per path. It is written to
    ``'random_<steps>.pkl'`` in the current working directory. Nothing is returned.
    """
    file_name = f'random_{steps}.pkl'

    result = dict()

    result['random_path'] = get_random_paths(steps, samples)

    random_dl_distances = []
    current_distances = get_orders(get_distances_of_path(result['random_path']))
    for d in tqdm(current_distances, desc=f'Evaluating random paths for {file_name}'):
        # Encode the ranking as a string and count the swaps needed to sort it.
        actual_result = distances_to_string(d)
        random_dl_distances.append(get_number_of_necessary_adjacent_swaps(actual_result))
    result['random_dl_distances'] = random_dl_distances

    with open(file_name, 'wb+') as experiment_f:
        pickle.dump(result, experiment_f)

@functools.lru_cache
def get_experiment_results(length, experiment_name):
    """Load the pickled result dictionary stored in ``'<experiment_name>_<length>.pkl'``.

    The loaded object is memoised per ``(length, experiment_name)``, so the file is read
    at most once per kernel session and later changes to the file are not picked up.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only read files written by this notebook.
    pickle_path = f'{experiment_name}_{length}.pkl'
    with open(pickle_path, 'rb') as experiment_f:
        loaded = pickle.load(experiment_f)
    return loaded

@functools.lru_cache
def get_random_experiment_results(length):
    """Load the pickled random-baseline result dictionary stored in ``'random_<length>.pkl'``.

    The loaded object is memoised per ``length``, so the file is read at most once per
    kernel session and later changes to the file are not picked up.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only read files written by this notebook.
    pickle_path = f'random_{length}.pkl'
    with open(pickle_path, 'rb') as experiment_f:
        loaded = pickle.load(experiment_f)
    return loaded

def summarize_experiment_results(length, experiment_name):
    """Return the swap counts of a stored experiment together with their mean.

    The returned dictionary has the keys ``'dl_distances'`` (the stored list) and
    ``'average_dl_distance'`` (its arithmetic mean).

    Raises:
        ZeroDivisionError: if the stored list is empty.
    """
    dl_distances = get_experiment_results(length, experiment_name)['dl_distances']
    return {
        'dl_distances': dl_distances,
        'average_dl_distance': sum(dl_distances) / len(dl_distances),
    }

def summarize_random_experiment_results(length):
    """Return the swap counts of a stored random baseline together with their mean.

    The returned dictionary has the keys ``'random_dl_distances'`` (the stored list) and
    ``'average_random_dl_distance'`` (its arithmetic mean).

    Raises:
        ZeroDivisionError: if the stored list is empty.
    """
    random_dl_distances = get_random_experiment_results(length)['random_dl_distances']
    return {
        'random_dl_distances': random_dl_distances,
        'average_random_dl_distance': sum(random_dl_distances) / len(random_dl_distances),
    }

def aggregate_experiment(experiment_name, is_random, from_length, to_length):
    """Average the length-normalised swap counts per sample over a range of path lengths.

    For every path length ``j`` in ``from_length..to_length`` (inclusive) each swap count
    is divided by ``j``; the normalised values belonging to the same sample index are then
    averaged over all lengths.

    Args:
        experiment_name: name of the stored experiment; ignored when ``is_random`` is true.
        is_random: read the random-baseline results instead of a named experiment.
            Previously this path looked up ``'random_dl_distances'`` in the summary of the
            named experiment, which does not contain that key, and therefore raised
            ``KeyError``; the random summary is now used.
        from_length: first path length to include.
        to_length: last path length to include.

    Returns:
        A list with one averaged value per entry of the module-level ``pairs`` list. Only
        the first ``len(pairs)`` stored values of each length are used.
    """
    sample_count = len(pairs)
    per_sample_values = [[] for _ in range(sample_count)]

    for j in range(from_length, to_length + 1):
        # Pick the summary that actually contains the requested kind of results.
        if is_random:
            raw_distances = summarize_random_experiment_results(j)['random_dl_distances']
        else:
            raw_distances = summarize_experiment_results(j, experiment_name)['dl_distances']

        normalized_dl_distances = [d / j for d in raw_distances]
        for sample_index in range(sample_count):
            per_sample_values[sample_index].append(normalized_dl_distances[sample_index])

    return [sum(values) / len(values) for values in per_sample_values]


def get_averages_of_experiment(from_path_length, to_path_length, experiment_name, is_random):
    """Return the mean of the per-length average swap counts over a range of path lengths.

    For every path length in ``from_path_length..to_path_length`` (inclusive) the stored
    average is read, from the random baseline when ``is_random`` is true and from
    ``experiment_name`` otherwise, and the arithmetic mean of those averages is returned.

    Raises:
        ZeroDivisionError: if the range of path lengths is empty.
    """
    path_lengths = range(from_path_length, to_path_length + 1)
    if is_random:
        per_length_averages = [
            summarize_random_experiment_results(steps)['average_random_dl_distance']
            for steps in path_lengths
        ]
    else:
        per_length_averages = [
            summarize_experiment_results(steps, experiment_name)['average_dl_distance']
            for steps in path_lengths
        ]

    return sum(per_length_averages) / len(per_length_averages)

In [None]:
# Compute every experiment whose result file is not on disk yet (3 to 23 intermediate steps).
for name, use_obj_tags in (('with_object_tags', True), ('without_object_tags', False)):
    for path_length in range(3, 24):
        if os.path.exists(f'{name}_{path_length}.pkl'):
            continue
        calculate_experiment_results(path_length, name, use_obj_tags)

# Same for the random baseline.
for path_length in range(3, 24):
    if os.path.exists(f'random_{path_length}.pkl'):
        continue
    calculate_random_experiment_results(path_length)

In [None]:
# Build one long table with a row per (experiment type, sample, number of intermediate steps).
results = []
running_number = -1

for i in range(3, 24):
    with_summary = summarize_experiment_results(i, 'with_object_tags')['dl_distances']
    without_summary = summarize_experiment_results(i, 'without_object_tags')['dl_distances']
    random_summary = summarize_random_experiment_results(i)['random_dl_distances']
    labelled_errors = (
        ('With Object Tags', with_summary),
        ('Without Object Tags', without_summary),
        ('Random', random_summary),
    )
    for type_label, errors in labelled_errors:
        for s in errors:
            # `running_number` is a row counter that keeps increasing across all types and lengths.
            running_number += 1
            results.append((type_label, running_number, i, s))

results_frame = pd.DataFrame(results, columns=['Type', 'Number', 'Intermediate Steps', 'Error'])

plt.figure(figsize=(25, 15))
plt.xticks(np.arange(3, 24, 1))
result_plot = sns.lineplot(data=results_frame, x='Intermediate Steps', y='Error', hue='Type').set(
    title='Quantitative Analysis (Showing 95% Confidence Interval)')
plt.savefig('quantitative_metric_scores.png')

In [None]:
# Relative difference (in percent) between the averaged errors with and without object tags,
# accumulated over path lengths 3..i for every i from 4 to 23.
data = dict()
for i in range(4, 24):
    with_average = get_averages_of_experiment(3, i, 'with_object_tags', False)
    without_average = get_averages_of_experiment(3, i, 'without_object_tags', False)
    data[i] = (with_average / without_average - 1) * 100
plt.xticks(np.arange(3, 24, 1))
sns.lineplot(data=data).set(title='Relative Difference of Errors for Paths of Length x in Percent')
plt.savefig('quantitative_metric_scores_differences.png')

In [None]:
# Absolute difference between the averaged errors with and without object tags,
# accumulated over path lengths 3..i for every i from 4 to 23.
absolute_differences = dict()
for i in range(4, 24):
    with_average = get_averages_of_experiment(3, i, 'with_object_tags', False)
    without_average = get_averages_of_experiment(3, i, 'without_object_tags', False)
    absolute_differences[i] = (with_average - without_average)
absolute_differences