# Descriptive Analysis

I start by gathering data concerning tags from different (combinations of) origins.

In [None]:
import matplotlib.pyplot as plt
import os
from typing import Dict
from Project.Utils.Misc.OriginContainer import OriginContainer
from Project.AutoSimilarityCacheConfiguration.DataAccess import DataAccess
from Project.Misc.Misc import crop_background

import numpy as np
from tabulate import tabulate

# Shared data-access handle; the cells below use it to read identifiers, tags and column values.
# NOTE(review): `DataAccess.instance` is read as a plain attribute (not called) - presumably a singleton; confirm.
da: DataAccess = DataAccess.instance

# Module-level occurrence counter; starts out empty.
number_of_occurrences_per_tag = {}

# Statistics per origin (combination), keyed by the strings listed in `origins`.
origin_specific_data = {}

# Origins and origin combinations to analyse; combined origins are joined with '&'.
origins = [
    'Icon',
    'Exp',
    'Title',
    'Title&Exp',
    'Des',
    'Des&Exp',
    'Des&Title',
    'Des&Title&Exp',
    'Obj',
    'Title&Exp&Obj',
]

def origin_to_name(origin):
    """Return the human-readable display name for an origin key.

    Args:
        origin: One of the origin keys used in this notebook, e.g. ``'Exp'``
            or ``'Des&Title&Exp'``.

    Returns:
        The display name, e.g. ``'Expert Tags'``.

    Raises:
        Exception: If ``origin`` does not match any known key.
    """
    # Ordered (key, display name) pairs; compared one by one with ``==``.
    display_names = (
        ('Exp', 'Expert Tags'),
        ('Des', 'Description Tags'),
        ('Icon', 'Icon Tags'),
        ('Title', 'Title Tags'),
        ('Title&Exp', 'Title & Expert Tags'),
        ('Des&Exp', 'Description & Expert Tags'),
        ('Des&Title', 'Description & Title Tags'),
        ('Des&Title&Exp', 'Description, Title & Expert Tags'),
        ('Obj', 'Object Tags'),
        ('Title&Exp&Obj', 'Title, Expert & Object Tags'),
    )
    for key, display_name in display_names:
        if origin == key:
            return display_name
    raise Exception(f'Unhandled case: {origin}')

origin_names = [origin_to_name(o) for o in origins]

# Gather, for every origin (combination), the statistics that the later cells read from
# `origin_specific_data[o]`. Tags are fetched once per identifier and all counters are filled
# in that single pass.
for o in origins:
    # Each '&'-separated part of the origin string becomes one entry of the container.
    origin_container = OriginContainer(tuple(o.split('&')))

    tags_per_entity = dict()  # identifier -> number of counted tags
    occurrences_per_tag = dict()  # tag[0] -> number of occurrences, in first-seen order
    unique_tags = set()  # distinct tag tuples that were counted

    for i in da.get_ids():
        tags_per_entity[i] = 0
        for c in da.get_tag_tuples_from_identifier(identifier=i, origin_container=origin_container):
            # NOTE(review): this is a substring test against the origin string (e.g. 'Exp' in 'Title&Exp'),
            # not a membership test against the split parts - confirm this is intended.
            if c[1] not in o:
                continue
            occurrences_per_tag[c[0]] = occurrences_per_tag.get(c[0], 0) + 1
            tags_per_entity[i] += 1
            # NOTE(review): the whole tuple is stored, so 'number_of_unique_tags' counts distinct tuples,
            # which may differ from the number of distinct c[0] values - confirm which one is wanted.
            unique_tags.add(c)

    # Histogram: number of occurrences n -> how many tags occur exactly n times (keys 1..max).
    max_occurrences = max(occurrences_per_tag.values(), default=0)
    tags_per_number_of_occurrences = dict.fromkeys(range(1, max_occurrences + 1), 0)
    for occurrence_count in occurrences_per_tag.values():
        tags_per_number_of_occurrences[occurrence_count] += 1

    # Histogram: number of tags n -> how many entities have exactly n tags (keys 0..max).
    max_tags = max(tags_per_entity.values(), default=0)
    entities_with_n_tags = dict.fromkeys(range(max_tags + 1), 0)
    for i in da.get_ids():
        entities_with_n_tags[tags_per_entity[i]] += 1

    origin_specific_data[o] = {
        'number_of_tags_per_origin': tags_per_entity,
        'number_of_occurrences_per_tag': occurrences_per_tag,
        'max_number_of_tags_per_origin': max_tags,
        # A dict keeps insertion order, so its keys are the tags in first-seen order
        # (stable across runs) without a quadratic list-membership test.
        'all_tags': list(occurrences_per_tag),
        # Set once after the loop so the key exists even when there are no identifiers.
        'number_of_unique_tags': len(unique_tags),
        'entities_with_n_tags': entities_with_n_tags,
        'max_number_of_occurrences_of_any_tag': max_occurrences,
        'tags_per_number_of_occurrences': tags_per_number_of_occurrences,
    }

Here are the box plots showing the number of tags of the entities when only considering tags that are from one of the
specified origins.

In [None]:
# Box plot (outliers shown): per-entity tag counts, one box per origin (combination).
fig, ax = plt.subplots()
title = 'Distribution of Tags from (Combinations of) Origins'
fig.set_figheight(10)
fig.set_figwidth(8)
ax.set_title(title, fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)

# One list of tag counts per origin, in the order of `origins`.
tag_counts_per_origin = []
for origin_key in origins:
    counts_by_id = origin_specific_data[origin_key]['number_of_tags_per_origin']
    tag_counts_per_origin.append([counts_by_id[identifier] for identifier in da.get_ids()])

ax.boxplot(tag_counts_per_origin, labels=origin_names, showfliers=True)

# The file name is the title with spaces replaced by underscores and ':', '%', ',' removed.
file_stem = title.replace(' ', '_').replace(':', '').replace('%', '').replace(',', '')
plt.savefig(file_stem + '.png')

In the plot above, it can be seen that description tags are distributed extremely unevenly. If we look at the plot
below, the reason becomes apparent: The boxplot for "Des" with outliers ignored is just a straight line. This means that
almost none of the entities have Description tags. From the figure above we know that those entities that do have
description tags have a lot of them.

In [None]:
# Same box plot as before, but with the outlier markers suppressed (showfliers=False).
fig, ax = plt.subplots()
title = 'Outliers ignored: Distribution of Tags from (Combinations of) Origins'
fig.set_figheight(10)
fig.set_figwidth(8)
ax.set_title(title, fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)

# One list of tag counts per origin, in the order of `origins`.
tag_counts_per_origin = []
for origin_key in origins:
    counts_by_id = origin_specific_data[origin_key]['number_of_tags_per_origin']
    tag_counts_per_origin.append([counts_by_id[identifier] for identifier in da.get_ids()])

ax.boxplot(tag_counts_per_origin, labels=origin_names, showfliers=False)

# The file name is the title with spaces replaced by underscores and ':', '%', ',' removed.
file_stem = title.replace(' ', '_').replace(':', '').replace('%', '').replace(',', '')
plt.savefig(file_stem + '.png')

Next, we look at how evenly the tags are distributed across the entities.

In [None]:
# One bar chart per origin: x = number of tags, y = number of artworks with that many tags.
for o in origins:
    _, ax = plt.subplots()
    title = f'Number of Artworks with n tags from Origin(s): {origin_to_name(o)}'
    ax.set_title(title)
    ax.set_xlabel('Number of Tags')
    ax.set_ylabel('Number of Artworks with n Tags')

    tmp = origin_specific_data[o]['entities_with_n_tags']
    ax.bar(tmp.keys(), tmp.values(), width=1)

    # The file name is the title with spaces replaced by underscores and ':', '%', ',' removed.
    file_stem = title.replace(' ', '_').replace(':', '').replace('%', '').replace(',', '')
    plt.savefig(file_stem + '.png')

The earlier observations about description tags can again be deduced from the graphs above. The graph generated
for the origin of expert tags looks approximately normally distributed. This is not the case for the origin of titles.
For the origin of descriptions, the great majority of the mass lies at 0.
This influences every combination that description tags are a part of.

In [None]:
# Report, per origin, how many entities have zero tags (bucket 0 of the histogram).
for o in origins:
    untagged_count = origin_specific_data[o]["entities_with_n_tags"][0]
    display_name = origin_to_name(o)
    print(f'There are {untagged_count} Entities without Tags in {display_name}')

In the output above it can be seen that if the origins of Titles and Expert Tags are considered, there is only one
entity that does not have any tags associated with it.

In [None]:
# One bar chart per origin: x = number of occurrences n, y = number of tags occurring n times.
# The three most frequent tags are annotated at their occurrence count.
for o in origins:
    if 'Obj' in o:  # Analyzed separately
        continue
    fig, ax = plt.subplots()
    fig.set_figheight(5)
    fig.set_figwidth(10)
    title = f'Number of Occurrences of Tags from Origin(s): {origin_to_name(o)}'
    ax.set_title(title)
    ax.set_xlabel('Number of Occurrences')
    ax.set_ylabel('Number of Tags with n Occurrences')
    tmp = origin_specific_data[o]['tags_per_number_of_occurrences']
    top_3 = sorted(origin_specific_data[o]['number_of_occurrences_per_tag'].items(), key=lambda x: x[1],
                   reverse=True)[0:3]
    # default=0 keeps an origin without any tags from raising; it then has no annotations either.
    cur_max_y = max(tmp.values(), default=0)
    for index, (tag_name, occurrence_count) in enumerate(top_3):
        # Labels are staggered vertically (max/4, max/3, max/2) so that they do not overlap.
        ax.annotate(tag_name, (occurrence_count, 0), xytext=(occurrence_count, cur_max_y / (4 - index)),
                    bbox=dict(boxstyle="round", alpha=1, color='lightblue'),
                    arrowprops=dict(color='lightblue', shrink=0.05))

    ax.bar(list(tmp.keys()), list(tmp.values()), width=1.5)
    plt.savefig(title.replace(' ', '_').replace(':', '').replace('%', '').replace(',', '') + '.png')

The graphs above are difficult to read; therefore, each histogram is cut off at the point where 95% of the mass is
accounted for, and the graphs are generated again. What can already be seen, however, is that most tags that come from
the descriptions are only seen once. This is not necessarily a problem, since the similarity measures are computed by a
language model and therefore do not rely on exact matches.
Since a lot of caches have to be generated and the time complexity for these operations is high and depends on the
number of unique tags, considering tags from the descriptions will result in a significant slow-down.

In [None]:
# For every origin, keep only the leading part of the occurrence histogram that accounts for
# 95% of all tags and store it under 'cut_tags_per_number_of_occurrences'.
for o in origins:
    histogram = origin_specific_data[o]['tags_per_number_of_occurrences']
    highest_bucket = origin_specific_data[o]['max_number_of_occurrences_of_any_tag']

    total = sum(histogram[bucket] for bucket in range(1, highest_bucket + 1))

    # Walk the buckets in ascending order until the running sum reaches 95% of the total.
    cut_at = None
    current = 0
    for bucket in range(1, highest_bucket + 1):
        current += histogram[bucket]
        if current >= total * 0.95:
            cut_at = bucket
            break

    # Copy buckets up to and including the cut; relies on the histogram keys being in ascending order.
    cut_off_dict = {}
    for bucket, tag_count in histogram.items():
        if bucket > cut_at:
            break
        cut_off_dict[bucket] = tag_count
    origin_specific_data[o]['cut_tags_per_number_of_occurrences'] = cut_off_dict

In [None]:
# One bar chart per origin of the occurrence histogram after the 95% cut:
# x = number of occurrences n, y = number of tags occurring n times.
for o in origins:
    _, ax = plt.subplots()
    # The plotted data are tag-occurrence counts (see the axis labels), so the title names those
    # rather than "Artworks with n Tags". This also determines the saved file name.
    title = f'Cut off at 95%: Number of Occurrences of Tags from Origin(s): {origin_to_name(o)}'
    ax.set_title(title)
    ax.set_xlabel('Number of occurrences')
    ax.set_ylabel('Number of tags with n occurrences')
    tmp = origin_specific_data[o]['cut_tags_per_number_of_occurrences']
    # Label every bar with its occurrence count.
    ax.bar(tmp.keys(), tmp.values(), tick_label=list(tmp.keys()))
    plt.savefig(title.replace(' ', '_').replace(':', '').replace('%', '').replace(',', '') + '.png')

In [None]:
# Count, for every object tag, the number of artworks in which it appears at least once.
obj_origin_container = OriginContainer(('Obj',))
obj_tag_counts = dict()

for i in da.get_ids():
    # dict.fromkeys removes duplicates within one artwork while keeping first-seen order, so a tag
    # is counted at most once per artwork and ties keep a reproducible order across runs.
    tags_of_artwork = dict.fromkeys(tt[0] for tt in da.get_tag_tuples_from_identifier(i, obj_origin_container))
    for object_tag in tags_of_artwork:
        obj_tag_counts[object_tag] = obj_tag_counts.get(object_tag, 0) + 1

# List of (tag, number of artworks) pairs, most frequent first.
obj_tag_occurrences = sorted(obj_tag_counts.items(), key=lambda x: x[1], reverse=True)

_, ax = plt.subplots()
title = 'Number of Artworks in which an Object Tag appears'
ax.set_title(title)
ax.set_xlabel('Tag')
ax.set_ylabel('Number of Artworks with at least one Occurrence of tag')
ax.bar([tt[0] for tt in obj_tag_occurrences], [tt[1] for tt in obj_tag_occurrences])
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.savefig(title.replace(' ', '_').replace(':', '').replace('%', '').replace(',', '') + '.png')

In [None]:
# Per-column statistics for the columns of interest, keyed by column name.
column_specific_data = dict()
columns_of_interest = ['YearEstimate', 'MaterialTechnique', 'ObjectClass']
# Accessor for each column, in the same order as `columns_of_interest`.
methods_for_fetching = [da.get_year_estimate_from_identifier, da.get_material_technique_from_identifier,
                        da.get_object_class_from_identifier]

for c, met in zip(columns_of_interest, methods_for_fetching):
    # value -> number of identifiers that have this value in column c
    occurrences_per_value = dict()
    for i in da.get_ids():
        current = met(i)
        occurrences_per_value[current] = occurrences_per_value.get(current, 0) + 1

    # Histogram: number of occurrences n -> how many distinct values occur exactly n times (keys 1..max).
    max_occurrences = max(occurrences_per_value.values(), default=0)
    values_per_number_of_occurrences = dict.fromkeys(range(1, max_occurrences + 1), 0)
    for occurrence_count in occurrences_per_value.values():
        values_per_number_of_occurrences[occurrence_count] += 1

    column_specific_data[c] = {
        'number_of_occurrences_per_unique_value': occurrences_per_value,
        'max_number_of_occurrences': max_occurrences,
        # Number of distinct values; the counter used to be incremented once per identifier.
        'number_of_unique_values': len(occurrences_per_value),
        'unique_values_per_number_of_occurrences': values_per_number_of_occurrences,
    }

In [None]:
# One bar chart per column: x = number of occurrences n, y = number of distinct values occurring
# n times. The three most frequent values are annotated at their occurrence count.
for c in columns_of_interest:
    fig, ax = plt.subplots()
    fig.set_figheight(5)
    fig.set_figwidth(10)
    title = f'Number of Occurrences in Column: {c}'
    ax.set_title(title)
    ax.set_xlabel('Number of Occurrences')
    # The bars count column values, not tags.
    ax.set_ylabel('Number of Values with n Occurrences')
    current_data = column_specific_data[c]['unique_values_per_number_of_occurrences']

    top_3 = sorted(column_specific_data[c]['number_of_occurrences_per_unique_value'].items(), key=lambda x: x[1],
                   reverse=True)[0:3]
    cur_max_y = max(current_data.values(), default=0)
    for index, (value, occurrence_count) in enumerate(top_3):
        # Labels are staggered vertically (max/4, max/3, max/2) so that they do not overlap.
        ax.annotate(value, (occurrence_count, cur_max_y / 20),
                    xytext=(occurrence_count, cur_max_y / (4 - index)),
                    bbox=dict(boxstyle="round", alpha=1, color='lightblue'),
                    arrowprops=dict(color='lightblue', shrink=0.05))

    # Roughly ten x ticks. The step is clamped to at least 1 because np.arange raises for a step
    # of 0, which happened whenever the histogram had fewer than ten buckets.
    step_size = max(1, len(current_data) // 10)
    plt.xticks(np.arange(1, len(current_data) + 1, step_size))

    # Wider bars for every column except 'YearEstimate'.
    use_width = 1 if c == 'YearEstimate' else 8
    ax.bar(list(current_data.keys()), list(current_data.values()), width=use_width)
    plt.savefig(title.replace(' ', '_').replace(':', '').replace('%', '').replace(',', '') + '.png')

Above we can see that the columns "MaterialTechnique" and "ObjectClass" are dominated by a few classes. The same is
true for the column "YearEstimate", but to a lesser extent.

In [None]:
# Number of artworks per estimated year, including years in the covered range without artworks.
# Each year estimate is fetched once and reused for the range and for the counts.
years = [da.get_year_estimate_from_identifier(i) for i in da.get_ids()]

# The infinities are only kept as placeholders when there are no identifiers at all.
min_year = min(years, default=np.inf)
max_year = max(years, default=-np.inf)

year_occurrences = dict()

if years:
    # Pre-fill every year of the range with 0 so that gaps show up as empty bars.
    # NOTE(review): range() requires integer year estimates - confirm the accessor returns ints.
    year_occurrences = dict.fromkeys(range(min_year, max_year + 1), 0)
    for y in years:
        year_occurrences[y] += 1

_, ax = plt.subplots()
title = 'Number of Artworks per Year'
ax.set_title(title)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Artworks attributed to that Year')
ax.bar(list(year_occurrences.keys()), list(year_occurrences.values()), width=1.5)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.savefig(title.replace(' ', '_').replace(':', '').replace('%', '').replace(',', '') + '.png')

In the following the most frequent tags considering each (combination of) origin(s) are listed.

In [None]:
# Print, for every origin, the number of distinct tags and a table of its 25 most frequent tags.
for o in origins:
    occurrences = origin_specific_data[o]["number_of_occurrences_per_tag"]
    ranked = sorted(occurrences.items(), key=lambda x: x[1], reverse=True)
    # Rows are (tag, number of occurrences), most frequent first.
    table = [(tag_name, occurrence_count) for tag_name, occurrence_count in ranked[0:25]]
    print(f'There are {len(occurrences)} unique elements for origin {o}\n')
    headers = [f'Most frequent tags of origin {o}', 'Number of occurrences']
    print(tabulate(table, headers=headers) + '\n\n\n')

In the following the most frequent values of the columns "YearEstimate", "MaterialTechnique" and "ObjectClass" are
listed.

In [None]:
# Print, for every column of interest, a table of its 25 most frequent values.
for c in columns_of_interest:
    occurrences = column_specific_data[c]['number_of_occurrences_per_unique_value']
    ranked = sorted(occurrences.items(), key=lambda x: x[1], reverse=True)
    # Rows are (value, number of occurrences), most frequent first.
    table = [(value, occurrence_count) for value, occurrence_count in ranked[0:25]]
    headers = [f'Most frequent values of column {c}', 'Number of occurrences']
    print(tabulate(table, headers=headers) + '\n\n\n')

In [None]:
# Report the stored 'number_of_unique_tags' value for every origin.
for o in origins:
    unique_count = origin_specific_data[o]["number_of_unique_tags"]
    display_name = origin_to_name(o)
    print(f'There are {unique_count} unique tags in {display_name}')

In [None]:
# Run crop_background on every PNG file found in the current working directory.
png_names = [name for name in os.listdir(os.getcwd()) if name.endswith('.png')]
for png_name in png_names:
    crop_background(os.path.join(os.getcwd(), png_name))