In [None]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def split_dataset():
    """Load ``judgements_all.csv`` and split it into two frames.

    The tab-separated file is read from the current working directory and is
    partitioned on its ``dataset`` column:

    * rows whose dataset is ``WIC``, ``Cosimlex`` or ``TempoWic`` go to
      ``df_aggregated`` and are returned untouched;
    * all remaining rows go to ``df_judgments``; their ``judgment`` column is
      converted to ``float`` and every ``0.0`` judgment is turned into NaN.

    Both returned frames are independent copies, so callers can add or modify
    columns without triggering pandas' chained-assignment pitfalls.  Missing
    judgments are float NaN values; detect them with ``pd.isna`` / ``np.isnan``
    rather than by comparing against the ``np.nan`` object.

    Side effects: prints the column headers, one line per dataset that
    contains a missing judgment, and the number of missing judgments among
    the non-aggregated rows.

    Returns:
        tuple: ``(df_judgments, df_aggregated)``.

    Raises:
        ValueError: if a non-missing judgment of a non-aggregated dataset
            cannot be converted to ``float``.
    """
    string_judgment = 'judgment'
    string_dataset = "dataset"
    aggregated_dataset_names = ['WIC', 'Cosimlex', "TempoWic"]

    df_all = pd.read_csv('judgements_all.csv', delimiter='\t', quoting=csv.QUOTE_MINIMAL)

    print("column headers for dataframe: ")
    print(df_all.keys())

    # Report every dataset that has at least one missing judgment.  ``isna``
    # is used instead of ``np.nan in list`` because the latter only matches
    # the np.nan singleton object, not NaN values in general.
    judgment_is_missing = df_all[string_judgment].isna()
    for dataset_name in df_all[string_dataset].unique().tolist():
        rows_of_dataset = df_all[string_dataset] == dataset_name
        if (judgment_is_missing & rows_of_dataset).any():
            print(dataset_name + " has nan value")

    # Explicit copies: the frames are modified below / by callers, and writing
    # into a filtered slice is not guaranteed to reach the intended object.
    is_aggregated = df_all[string_dataset].isin(aggregated_dataset_names)
    df_aggregated = df_all[is_aggregated].copy()
    df_judgments = df_all[~is_aggregated].copy()

    number_of_null_votes = int(df_judgments[string_judgment].isna().sum())
    print("number of rows with nan judgment in dataset rushifteval_public: " + str(number_of_null_votes))

    # Convert the whole column at once (NaN survives ``astype(float)``) and
    # assign the result back instead of relying on chained, in-place updates.
    df_judgments[string_judgment] = (
        df_judgments[string_judgment].astype(float).replace({0.0: np.nan})
    )

    return df_judgments, df_aggregated

# Load both partitions once and preview the first rows of each
# (aggregated datasets first, then the per-annotator judgments).
df_judgments, df_aggregated = split_dataset()
print(df_aggregated.head(), df_judgments.head(), sep="\n")

In [None]:
def get_instances_with_multiple_annotations(df_judgments) -> dict:
    """Group annotations by instance and keep instances annotated more than once.

    An instance is identified by ``(dataset, identifier1, identifier2)``.
    Every annotation is stored as the tuple
    ``(dataset, identifier1, identifier2, row_index, judgment)``.

    Instances that received exactly one annotation are left out of the
    result.  The total number of rows visited is printed as ``row_count``.

    Args:
        df_judgments: frame with the columns ``dataset``, ``identifier1``,
            ``identifier2`` and ``judgment``.

    Returns:
        dict mapping the instance key to the list of its annotation tuples,
        in row order.
    """
    annotations_by_instance = {}
    for row_label, record in df_judgments.iterrows():
        instance = (record['dataset'], record['identifier1'], record['identifier2'])
        annotation = instance + (row_label, record["judgment"])
        annotations_by_instance.setdefault(instance, []).append(annotation)

    # Every group holds at least one annotation, so the group sizes add up
    # to the number of rows that were processed.
    row_count = sum(len(annotations) for annotations in annotations_by_instance.values())
    print("row_count: " + str(row_count))

    return {
        instance: annotations
        for instance, annotations in annotations_by_instance.items()
        if len(annotations) != 1
    }

def get_dict_of_nonaggregated_instances(df_judgments) -> dict:
    """Group all annotations by instance, whatever the number of annotations.

    An instance is identified by ``(dataset, identifier1, identifier2, lemma)``.
    Every annotation is stored as the tuple
    ``(dataset, identifier1, identifier2, lemma, row_index, judgment)``.

    Args:
        df_judgments: frame with the columns ``dataset``, ``identifier1``,
            ``identifier2``, ``lemma`` and ``judgment``.

    Returns:
        dict mapping the instance key to the list of its annotation tuples,
        in row order; instances with a single annotation are included.
    """
    instance_columns = ('dataset', 'identifier1', 'identifier2', 'lemma')

    annotations_by_instance = {}
    for row_label, record in df_judgments.iterrows():
        instance = tuple(record[column] for column in instance_columns)
        annotation = instance + (row_label, record["judgment"])
        annotations_by_instance.setdefault(instance, []).append(annotation)

    return annotations_by_instance

def get_rows_of_instances_with_multiple_annotations(df_judgments) -> list:
    """Return the row labels of all annotations of multiply-annotated instances.

    Rows are grouped by ``(dataset, identifier1, identifier2)``; for every
    group with more than one row, the index labels of its rows are collected
    (groups in first-appearance order, rows in frame order within a group).
    The total number of rows visited is printed as ``row_count``.

    Args:
        df_judgments: frame with the columns ``dataset``, ``identifier1``,
            ``identifier2`` and ``judgment``.

    Returns:
        list of index labels of ``df_judgments``.
    """
    annotations_by_instance = {}
    for row_label, record in df_judgments.iterrows():
        instance = (record['dataset'], record['identifier1'], record['identifier2'])
        annotation = instance + (row_label, record["judgment"])
        annotations_by_instance.setdefault(instance, []).append(annotation)

    # Position 3 of an annotation tuple is the row label (after the three
    # key components).
    row_labels = [
        annotation[3]
        for annotations in annotations_by_instance.values()
        if len(annotations) != 1
        for annotation in annotations
    ]

    row_count = sum(len(annotations) for annotations in annotations_by_instance.values())
    print("row_count: " + str(row_count))
    return row_labels


def get_votes_for_instances_with_multiple_annotations(df_judgments) -> dict:
    """Map every multiply-annotated instance to the sorted list of its votes.

    Builds on ``get_instances_with_multiple_annotations`` and replaces each
    list of annotation tuples by the judgments alone (the last tuple element),
    sorted ascending.  Instances with a single annotation are not present.
    """
    votes_by_instance = get_instances_with_multiple_annotations(df_judgments)
    for instance in votes_by_instance:
        annotations = votes_by_instance[instance]
        votes_by_instance[instance] = sorted(annotation[-1] for annotation in annotations)
    return votes_by_instance


def get_votes_for_nonaggregated_instances(df_judgments) -> dict:
    """Map every instance to the sorted list of its votes.

    Builds on ``get_dict_of_nonaggregated_instances`` and replaces each list
    of annotation tuples by the judgments alone (the last tuple element),
    sorted ascending.  Instances with one annotation as well as instances
    with several annotations are included.
    """
    votes_by_instance = get_dict_of_nonaggregated_instances(df_judgments)
    for instance in votes_by_instance:
        annotations = votes_by_instance[instance]
        votes_by_instance[instance] = sorted(annotation[-1] for annotation in annotations)
    return votes_by_instance

In [None]:
def aggregation_median_rounding(dict_of_nonaggregated_instances_with_multiple_annotations: dict):
    """Replace each vote list by its NaN-ignoring median, rounded to a whole number.

    The dictionary is modified in place and also returned.  The median is
    rounded with the built-in ``round`` and stored as ``float``; when the
    median itself is NaN the instance is mapped to ``np.nan``.
    """
    votes_by_instance = dict_of_nonaggregated_instances_with_multiple_annotations
    for instance in votes_by_instance:
        median_vote = np.nanmedian(np.array(votes_by_instance[instance]))
        votes_by_instance[instance] = (
            np.nan if np.isnan(median_vote) else float(round(median_vote))
        )
    return votes_by_instance

# print(aggregation_median_rounding(get_votes_for_instances_with_multiple_annotations(df_judgments.head(2000))))

def aggregation_median_exclusion(dict_of_nonaggregated_instances_with_multiple_annotations: dict):
    """Replace each vote list by its NaN-ignoring median, or NaN if not whole.

    The dictionary is modified in place and also returned.  A median that is
    a whole number is kept as is; any other median (including NaN) maps the
    instance to ``np.nan``.
    """
    votes_by_instance = dict_of_nonaggregated_instances_with_multiple_annotations
    for instance in votes_by_instance:
        median_vote = np.nanmedian(np.array(votes_by_instance[instance]))
        votes_by_instance[instance] = median_vote if median_vote.is_integer() else np.nan
    return votes_by_instance

# print(aggregation_median_exclusion(get_votes_for_instances_with_multiple_annotations(df_judgments.head(2000))))


In [None]:

def cleaning_remove_nan(dict_of_nonaggregated_instances: dict):
    """Blank out every instance that has at least one missing vote.

    For each instance whose vote list contains a missing value, the list is
    replaced by a list of ``np.nan`` of the same length.  The dictionary is
    modified in place and also returned.

    Missing votes are detected with ``pd.isna`` rather than
    ``np.nan in value``: the membership test only recognises the ``np.nan``
    singleton object and silently misses other NaN floats (NaN never compares
    equal to anything, itself included).

    Args:
        dict_of_nonaggregated_instances: maps an instance key to its list of
            votes.

    Returns:
        The same dictionary object.
    """
    for key, value in dict_of_nonaggregated_instances.items():
        if np.any(pd.isna(value)):
            dict_of_nonaggregated_instances[key] = [np.nan] * len(value)
    return dict_of_nonaggregated_instances

def cleaning_remove_any_disagreements(dict_of_nonaggregated_instances: dict):
    """Blank out instances with a missing vote or with any disagreement.

    First applies ``cleaning_remove_nan``; afterwards every instance whose
    votes take more than one distinct value is replaced by a list of
    ``np.nan`` of the same length.  The dictionary is modified in place and
    also returned.
    """
    votes_by_instance = cleaning_remove_nan(dict_of_nonaggregated_instances)
    for instance, votes in votes_by_instance.items():
        annotators_disagree = len(set(votes)) > 1
        if annotators_disagree:
            votes_by_instance[instance] = [np.nan] * len(votes)
    return votes_by_instance
    
def cleaning_allow_disagreements_in_one_point(dict_of_nonaggregated_instances: dict):
    """Blank out instances unless their votes differ by at most one point.

    First applies ``cleaning_remove_nan``.  Afterwards an instance is
    replaced by a list of ``np.nan`` of the same length when its votes take
    more than two distinct values, or exactly two distinct values that are
    more than 1 apart.  The dictionary is modified in place and also
    returned.
    """
    votes_by_instance = cleaning_remove_nan(dict_of_nonaggregated_instances)
    for instance, votes in votes_by_instance.items():
        distinct_votes = list(set(votes))
        number_of_distinct_votes = len(distinct_votes)

        too_many_labels = number_of_distinct_votes > 2
        labels_too_far_apart = (
            number_of_distinct_votes == 2
            and abs(distinct_votes[0] - distinct_votes[1]) > 1
        )
        if too_many_labels or labels_too_far_apart:
            votes_by_instance[instance] = [np.nan] * len(votes)
    return votes_by_instance

def cleaning_remove_instance_with_one_valid_vote(dict_of_nonaggregated_instances: dict):
    """Blank out instances that were annotated only once.

    An instance whose vote list has exactly one entry is replaced by
    ``[np.nan]``.  The dictionary is modified in place and also returned.

    The test is on the number of votes (``len(value) == 1``), the same
    criterion ``cleaning_most_strict_condition`` uses.  Testing the number of
    *distinct* labels instead would also discard every instance on which
    several annotators agree unanimously and would shrink its vote list to a
    single element.
    NOTE(review): missing votes are not considered here; combine with
    ``cleaning_remove_nan`` if instances containing NaN votes must go too.

    Args:
        dict_of_nonaggregated_instances: maps an instance key to its list of
            votes.

    Returns:
        The same dictionary object.
    """
    for key, value in dict_of_nonaggregated_instances.items():
        if len(value) == 1:
            dict_of_nonaggregated_instances[key] = [np.nan]
    return dict_of_nonaggregated_instances

def cleaning_most_strict_condition(dict_of_nonaggregated_instances: dict):
    """Keep only instances with several votes that all carry the same label.

    An instance is replaced by a list of ``np.nan`` of the same length when
    any of the following holds for its vote list:

    * it contains ``np.nan``;
    * it holds more than one distinct value;
    * it holds exactly one vote.

    The dictionary is modified in place and also returned.
    """
    votes_by_instance = dict_of_nonaggregated_instances
    for instance, votes in votes_by_instance.items():
        has_missing_vote = np.nan in votes
        annotators_disagree = len(set(votes)) > 1
        annotated_once = len(votes) == 1
        if has_missing_vote or annotators_disagree or annotated_once:
            votes_by_instance[instance] = [np.nan] * len(votes)
    return votes_by_instance

def cleaning_second_strict_condition(dict_of_nonaggregated_instances: dict):
    """Keep instances with several votes that differ by at most one point.

    An instance is replaced by a list of ``np.nan`` of the same length when
    any of the following holds for its vote list:

    * it contains a missing vote;
    * it holds exactly one vote;
    * it holds more than two distinct values;
    * it holds exactly two distinct values that are more than 1 apart.

    Unanimous instances with several votes are kept, so this rule retains
    everything ``cleaning_most_strict_condition`` retains, plus one-point
    disagreements.  The dictionary is modified in place and also returned.

    Implementation notes:
    * missing votes are detected with ``pd.isna``; ``np.nan in value`` only
      matches the ``np.nan`` singleton and would let other NaN floats through
      (``abs(x - nan) > 1`` is False, so such an instance would survive);
    * the single-annotation test is on ``len(value)``; testing the number of
      distinct labels would discard every unanimous instance.

    Args:
        dict_of_nonaggregated_instances: maps an instance key to its list of
            votes.

    Returns:
        The same dictionary object.
    """
    for key, value in dict_of_nonaggregated_instances.items():
        if np.any(pd.isna(value)) or len(value) == 1:
            discard = True
        else:
            unique_labels_list = list(set(value))
            if len(unique_labels_list) > 2:
                discard = True
            elif len(unique_labels_list) == 2:
                discard = abs(unique_labels_list[0] - unique_labels_list[1]) > 1
            else:
                discard = False

        if discard:
            dict_of_nonaggregated_instances[key] = [np.nan] * len(value)
    return dict_of_nonaggregated_instances




In [None]:
import copy

# Restrict the analysis to the "dwug_en" dataset.  ``.copy()`` detaches the
# filtered rows from the frame returned by split_dataset(), so the two column
# assignments at the end of this block write to this frame itself and not to
# a temporary slice (pandas' chained-assignment pitfall).
df_judgments, _ = split_dataset()
df_judgments = df_judgments[df_judgments["dataset"] == "dwug_en"].copy()

# Column names used throughout the rest of the notebook:
#   identifier1, identifier2, annotator, judgment, comment,
#   lemma, dataset, language
# plus the two aggregation result columns "rounding" and "exclusion".
string_identifier_one = "identifier1"
string_identifier_two = 'identifier2'
string_annotator = "annotator"
string_judgment = "judgment"
string_comment = "comment"
string_lemma = "lemma"
string_dataset = "dataset"
string_language = "language"
string_rounding = "rounding"
string_exclusion = "exclusion"

# Raw judgment rows have no aggregated value; mark both columns accordingly.
df_judgments[string_rounding] = "not applicable"
df_judgments[string_exclusion] = "not applicable"

# Collect the votes of every instance and blank out (NaN) the instances that
# fail the strictest cleaning rule.
dict_of_instances = cleaning_most_strict_condition(
    get_votes_for_nonaggregated_instances(df_judgments)
)

# Both aggregators overwrite the values of the dict they receive, so each one
# gets its own shallow copy.
dict_aggregation_median_rounding = aggregation_median_rounding(copy.copy(dict_of_instances))
dict_aggregation_median_exclusion = aggregation_median_exclusion(copy.copy(dict_of_instances))

dict_of_aggregated_instances = {
    instance_key: {
        string_rounding: dict_aggregation_median_rounding[instance_key],
        string_exclusion: dict_aggregation_median_exclusion[instance_key],
    }
    for instance_key in dict_of_instances
}

# One "gold" row per instance; the instance key is
# (dataset, identifier1, identifier2, lemma).
aggregated_rows = [
    {
        string_identifier_one: instance_key[1],
        string_identifier_two: instance_key[2],
        string_annotator: "gold",
        string_judgment: "not applicable",
        string_comment: "gone through aggregation",
        string_lemma: instance_key[3],
        string_dataset: instance_key[0],
        string_language: "not applicable",
        string_rounding: aggregates[string_rounding],
        string_exclusion: aggregates[string_exclusion],
    }
    for instance_key, aggregates in dict_of_aggregated_instances.items()
]

# NaN aggregates are encoded as 0.0 from here on.
df_output = pd.DataFrame(aggregated_rows).replace(np.nan, 0.0)


# Aggregation column of df_output that is plotted here and exported below.
judgment_metric_to_use = "exclusion"

min_judgment = df_output[judgment_metric_to_use].min()
max_judgment = df_output[judgment_metric_to_use].max()

# One unit-wide bin centred on every level from the smallest to the largest
# observed value.  Deriving the number of bins from the number of *distinct*
# values only gives correctly aligned bins when the observed levels are
# contiguous; with a gap (e.g. 0, 1 and 4) the bars would no longer line up
# with the judgment levels.
bin_edges = np.arange(min_judgment - 0.5, max_judgment + 1.0, 1.0)
num_bins = len(bin_edges) - 1
# print(bin_edges)

plt.figure(figsize=[6, 6])  # Adjust the figure size as desired
_, _, bars = plt.hist(df_output[judgment_metric_to_use], bins=bin_edges, rwidth=0.8, color='b')
plt.bar_label(bars, fontsize=10, color='navy')
plt.xlabel('Judgment')
plt.ylabel('Frequency', rotation=0, labelpad=30)
plt.title(f'{len(df_output)} judgements')

plt.show()

# df_output.set_index([string_dataset, string_identifier_one, string_identifier_two], inplace=True)
print("numebr of rows with non zero vote", (df_output[judgment_metric_to_use] != 0).sum())

string_token_index_one = "token index 1"
string_token_index_two = "token index 2"
string_context = "context"
string_indexes_target_token = "indexes_target_token"

pipeline_input_column_names = [string_lemma, "sentence1", string_token_index_one, "sentence2", string_token_index_two, string_judgment]

# A use (sentence) is identified by (dataset, lemma, identifier).
use_key_columns = [string_dataset, string_lemma, "identifier"]

df_uses = pd.read_csv('uses_all.csv', delimiter='\t', quoting=csv.QUOTE_NONE)
print(df_uses.columns)
df_uses = df_uses.drop_duplicates()

# Lookup tables from use key to context / target-token indexes.  When a key
# occurs on several rows the first one wins.  Building the tables once avoids
# four index look-ups per output row and works whether or not the keys are
# unique (``df.loc[key][col].values[0]`` needs a non-unique index: for a
# unique key ``df.loc[key][col]`` is a scalar without ``.values``).
first_use_per_key = df_uses.drop_duplicates(subset=use_key_columns, keep="first").set_index(use_key_columns)
context_by_use = first_use_per_key[string_context].to_dict()
target_token_indexes_by_use = first_use_per_key[string_indexes_target_token].to_dict()

df_uses = df_uses.set_index(use_key_columns)

# print(df_uses.head(1))

# Collect the rows in a plain list and build the frame once; growing a
# DataFrame row by row re-allocates it on every insertion.
pipeline_input_rows = []
for index, row in df_output.iterrows():
    # 0 marks instances without a usable aggregated judgment; skip them.
    if row[judgment_metric_to_use] != 0:
        sentence_one_key = (row[string_dataset], row[string_lemma], row[string_identifier_one])
        sentence_two_key = (row[string_dataset], row[string_lemma], row[string_identifier_two])
        # A use that is missing from uses_all.csv raises KeyError here.
        pipeline_input_rows.append([
            row[string_lemma],
            context_by_use[sentence_one_key],
            target_token_indexes_by_use[sentence_one_key],
            context_by_use[sentence_two_key],
            target_token_indexes_by_use[sentence_two_key],
            row[judgment_metric_to_use],
        ])

df_pipeline_input = pd.DataFrame(pipeline_input_rows, columns=pipeline_input_column_names)
count = len(df_pipeline_input)

print("number of row:", len(df_pipeline_input))
df_pipeline_input.to_csv('instances_with_token_index.csv', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)


In [None]:
# Reload all non-aggregated judgments and apply the strictest cleaning rule.
df_judgments, _ = split_dataset()
dict_of_nonaggregated_instances = cleaning_most_strict_condition(
    get_votes_for_nonaggregated_instances(df_judgments)
)

# ``instance`` ends up as the number of instances visited, ``count`` as the
# number of instances whose vote list contains no np.nan.
count = 0
instance = 0
for instance, (key, value) in enumerate(dict_of_nonaggregated_instances.items(), start=1):
    # print(instance)
    if np.nan in value:
        continue
    # Surviving instances are listed unless they belong to 'RuSemShift'.
    if key[0] != 'RuSemShift':
        print(key, value)
    count += 1
print(count)

In [None]:
df_judgments, _ = split_dataset()

# Distribution of judgment values within dwug_de.
is_dwug_de = df_judgments["dataset"] == "dwug_de"
df_dwug_de = df_judgments[is_dwug_de]
print(df_dwug_de["judgment"].value_counts())

# Rows of dwug_sv whose judgment equals 5.
is_dwug_sv = df_judgments['dataset'] == 'dwug_sv'
has_judgment_five = df_judgments['judgment'] == 5
result = df_judgments[is_dwug_sv & has_judgment_five]
print(result)

In [None]:

# Encode missing values as 0.0 (this replaces NaN in every column of the
# frame, not only in 'judgment').
df_judgments = df_judgments.replace(np.nan, 0.0)

grouped = df_judgments.groupby('dataset')

# Get the unique values of the 'dataset' column
data_by_dataset = df_judgments['dataset'].unique()

# Create a separate graph for each unique dataset
for data in data_by_dataset:
    # Get the subset of data for the current dataset
    subset = grouped.get_group(data)

    min_judgment = subset['judgment'].min()
    max_judgment = subset['judgment'].max()

    # One unit-wide bin centred on every level from the smallest to the
    # largest observed value.  Deriving the number of bins from the number of
    # *distinct* values only gives correctly aligned bins when the observed
    # levels are contiguous.
    bin_edges = np.arange(min_judgment - 0.5, max_judgment + 1.0, 1.0)
    num_bins = len(bin_edges) - 1
    # print(bin_edges)

    plt.figure(figsize=[6, 6])  # Adjust the figure size as desired
    _, _, bars = plt.hist(subset['judgment'], bins=bin_edges, rwidth=0.8, color='b')
    plt.bar_label(bars, fontsize=10, color='navy')
    plt.xlabel('Judgment')
    plt.ylabel('Frequency', rotation=0, labelpad=30)
    plt.title(f'Dataset {data}, {len(subset)} judgements')

    # plt.show()  # Uncomment this line to display the plot
    # plt.savefig(f'dataset_{data}_hist.png')  # Uncomment this line to save the plot as an image



In [None]:
# One histogram of the judgment values per language.
grouped = df_judgments.groupby('language')

data_by_language = df_judgments['language'].unique()

for data in data_by_language:
    subset = grouped.get_group(data)

    min_judgment = subset['judgment'].min()
    max_judgment = subset['judgment'].max()

    # One unit-wide bin centred on every level from the smallest to the
    # largest observed value.  Deriving the number of bins from the number of
    # *distinct* values only gives correctly aligned bins when the observed
    # levels are contiguous.
    bin_edges = np.arange(min_judgment - 0.5, max_judgment + 1.0, 1.0)
    num_bins = len(bin_edges) - 1
    # print(bin_edges)

    plt.figure(figsize=[6, 6])  # Adjust the figure size as desired
    _, _, bars = plt.hist(subset['judgment'], bins=bin_edges, rwidth=0.8, color='b')
    plt.bar_label(bars, fontsize=10, color='navy')
    plt.xlabel('Judgment')
    plt.ylabel('Frequency', rotation=0, labelpad=30)
    plt.title(f'language {data}, {len(subset)} judgements')

    plt.show()
    # plt.savefig(f'language_{data}_hist.png')  # Uncomment this line to save the plot as an image

In [None]:
# Re-read the judgments from disk and rebind df_judgments to the freshly
# loaded non-aggregated frame; the aggregated frame is discarded.
# NOTE(review): presumably this undoes earlier in-notebook modifications of
# df_judgments before the summary below — confirm.
df_judgments, _ = split_dataset()

In [None]:
# Print one summary line per dataset: number of judgments, languages,
# number of distinct lemmas and number of distinct use identifiers.
grouped = df_judgments.groupby('dataset')
data_by_dataset = df_judgments['dataset'].unique()

for data in data_by_dataset:
    subset = grouped.get_group(data)
    total_judgments = len(subset)
    language = str(subset['language'].unique())
    number_of_words = len(subset['lemma'].unique())

    # An identifier may occur in either column; count each one once.
    identifiers_one = subset['identifier1'].unique()
    identifiers_two = subset['identifier2'].unique()
    concatenated = np.concatenate((identifiers_one, identifiers_two))
    unique_identifiers = len(set(concatenated.tolist()))

    print(f"dataset: {data}, number of judgments: {total_judgments}, language in this set: {language}, number of words: {number_of_words}, number of sentences: {unique_identifiers}")