In [1]:
import os
import json
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON



In [2]:
filename = "assignments_from_pool_37642729__16-02-2023"

# translations of the search topics from English to Russian
en_ru_dict = {
    "Male actor born after 1950": "Актёры-мужчины родившиеся после 1950 года",
    "ColdWar politician": "Политики времён холодной войны",
    "Company": "Компании",
    "Soccer club": "Футбольные клубы",
    "Beverage": "Напитки",
    "Active political parties(last 20 years)": "Действующие политические партии (последние 20 лет)",
    "Natural disaster": "Природные катастрофы",
    "Magazine": "Журналы",
    "Nominee of the Academy Award for Best Actor": "Номинанты на Оскар за лучшую мужскую роль",
    "Museum": "Музеи",
    "Mountain higher than 4000m": "Горы выше чем 4000м",
    "Active political party": "Действующие политические партии",
    "WWII politician": "Политики времён второй мировой войны",
    "News presenter": "Телеведущие",
    "Film festival": "Кинофестивали",
    "Newspaper": "Газеты",
    "Lake with the area more than 100 square km": "Озёра с площадью более 100 квадратных км."
}

# invert a dict
ru_en_dict = {v: k for k, v in en_ru_dict.items()}

In [4]:
df_raw = pd.read_csv(f"/home/ins-alex/Downloads/assignments_from_pool_41212285__24-09-2023(3).tsv", sep="\t")

# Count the correctenss of the control tasks

In [5]:
# filter out rows where column GOLDEN:least is NaN
df = df_raw[df_raw["GOLDEN:least"].notna()]

# calculate how many times value from GOLDEN:least equals OUTPUT:least
res = (df["GOLDEN:least"] == df["OUTPUT:least"]).value_counts().reset_index()
res.columns = ["is_equal", "count"]
res_dict = res.set_index("is_equal").to_dict()["count"]

print("Quality of the results:" + str(res_dict[True] / (res_dict[True] + res_dict[False])))

Quality of the results:0.7564247921390779


In [6]:
df_raw = df_raw[~df_raw["GOLDEN:least"].notna()] # remove control tasks from the dataset

# Prepare data for the external SHR script

## Full

In [7]:
# leave only important columns
df_shr = df_raw[["INPUT:topic", "INPUT:answer_1", "INPUT:answer_2", "INPUT:answer_3", "INPUT:answer_4", "OUTPUT:most", "OUTPUT:least"]]

In [21]:
df_shr_dict = {
    "Item1": list(),
    "Item2": list(),
    "Item3": list(),
    "Item4": list(),
    "BestItem": list(),
    "WorstItem": list()
}

for i, row in df_shr.iterrows():
    df_shr_dict["Item1"].append(row["INPUT:answer_1"])
    df_shr_dict["Item2"].append(row["INPUT:answer_2"])
    df_shr_dict["Item3"].append(row["INPUT:answer_3"])
    df_shr_dict["Item4"].append(row["INPUT:answer_4"])
    
    best_idx = row["OUTPUT:most"]
    worst_idx = row["OUTPUT:least"]
    
    df_shr_dict["BestItem"].append(row[f"INPUT:answer_{best_idx}"])
    df_shr_dict["WorstItem"].append(row[f"INPUT:answer_{worst_idx}"])

df_shr_full = pd.DataFrame(df_shr_dict)
df_shr_full.to_csv("/home/ins-alex/Downloads/assignments_from_pool_41212285__24-09-2023_shr_full.tsv", sep=",", index=False)

## Category Wise

In [9]:
for topic in df_shr["INPUT:topic"].unique():
    df_shr_topic = df_shr[df_shr["INPUT:topic"] == topic]

    df_shr_dict = {
        "Item1": list(),
        "Item2": list(),
        "Item3": list(),
        "Item4": list(),
        "BestItem": list(),
        "WorstItem": list()
    }

    for i, row in df_shr_topic.iterrows():
        df_shr_dict["Item1"].append(row["INPUT:answer_1"])
        df_shr_dict["Item2"].append(row["INPUT:answer_2"])
        df_shr_dict["Item3"].append(row["INPUT:answer_3"])
        df_shr_dict["Item4"].append(row["INPUT:answer_4"])
        
        best_idx = row["OUTPUT:most"]
        worst_idx = row["OUTPUT:least"]
        
        df_shr_dict["BestItem"].append(row[f"INPUT:answer_{best_idx}"])
        df_shr_dict["WorstItem"].append(row[f"INPUT:answer_{worst_idx}"])

    df_shr_topic = pd.DataFrame(df_shr_dict)
    df_shr_topic.to_csv(f"../../data/final-crowdsourcing-experiments/ru/topics/{topic}.csv", sep=",", index=False)

# Create files with scores

## Count SHRs for topics

In [16]:
import re
import subprocess

topics_dir = "/home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics"
df_dct = {"topic": [], "Spearman": [], "Pearson": []}
for file in os.listdir(topics_dir):
    file_path = os.path.join(topics_dir, file.replace(" ", "\ "))
    output = subprocess.check_output(f"/home/ins-alex/multilingual-qa-with-ranking/external-scripts/Best-Worst-Scaling-Scripts/SHR-BWS.pl {file_path}", shell=True, universal_newlines=True)
    lst = output.split("\n")
    pattern_1 = r"Spearman correlation: ([\d.]+)"
    pattern_2 = r"Pearson correlation: ([\d.]+)"    

    match_1 = re.search(pattern_1, lst[0]).group(1)
    match_2 = re.search(pattern_2, lst[1]).group(1)

    df_dct["topic"].append(file.replace(".csv", ""))
    df_dct["Spearman"].append(match_1)
    df_dct["Pearson"].append(match_2)

Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/англичане.csv ...
Read 300 annotations.
Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/штат США.csv ...
Read 204 annotations.
Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/специалист в области информатики.csv ...
Read 180 annotations.
Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/американцы США.csv ...
Read 332 annotations.
Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/турки.csv ...
Read 312 annotations.
Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/спортивный симулятор.csv ...
Read 421 annotations.
Reading the 

In [18]:
pd.DataFrame.from_dict(df_dct).to_csv("../../data/final-crowdsourcing-experiments/ru/results/category-wise-SHR.csv", sep=',', index=False)

## Count category-wise-scores

In [19]:
topics_dir = "/home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics"

for file in os.listdir(topics_dir):
    file_path = os.path.join(topics_dir, file.replace(" ", "\ "))
    output_path = os.path.join("/home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics-scores", file.replace(" ", "\ "))
    output = subprocess.check_output(f"/home/ins-alex/multilingual-qa-with-ranking/external-scripts/Best-Worst-Scaling-Scripts/get-scores-from-BWS-annotations-counting.pl {file_path} > {output_path}", shell=True, universal_newlines=True)

Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/англичане.csv ...
Read 300 annotations.
Found 13 unique items.

Writing the scores to STDOUT ...
Finished.

Writing the scores to OUTF ...
Finished.
Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/штат США.csv ...
Read 204 annotations.
Found 17 unique items.

Writing the scores to STDOUT ...
Finished.

Writing the scores to OUTF ...
Finished.
Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/специалист в области информатики.csv ...
Read 180 annotations.
Found 15 unique items.

Writing the scores to STDOUT ...
Finished.

Writing the scores to OUTF ...
Finished.
Reading the annotation file /home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics/американцы США.csv ...
Read 332 annotations.
Found 14 u

# Merge scores with wikidata ids and topics

In [25]:
def find_topic_uri(label, file):
    with open(file + ".json") as f:
        data = json.load(f)

    try:
        uri = next(key for key, val in data.items() if val == label)
    except:
        return ""
    
    try:
        return "http://www.wikidata.org/entity/" + uri
    except:
        print(label)
        return ""

In [30]:
scores_path = "/home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru/topics-scores"
main_path = "/home/ins-alex/multilingual-qa-with-ranking/data/final-crowdsourcing-experiments/ru"
for file in os.listdir(scores_path):
    print(file)
    df = pd.read_csv(os.path.join(scores_path, file), sep='\t', header=None)
    df["URI"] = df[0].apply(lambda x: find_topic_uri(x, os.path.join(main_path, file.replace(".csv", "").replace(" ", "-"))))
    df["topic"] = file.replace(".csv", "")
    df.to_csv(os.path.join(scores_path, file), sep='\t', index=False, header=None)

англичане.csv
штат США.csv
специалист в области информатики.csv
американцы США.csv
турки.csv
спортивный симулятор.csv
рождественский фильм.csv
животные.csv
политолог.csv
криминальный телесериал.csv
научная фантастика на телевидении.csv
литература ужасов.csv
боевые искусства.csv
музыкальная компьютерная игра.csv
бас гитарист.csv
музей современного искусства.csv
президент США.csv
актёр.csv
приключенческий фильм.csv
action.csv
фильм тайна.csv
рэпер.csv
философ.csv
спортсмен.csv
многопользовательская игра.csv
шпионский фильм.csv
гора.csv
инженер.csv
математик.csv
архитектор.csv
китайцы.csv
криминальный фильм.csv
афроамериканцы.csv
баскетболист.csv
музыкальный жанр.csv
скульптура.csv
индийцы.csv
фильм ужасов.csv
100 женщин.csv
немцы.csv
фильм сказка.csv
гитарист.csv
итальянцы.csv
боевик.csv
спортивный фильм.csv
теннисист.csv
компьютерная стратегическая игра.csv
борец за права женщин.csv
учёный.csv
джазмен.csv
модельер.csv
исторический фильм.csv
растения.csv
пианист.csv
комедийный телесериал

In [33]:
# Directory containing the .csv files
directory = '../../data/final-crowdsourcing-experiments/ru/topics-scores'

# Output file path
output_file = '../../data/final-crowdsourcing-experiments/ru/results/final-scores.csv'

# Initialize an empty DataFrame to store the merged data
merged_data = pd.DataFrame()

# Loop through each file in the directory
for file in os.listdir(directory):
    if file.endswith('.csv'):
        # Read the current file
        file_path = os.path.join(directory, file)
        data = pd.read_csv(file_path, sep='\t', header=None)
        
        # Append the data to the merged DataFrame
        merged_data = merged_data.append(data, ignore_index=True)

# Write the merged data to the output file
merged_data.to_csv(output_file, index=False)

  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data, ignore_index=True)
  merged_data = merged_data.append(data,