In [33]:
# Data handling libraries
import pandas as pd
import re

In [34]:
# Load the reviewed CSV, keep only the rows that carry a LABEL value,
# and discard the 'Controlled by' column.
data = (
    pd.read_csv(
        '../data/clean/sustainability-report-2022-NLB-reviewed.csv',
        sep=',',
        encoding='utf8',
    )
    .dropna(subset=['LABEL'])
    .drop(columns=['Controlled by'])
)

In [35]:
# Replace wrongly encoded characters (UTF-8 bytes that were decoded as a
# single-byte code page) with the characters they stand for.
#
# DataFrame.replace() with a plain string only matches cells whose *entire*
# value equals that string, so regex=True is required to repair sequences that
# occur inside longer text. Non-string cells are left untouched.
#
# Each broken sequence needs its own search string. Order matters: the bare
# "Å" catch-all must run after the longer sequences that start with "Å".
# NOTE(review): the second byte of "Š" decodes to a non-breaking space, which
# is presumably lost upstream, hence the bare "Å" -> "Š" rule; confirm against
# the raw file.
for broken, correct in (
    ("Å¾", "ž"),
    ("Å½", "Ž"),
    ("Å¡", "š"),
    ("Å", "Š"),
    ("Ä‡", "ć"),
):
    # None of the sequences contains a regex metacharacter, so they can be
    # used as patterns verbatim.
    data = data.replace(broken, correct, regex=True)

In [36]:
# Clean the text columns, then make the number separators used in each answer
# agree with the ones used in its context.

# Mojibake sequences and the characters they stand for. Order matters: the
# bare "Å" catch-all must run after the longer sequences starting with "Å".
ENCODING_FIXES = (
    ("Å¾", "ž"),
    ("Å½", "Ž"),
    ("Å¡", "š"),
    ("Å", "Š"),
    ("Ä‡", "ć"),
)


def _fix_encoding(value):
    """Return ``str(value)`` with every known mojibake sequence repaired.

    Missing values are stringified as well (``str(nan) == "nan"``), so the
    result is always a ``str``.
    """
    text = str(value)
    for broken, correct in ENCODING_FIXES:
        text = text.replace(broken, correct)
    return text


def _separated_groups(text, separator):
    """Return every digit group that directly follows ``separator`` in a number.

    The separator is escaped, so "." means a literal dot, and it only counts
    when it is preceded by a digit. All numbers in ``text`` are considered.
    """
    return re.findall(r"(?<=\d)" + re.escape(separator) + r"(\d+)", text)


def _swap_separator(text, old, new):
    """Replace ``old`` with ``new`` wherever it sits between two digits.

    Only the separator character is rewritten, so no digit can be lost.
    """
    return re.sub(r"(?<=\d)" + re.escape(old) + r"(?=\d)", new, text)


def _match_context_separator(answer, context):
    """Rewrite the number separators in ``answer`` to the style of ``context``.

    The answer is returned unchanged when it already occurs verbatim in the
    context. Otherwise "." -> "," is tried first, then "," -> ".". A direction
    applies when the answer has digit groups after its separator and every one
    of them also appears after the other separator somewhere in the context.
    """
    if answer in context:
        return answer
    for answer_sep, context_sep in ((".", ","), (",", ".")):
        answer_groups = _separated_groups(answer, answer_sep)
        context_groups = _separated_groups(context, context_sep)
        if (
            answer_groups
            and context_groups
            and all(group in context_groups for group in answer_groups)
        ):
            return _swap_separator(answer, answer_sep, context_sep)
    return answer


# Repair the encoding of the three text columns first ...
for column in ("question", "answer", "context"):
    data[column] = data[column].map(_fix_encoding)

# ... then align the separators using the already repaired strings, so the
# encoding fix is never overwritten by stale text.
data["answer"] = [
    _match_context_separator(answer, context)
    for answer, context in zip(data["answer"], data["context"])
]

# Keep the previous index as a column and renumber the rows from 0.
data = data.reset_index()

# A few lines of manual cleaning are left over; this ensures that all numbers in the data use the same separators for both decimals and thousands.

In [37]:
# For every row, record the character offset at which the answer begins
# inside its context.
for row_label in range(len(data)):
    passage, needle = data.loc[row_label, ["context", "answer"]]
    offset = passage.find(needle)
    if offset < 0:
        # str.find reports "no match" as -1; the row is reported and the -1
        # is still stored below.
        print("Answer not found in text: " + needle)
    data.loc[row_label, "answer_start"] = int(offset)

In [38]:
# Cast the answer_start column to int, then write the frame to a
# semicolon-separated CSV without the row index.
data = data.astype({"answer_start": int})
data.to_csv(
    "../data/clean/sustainability-report-2022-NLB-subset-with-starts.csv",
    sep=";",
    index=False,
)