This notebook will prepare the data in fasttext format.

In [1]:
import os
import pandas as pd

# Directory that holds the interim CSV splits produced earlier in the pipeline.
interim_dir = "/home/peterr/macocu/task5_webgenres/data/interim"

# Read the three splits in one pass; each one is stored as "<split>_2c.csv".
test, train, dev = (
    pd.read_csv(os.path.join(interim_dir, f"{split}_2c.csv"))
    for split in ("test", "train", "dev")
)

In [2]:
# Distinct primary labels present in the test split.
test["primary"].unique()

array(['Promotion of a Product', 'Review', 'Opinion/Argumentation',
       'Other', 'News/Reporting', 'Information/Explanation',
       'Promotion of Services', 'Forum', 'Opinionated News', 'Invitation',
       'List of Summaries/Excerpts', 'Instruction', 'Promotion', 'Call',
       'Legal/Regulation', 'Announcement'], dtype=object)

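fastText cannot use these labels as they are: it splits every line on whitespace, so a label containing spaces would be broken into several tokens. Spaces will therefore be replaced with underscores.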

In [3]:
# Swap spaces for underscores in both label columns of every split.
for df in [test, train, dev]:
    for label_column in ("primary", "secondary"):
        df[label_column] = df[label_column].str.replace(" ", "_")
# Show the resulting primary labels of the test split.
test["primary"].unique()

array(['Promotion_of_a_Product', 'Review', 'Opinion/Argumentation',
       'Other', 'News/Reporting', 'Information/Explanation',
       'Promotion_of_Services', 'Forum', 'Opinionated_News', 'Invitation',
       'List_of_Summaries/Excerpts', 'Instruction', 'Promotion', 'Call',
       'Legal/Regulation', 'Announcement'], dtype=object)

# Data preparation for the first experiment

For the first experiment we want to prepare data with only primary labels. We need train, dev, and test data in two versions: the full data, and a subset of it containing only the paragraphs where `keep == True`. In anticipation of forthcoming experiments I also included the logic for working with the full set of labels (primary and secondary) using the agreed-upon weighting system.

Data will be read from the dataframes and saved into dedicated text files that fastText can read.

<span style="color:red">When the secondary label is missing, I include the primary label again.</span> Is this OK?

In [57]:
def parse_df(df, only_keep=False, only_primary=True, only_no_duplicates=False):
    """Serialise a labelled dataframe into fastText-style file content.

    Every row of ``df`` produces three lines of the form
    ``__label__<label> <text>``. The first two always carry the primary
    label. The third carries the primary label again when ``only_primary``
    is true, otherwise the secondary label; a missing secondary label falls
    back to the primary one.

    Args:
        df: Dataframe with the columns ``primary``, ``secondary`` and
            ``paragraphs``. A ``paragraphs`` cell is either a Python-literal
            string (as read back from CSV) or an already-parsed list of
            dicts with the keys ``text``, ``keep`` and ``duplicate``.
        only_keep: If true, drop paragraphs whose ``keep`` flag is falsy.
        only_primary: If true, the third line repeats the primary label
            instead of using the secondary one.
        only_no_duplicates: If true, drop paragraphs whose ``duplicate``
            flag is truthy.

    Returns:
        A single string with all lines, each terminated by a newline.
        An empty dataframe yields an empty string.
    """
    import ast

    lines = []
    for primary, secondary, paragraphs in zip(
        df["primary"], df["secondary"], df["paragraphs"]
    ):
        # CSV round-trips store the paragraph list as its repr; parse it back.
        # Already-parsed lists are used as they are.
        if isinstance(paragraphs, str):
            paragraphs = ast.literal_eval(paragraphs)
        if only_keep:
            paragraphs = [item for item in paragraphs if item["keep"]]
        if only_no_duplicates:
            paragraphs = [item for item in paragraphs if not item["duplicate"]]

        # One document per line: newlines inside a paragraph become spaces
        # and paragraphs are separated by the " <p/> " marker.
        text = " <p/> ".join(item["text"].replace("\n", " ") for item in paragraphs)

        primary_line = f"__label__{primary} {text}\n"
        if only_primary or pd.isna(secondary):
            third_line = primary_line
        else:
            third_line = f"__label__{secondary} {text}\n"

        # Collect the pieces and join once at the end instead of growing a
        # string with += on every row.
        lines += [primary_line, primary_line, third_line]
    return "".join(lines)

# Output directory for the generated fastText files.
final_path = "/home/peterr/macocu/task5_webgenres/data/final/fasttext1"
dfdict = dict(test=test, dev=dev, train=train)

# Create the output directory on a fresh checkout instead of failing on open().
os.makedirs(final_path, exist_ok=True)

# Write one file per combination of (dedup, keep, only_primary, split);
# the chosen options are encoded in the file name.
for duplicate in [True, False]:
    for keep in [True, False]:
        for only_primary in [True, False]:
            for segment in ["dev", "train", "test"]:
                filename = f"{segment}_onlykeep_{keep}_onlyprimary_{only_primary}_dedup_{duplicate}.fasttext"
                df = dfdict[segment]
                content = parse_df(df, only_keep = keep, only_primary=only_primary, only_no_duplicates=duplicate)

                # The text contains non-ASCII characters, so pin the encoding
                # to UTF-8 rather than relying on the platform default.
                with open(os.path.join(final_path, filename), "w", encoding="utf-8") as f:
                    f.write(content)


In [8]:
# Raw (still string-encoded) paragraph list of the first test document.
test["paragraphs"][0]

"[{'text': 'Dobavljivost', 'duplicate': False, 'keep': True}, {'text': 'Cena', 'duplicate': False, 'keep': True}, {'text': 'Cena', 'duplicate': False, 'keep': True}, {'text': '33,00 €', 'duplicate': False, 'keep': True}, {'text': 'Količina', 'duplicate': False, 'keep': True}, {'text': 'S kompresijsko vrečo Nova pakirano padalo zaseda manj prostora in je zaščiteno pred znojem, z bolj ploskim pakiranjem pa je prenašanje padala v nahrbtniku udobnejše. Vreča ima ventil, da lahko iz nje iztisnemo odvečni zrak.', 'duplicate': False, 'keep': True}, {'text': 'Podrobnosti', 'duplicate': False, 'keep': True}, {'text': '100% vodotesna, padalo varuje pred znojem in umazanijo', 'duplicate': False, 'keep': True}, {'text': 'Teža: ok. 120 g', 'duplicate': False, 'keep': True}, {'text': 'Prostornina: ok. 30 l', 'duplicate': False, 'keep': True}]"