This notebook prepares the data in fastText format, including a variant of the data that contains only the paragraphs marked to be kept.

In [1]:
import os
import pandas as pd

# Directory the three split CSV files are read from.
interim_dir = "/home/peterr/macocu/task5_webgenres/data/interim"

# Load every split with the same call; the unpacking order matches the
# order of the file names in the tuple below.
test, train, dev = (
    pd.read_csv(os.path.join(interim_dir, csv_name))
    for csv_name in ("test_1d.csv", "train_1d.csv", "dev_1d.csv")
)

In [2]:
# Distinct primary labels of the training split, in sorted order.
sorted(train["primary"].unique())

['Announcement',
 'Call',
 'Correspondence',
 'Forum',
 'Information/Explanation',
 'Instruction',
 'Interview',
 'Invitation',
 'Legal/Regulation',
 'List of Summaries/Excerpts',
 'News/Reporting',
 'Opinion/Argumentation',
 'Opinionated News',
 'Other',
 'Promotion',
 'Promotion of Services',
 'Promotion of a Product',
 'Prose',
 'Recipe',
 'Research Article',
 'Review']

The current labels will not work for fastText because some of them contain spaces. Spaces will be replaced with underscores.

In [5]:
# Replace spaces in both label columns of every split so that each label
# is a single whitespace-free token.
for df in [test, train, dev]:
    # regex=False makes the replacement a literal string substitution on
    # every pandas version (the default of `regex` differs between pandas
    # 1.x and 2.x).
    df["primary"] = df.primary.str.replace(" ", "_", regex=False)
    df["secondary"] = df.secondary.str.replace(" ", "_", regex=False)
train.primary.unique()

array(['Information/Explanation', 'Opinion/Argumentation',
       'Promotion_of_a_Product', 'List_of_Summaries/Excerpts',
       'Promotion_of_Services', 'News/Reporting', 'Opinionated_News',
       'Announcement', 'Invitation', 'Instruction', 'Forum', 'Prose',
       'Recipe', 'Other', 'Legal/Regulation', 'Promotion', 'Review',
       'Interview', 'Call', 'Correspondence', 'Research_Article'],
      dtype=object)

# Data preparation for the first experiment

For the first experiment we want to prepare data with only primary labels. We need train, dev, and test data in two versions: all of it, and a subset of it where keep == True.

Data will be read from the dataframes and saved into dedicated text files, which fastText can read.


In [7]:
def parse_df(df,
             only_keep=False,
             only_primary=True,
             only_no_duplicates=False):
    """Render the rows of ``df`` as fastText-formatted text.

    Each row must provide the columns ``primary``, ``secondary`` and
    ``paragraphs``. ``paragraphs`` is a string holding a Python literal
    (parsed with ``ast.literal_eval``) that evaluates to a list of dicts;
    every dict needs a ``"text"`` key, plus ``"keep"`` / ``"duplicate"``
    when the corresponding filter is switched on.

    Args:
        df: dataframe with the three columns described above.
        only_keep: when True, use only paragraphs whose ``"keep"`` is True.
        only_primary: when True, the third line of every row carries the
            primary label; otherwise it carries the secondary label
            (or the primary one if the secondary label is missing).
        only_no_duplicates: when True, use only paragraphs whose
            ``"duplicate"`` is False.

    Returns:
        A single string with three ``__label__<label> <text>`` lines per
        row: two with the primary label, then one chosen by
        ``only_primary``. Rows whose paragraphs were all filtered out
        still produce their three lines, with empty text.
    """
    import ast

    chunks = []
    rows = df.loc[:, ["primary", "secondary", "paragraphs"]].values
    for main_label, extra_label, raw_paragraphs in rows:
        items = ast.literal_eval(raw_paragraphs)
        if only_keep == True:
            items = list(filter(lambda par: par["keep"] == True, items))
        if only_no_duplicates == True:
            items = list(filter(lambda par: par["duplicate"] == False, items))

        # Newlines inside a paragraph become spaces so that every emitted
        # example stays on one line; paragraphs are joined with " <p/> ".
        flattened = (par["text"].replace("\n", " ") for par in items)
        text = " <p/> ".join(flattened)

        first = f"__label__{main_label}"
        if pd.isna(extra_label) != True:
            second = f"__label__{extra_label}"
        else:
            second = f"__label__{main_label}"
        last = first if only_primary else second

        chunks.append(f"{first} {text}\n{first} {text}\n{last} {text}\n")
    return "".join(chunks)

final_path = "/home/peterr/macocu/task5_webgenres/data/final/fasttext4"
dfdict = dict(test=test, dev=dev, train=train)

# Create the output directory up front so the writes below cannot fail
# merely because it does not exist yet.
os.makedirs(final_path, exist_ok=True)

# One output file per (deduplicate, only_primary, segment, only_keep)
# combination; the option values are encoded in the file name.
for deduplicate in [True]:
    for only_primary in [True, False]:
        for segment in ["dev", "train", "test"]:
            for only_keep in [False]:
                filename = f"{segment}_onlyprimary_{only_primary}_dedup_{deduplicate}_only_keep_{only_keep}.fasttext"
                df = dfdict[segment]
                content = parse_df(df, only_primary=only_primary, only_no_duplicates=deduplicate, only_keep=only_keep)
                # Explicit UTF-8 so the result does not depend on the
                # locale's default encoding.
                with open(os.path.join(final_path, filename), "w", encoding="utf-8") as f:
                    f.write(content)


# Data preparation with KEEP tag in mind:

In [5]:
def parse_df(df,
             only_keep=False,
             only_primary=True,
             only_no_duplicates=False):
    """Render the rows of ``df`` as fastText-formatted text.

    Each row must provide the columns ``primary``, ``secondary`` and
    ``paragraphs``. ``paragraphs`` is a string holding a Python literal
    (parsed with ``ast.literal_eval``) that evaluates to a list of dicts;
    every dict needs a ``"text"`` key, plus ``"keep"`` / ``"duplicate"``
    when the corresponding filter is switched on.

    Args:
        df: dataframe with the three columns described above.
        only_keep: when true, use only paragraphs whose ``"keep"`` is True.
        only_primary: when true, the third line of every row carries the
            primary label; otherwise it carries the secondary label
            (or the primary one if the secondary label is missing).
        only_no_duplicates: when true, use only paragraphs whose
            ``"duplicate"`` is False. With the default (False) the
            ``"duplicate"`` key is never read.

    Returns:
        A single string with three ``__label__<label> <text>`` lines per
        row: two with the primary label, then one chosen by
        ``only_primary``. Rows whose paragraphs were all filtered out
        still produce their three lines, with empty text.
    """
    import ast

    lines = []
    for primary, secondary, paragraphs in df.loc[:, ["primary", "secondary", "paragraphs"]].values:
        paragraphs = ast.literal_eval(paragraphs)
        # The flag values are compared explicitly (== True / == False) so
        # that non-boolean values such as None are not silently coerced.
        if only_keep:
            paragraphs = [item for item in paragraphs if item["keep"] == True]
        if only_no_duplicates:
            paragraphs = [item for item in paragraphs if item["duplicate"] == False]

        # Newlines inside a paragraph become spaces so that every emitted
        # example stays on one line; paragraphs are joined with " <p/> ".
        text = " <p/> ".join(item["text"].replace("\n", " ") for item in paragraphs)

        prim_label = f"__label__{primary}"
        sec_label = f"__label__{primary if pd.isna(secondary) else secondary}"
        third_label = prim_label if only_primary else sec_label

        # NOTE(review): the primary label is written twice per document,
        # presumably to weight it against the secondary label - confirm.
        lines.extend(f"{label} {text}\n" for label in (prim_label, prim_label, third_label))
    return "".join(lines)

final_path = "/home/peterr/macocu/task5_webgenres/data/final/fasttext2"
dfdict = dict(test=test, dev=dev, train=train)

# Create the output directory up front so the writes below cannot fail
# merely because it does not exist yet.
os.makedirs(final_path, exist_ok=True)

# One output file per (only_keep, only_primary, segment) combination; the
# option values are encoded in the file name.
for only_keep in [True, False]:
    for only_primary in [True]:
        for segment in ["dev", "train", "test"]:
            df = dfdict[segment]
            filename = f"{segment}_onlyprimary_{only_primary}_only_keep_{only_keep}.fasttext"
            content = parse_df(df, only_primary=only_primary, only_keep=only_keep)
            # Explicit UTF-8 so the result does not depend on the locale's
            # default encoding.
            with open(os.path.join(final_path, filename), "w", encoding="utf-8") as f:
                f.write(content)


# Data prep for second experiment

In [33]:
def parse_df(df,
             only_keep=False,
             only_primary=True,
             only_no_duplicates=False):
    """Render the rows of ``df`` as fastText-formatted text, skipping empty rows.

    Each row must provide the columns ``primary``, ``secondary`` and
    ``paragraphs``. ``paragraphs`` is a string holding a Python literal
    (parsed with ``ast.literal_eval``) that evaluates to a list of dicts;
    every dict needs a ``"text"`` key, plus ``"keep"`` / ``"duplicate"``
    when the corresponding filter is switched on.

    Args:
        df: dataframe with the three columns described above.
        only_keep: when True, use only paragraphs whose ``"keep"`` is True.
        only_primary: when True, the third line of every row carries the
            primary label; otherwise it carries the secondary label
            (or the primary one if the secondary label is missing).
        only_no_duplicates: when True, use only paragraphs whose
            ``"duplicate"`` is False.

    Returns:
        A single string with three ``__label__<label> <text>`` lines per
        row: two with the primary label, then one chosen by
        ``only_primary``. Rows left without any paragraph after
        filtering contribute nothing.
    """
    import ast

    def _render(label, body):
        # One fastText example: label, a space, the text, a newline.
        return f"{label} {body}\n"

    pieces = []
    table = df.loc[:, ["primary", "secondary", "paragraphs"]].values
    for main_label, extra_label, raw_paragraphs in table:
        selected = ast.literal_eval(raw_paragraphs)
        if only_keep == True:
            selected = [par for par in selected if par["keep"] == True]
        if only_no_duplicates == True:
            selected = [par for par in selected if par["duplicate"] == False]
        if not selected:
            # Nothing survived the filters: emit no example for this row.
            continue

        # Newlines inside a paragraph become spaces so that every emitted
        # example stays on one line; paragraphs are joined with " <p/> ".
        body = " <p/> ".join(par["text"].replace("\n", " ") for par in selected)

        first = f"__label__{main_label}"
        second = f"__label__{extra_label if pd.isna(extra_label) != True else main_label}"
        if only_primary:
            last = first
        else:
            last = second

        pieces.append(_render(first, body) + _render(first, body) + _render(last, body))
    return "".join(pieces)

final_path = "/home/peterr/macocu/task5_webgenres/data/final/fasttext3"
dfdict = dict(test=test, dev=dev, train=train)

# Create the output directory up front so the writes below cannot fail
# merely because it does not exist yet.
os.makedirs(final_path, exist_ok=True)

# The files to produce, as (segment, only_primary, only_keep, deduplicate).
configurations = [
    ("train", False, True, False),
    ("test", False, False, True),
]

for segment, only_primary, only_keep, deduplicate in configurations:
    filename = f"{segment}_onlyprimary_{only_primary}_dedup_{deduplicate}_only_keep_{only_keep}.fasttext"
    df = dfdict[segment]
    content = parse_df(df, only_primary=only_primary, only_no_duplicates=deduplicate, only_keep=only_keep)
    # Explicit UTF-8 so the result does not depend on the locale's default
    # encoding.
    with open(os.path.join(final_path, filename), "w", encoding="utf-8") as f:
        f.write(content)