In [73]:
import os
import parse
import fasttext
import numpy as np
from sklearn.metrics import f1_score

# Directory that holds the fastText-formatted train/dev/test files.
datadir = "/home/peterr/macocu/task5_webgenres/data/final/fasttext1"

# "subset" splits: the files whose names carry onlykeep_True.
dev_subset, test_subset, train_subset = (
    os.path.join(datadir, filename)
    for filename in (
        "dev_onlykeep_True_onlyprimary_False.fasttext",
        "test_onlykeep_True_onlyprimary_False.fasttext",
        "train_onlykeep_True_onlyprimary_False.fasttext",
    )
)

# "full" splits: the files whose names carry onlykeep_False.
dev_full, test_full, train_full = (
    os.path.join(datadir, filename)
    for filename in (
        "dev_onlykeep_False_onlyprimary_False.fasttext",
        "test_onlykeep_False_onlyprimary_False.fasttext",
        "train_onlykeep_False_onlyprimary_False.fasttext",
    )
)

def parse_test_file(path: str):
    """Read a fastText-formatted file and return its labels and texts.

    Every non-blank line is split with the ``parse`` pattern
    ``{label} {text}`` followed by the line terminator.

    Args:
        path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A ``(labels, texts)`` tuple of two equally long lists of strings,
        in file order.

    Raises:
        ValueError: If a non-blank line does not match the pattern.
    """
    p = parse.compile("{label} {text}\n")

    labels, texts = list(), list()
    with open(path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # Blank lines carry neither a label nor a text.
                continue
            if not line.endswith("\n"):
                # The pattern requires the terminator; a final line may lack it.
                line += "\n"
            rez = p.parse(line)
            if rez is None:
                # Fail with the offending location instead of an opaque
                # "'NoneType' object is not subscriptable".
                raise ValueError(
                    f"{path}:{line_number}: line does not match '<label> <text>': {line!r}"
                )
            labels.append(rez["label"])
            texts.append(rez["text"])
    return labels, texts

def prediction_to_label(prediction):
    """Transform predictions as returned by fasttext into pure labels.

    Args:
        prediction: A pair whose first element holds one sequence of labels
            per predicted text (the second element is not used here).

    Returns:
        A 1-D numpy array with the first label of every text.
        NOTE(review): presumably the first label is the most probable one;
        confirm against the fastText ``predict`` documentation.
    """
    # Pick the first label row by row, so texts with different numbers of
    # returned labels (ragged input) are handled as well.
    return np.array([labels[0] for labels in prediction[0]])

# Fixed label order used by binarize_distribution: position i of every
# binarized vector corresponds to all_labels[i], so the order must not change.
# Labels that are not listed here are ignored when a distribution is binarized.
all_labels = [
    "__label__Promotion_of_Services",
    "__label__Instruction",
    "__label__Review",
    "__label__Information/Explanation",
    "__label__Promotion_of_a_Product",
    "__label__News/Reporting",
    "__label__Promotion",
    "__label__Announcement",
    "__label__Invitation",
    "__label__Opinion/Argumentation",
    "__label__Forum",
    "__label__Legal/Regulation",
    "__label__Other",
    "__label__Opinionated_News",
    "__label__Call",
    "__label__List_of_Summaries/Excerpts",
]

def get_true_labels(path):
    """Read a test file and return the gold label distributions and texts.

    The file is expected to hold three consecutive lines per document, all
    carrying the same text: the first two with the primary label and the
    third with the secondary label. The primary label receives weight 2/3
    and the secondary label 1/3, so a document whose secondary label equals
    its primary label ends up with that single label at weight 1.

    Args:
        path: Path of the fastText-formatted file to read; decoded as UTF-8.

    Returns:
        A ``(labels, texts)`` tuple: ``labels`` is a list with one
        ``{"__label__X": weight}`` dict per document, ``texts`` the list of
        the corresponding document texts.

    Raises:
        AssertionError: If the number of non-blank lines is not a multiple
            of three, a line cannot be parsed, the first two labels of a
            document differ, the three texts differ, or the weights do not
            add up to 1.
    """

    with open(path, "r", encoding="utf-8") as f:
        # Blank lines carry no example. The parse pattern needs the line
        # terminator, so restore it on a final line that lacks one.
        content = [
            line if line.endswith("\n") else line + "\n"
            for line in f
            if line.strip()
        ]

    # zip() over the three slices would silently drop an incomplete trailing
    # group, so reject such files explicitly.
    assert len(content) % 3 == 0, (
        f"Expected three lines per document, got {len(content)} non-blank lines in {path!r}"
    )

    pattern = "__label__{label} {text}\n"
    p = parse.compile(pattern)
    labels = list()
    texts = list()
    for l1, l2, l3 in zip(content[::3], content[1::3], content[2::3]):
        r1 = p.parse(l1)
        r2 = p.parse(l2)
        r3 = p.parse(l3)
        # Report the raw line: the parse result itself is just None here.
        for raw_line, parsed in ((l1, r1), (l2, r2), (l3, r3)):
            assert parsed is not None, f"Parsing raised a None for line: {raw_line!r}"
        assert r1["label"] == r2["label"], f"Primary label mismatch! \n{r1=}\n{r2=}\n{r3=}"
        assert r1["text"] == r2["text"]  == r3["text"], f"Text mismatch! \n{r1=}\n{r2=}\n{r3=}"
        texts.append(r2["text"])
        primary_label = "__label__" + r1["label"]
        secondary_label = "__label__" + r3["label"]

        distribution = {primary_label: 2/3}
        # .get() accumulates onto the primary weight when both labels coincide.
        distribution[secondary_label] = distribution.get(secondary_label, 0) + 1/3

        # Compare with a tolerance rather than exact float equality.
        total = sum(distribution.values())
        assert abs(total - 1) < 1e-9, f"Distribution does not add to 1!\n{distribution=}"

        labels.append(distribution)

    return labels, texts

def get_predicted_labels(model, texts):
    """Predict a label distribution for every text.

    Calls ``model.predict(texts, k=-1)`` and pairs up the labels and the
    probabilities it returns for each text.

    Returns a list with one ``{label: probability}`` dictionary per text."""
    predictions = model.predict(texts, k=-1)
    return [
        dict(zip(row_labels, row_probabilities))
        for row_labels, row_probabilities in zip(*predictions)
    ]

def binarize_distribution(distribution, cutoff = 0.1, labels = None):
    """Turn a label distribution into a fixed-order list of 0/1 flags.

    Args:
        distribution: Dict mapping label -> probability (or weight).
        cutoff: A label is flagged 1 when its value is >= cutoff.
        labels: Sequence of labels that fixes the order (and the length) of
            the result. Defaults to the module-level ``all_labels``.

    Returns:
        A list with one entry per label in ``labels``: 1 where the label's
        value reaches the cutoff, 0 otherwise. Labels missing from
        ``distribution`` count as 0; keys of ``distribution`` that are not
        in ``labels`` are ignored.
    """
    if labels is None:
        # Resolved at call time so the current module-level list is used.
        labels = all_labels

    probabilities = [distribution.get(label, 0) for label in labels]

    return [1 if probability >= cutoff else 0 for probability in probabilities]



In [32]:
# Train one classifier per dataset variant; each run autotunes against the
# matching dev file. autotuneDuration is given in seconds per the fastText
# documentation, i.e. 1200 s for each model.
model_full = fasttext.train_supervised(
    input=train_full,
    autotuneValidationFile=dev_full,
    autotuneDuration=1200,
)
model_subset = fasttext.train_supervised(
    input=train_subset,
    autotuneValidationFile=dev_subset,
    autotuneDuration=1200,
)

In [74]:
# Full data: gold distributions, then predictions for the very same texts.
y_true_full, texts_full = get_true_labels(test_full)
y_pred_full = get_predicted_labels(model_full, texts_full)

# Subset data: same two steps with the subset model and test file.
y_true_subset, texts_subset = get_true_labels(test_subset)
y_pred_subset = get_predicted_labels(model_subset, texts_subset)


In [78]:
# Inspect the gold label distribution of the full-test document at index 1.
y_true_full[1]

{'__label__Review': 0.6666666666666666,
 '__label__Information/Explanation': 0.3333333333333333}

In [77]:
# Inspect the predicted label distribution for the same document (index 1).
y_pred_full[1]

{'__label__Opinion/Argumentation': 0.20486838,
 '__label__Information/Explanation': 0.1521792,
 '__label__List_of_Summaries/Excerpts': 0.14388034,
 '__label__Opinionated_News': 0.13404168,
 '__label__News/Reporting': 0.08886625,
 '__label__Other': 0.03339771,
 '__label__Forum': 0.032761667,
 '__label__Promotion': 0.031631626,
 '__label__Promotion_of_a_Product': 0.029673748,
 '__label__Review': 0.024865033,
 '__label__Instruction': 0.020232134,
 '__label__Promotion_of_Services': 0.01774978,
 '__label__Correspondence': 0.016739119,
 '__label__Legal/Regulation': 0.01468199,
 '__label__Invitation': 0.012260729,
 '__label__Research_Article': 0.006893041,
 '__label__Call': 0.006548532,
 '__label__Announcement': 0.006082948,
 '__label__Interview': 0.005877393,
 '__label__Prose': 0.0058747143,
 '__label__Promotion_of_services': 0.0029758418,
 '__label__Opinionated_news': 0.0029177745,
 '__label__Recipe': 0.002812528,
 '__label__Promotion_of_a_product': 0.0024278548}

In [76]:
# Binarize that gold distribution with the default cutoff (0.1); positions
# follow the order of all_labels.
binarize_distribution(y_true_full[1])

[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [79]:
# Binarize every gold and predicted distribution with the default cutoff.
y_true_full_binary = list(map(binarize_distribution, y_true_full))
y_true_subset_binary = list(map(binarize_distribution, y_true_subset))
y_pred_full_binary = list(map(binarize_distribution, y_pred_full))
y_pred_subset_binary = list(map(binarize_distribution, y_pred_subset))

In [89]:
# Scratch check of f1_score's input requirements: `b` holds non-binary values
# (6 and 0.4), and the call is rejected with the ValueError recorded in this
# cell's output.
a = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
b = [0, 0, 0, 0, 6, 0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
f1_score(a,b)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [81]:
# Macro-averaged F1 between the binarized gold and predicted vectors of the
# full test set.
# NOTE(review): binarize_distribution only looks at the labels in all_labels,
# while the full model also predicts labels outside that list (see the
# y_pred_full[1] output) -- confirm that leaving those out is intended.
f1_score(y_true_full_binary, y_pred_full_binary, average="macro")

0.22909610659019153

0.4