In [42]:
import os
import parse
import fasttext
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
# Directory that holds the fastText-formatted splits used below.
datadir = "/home/peterr/macocu/task5_webgenres/data/final/fasttext3"

# Full paths of the evaluation and training files.
test = os.path.join(
    datadir, "test_onlyprimary_False_dedup_True_only_keep_False.fasttext"
)
train = os.path.join(
    datadir, "train_onlyprimary_False_dedup_False_only_keep_True.fasttext"
)

# Label inventory: the position of a label in this list is its integer code.
train_labels = [
    "__label__Legal/Regulation",
    "__label__Opinionated_News",
    "__label__News/Reporting",
    "__label__Forum",
    "__label__Correspondence",
    "__label__Invitation",
    "__label__Instruction",
    "__label__Recipe",
    "__label__Opinion/Argumentation",
    "__label__Promotion_of_Services",
    "__label__Promotion",
    "__label__List_of_Summaries/Excerpts",
    "__label__Promotion_of_a_Product",
    "__label__Call",
    "__label__Review",
    "__label__Other",
    "__label__Information/Explanation",
    "__label__Interview",
    "__label__Prose",
    "__label__Research_Article",
    "__label__Announcement",
]

# Lookup tables in both directions between label strings and integer codes.
NUM_TO_STR = dict(enumerate(train_labels))
STR_TO_NUM = {label: code for code, label in NUM_TO_STR.items()}


def parse_fasttext_file(path: str, encode=True):
    """Read a fastText-formatted file and return it as a DataFrame.

    Every usable line has the shape ``<label> <text>``: everything up to the
    first space is the label, the remainder of the line is the text.  Blank
    lines are ignored; lines that lack a label, a separating space or a text
    are skipped and reported once through a ``UserWarning``.

    Args:
        path: Location of the fastText file (read as UTF-8).
        encode: When true, label strings are replaced by their integer codes
            from the module-level ``STR_TO_NUM`` table; when false the label
            strings are kept.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``labels``, one
        row per parsed line, in file order.

    Raises:
        KeyError: If ``encode`` is true and a label is not in ``STR_TO_NUM``.
    """
    import warnings

    # Alternative capitalisations, normalised to the spelling that appears in
    # ``train_labels``.
    label_aliases = {
        "__label__Promotion_of_services": "__label__Promotion_of_Services",
        "__label__Promotion_of_a_product": "__label__Promotion_of_a_Product",
    }

    labels, texts = [], []
    skipped = 0
    # Explicit encoding: do not depend on the platform's default locale.
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            # Drop only the line terminator, so that a last line without a
            # trailing newline is parsed like every other line.
            line = raw_line[:-1] if raw_line.endswith("\n") else raw_line
            if not line.strip():
                continue
            label, separator, text = line.partition(" ")
            if not (label and separator and text):
                skipped += 1
                continue
            labels.append(label_aliases.get(label, label))
            texts.append(text)

    if skipped:
        warnings.warn(
            f"{skipped} malformed line(s) skipped while parsing {path}",
            stacklevel=2,
        )
    if encode:
        labels = [STR_TO_NUM[label] for label in labels]
    return pd.DataFrame(data={"text": texts, "labels": labels})

# Smoke test: both splits must parse and encode before any expensive training
# starts.  Any exception raised by the parser propagates unchanged.
for filename in (train, test):
    parse_fasttext_file(filename)


def train_model(train_df, NUM_EPOCHS=30):
    """Fine-tune a fresh ``EMBEDDIA/sloberta`` classifier on ``train_df``.

    Args:
        train_df: Training frame, passed unchanged to
            ``ClassificationModel.train_model``.  In this notebook it is the
            ``text``/``labels`` frame produced by ``parse_fasttext_file``.
        NUM_EPOCHS: Number of training epochs.

    Returns:
        The trained ``ClassificationModel`` instance.
    """
    from simpletransformers.classification import ClassificationModel

    # Each option appears exactly once ("overwrite_output_dir" used to be
    # listed twice with the same value).
    model_args = {
        "num_train_epochs": NUM_EPOCHS,
        "learning_rate": 1e-5,
        "overwrite_output_dir": True,
        "train_batch_size": 32,
        "no_save": True,
        "no_cache": True,
        "save_steps": -1,
        "max_seq_length": 512,
        "silent": True,
    }

    model = ClassificationModel(
        "camembert",
        "EMBEDDIA/sloberta",
        # Size the classification head from the label inventory that is also
        # used to encode the labels, so the two cannot drift apart.
        num_labels=len(train_labels),
        use_cuda=True,
        args=model_args,
    )
    model.train_model(train_df)
    return model

def eval_model(test_df, classifier=None):
    """Predict on ``test_df`` and return true and predicted label strings.

    The frame is consumed in groups of three consecutive rows: the text and
    label of the first row of each group form the instance and its ``y_true``
    entry, and the label of the third row becomes its ``y_true_sec`` entry.
    (Presumably the test file repeats every document three times -- TODO
    confirm against the data preparation.)

    Args:
        test_df: Frame with a ``text`` column and an integer-encoded
            ``labels`` column.
        classifier: Object whose ``predict(texts)`` returns a sequence that
            starts with ``(predictions, raw_outputs)``.  When omitted, the
            module-level ``model`` is used, as before.

    Returns:
        Dict with the keys ``y_true``, ``y_pred``, ``y_pred_sec`` and
        ``y_true_sec``, each a list of label strings looked up in
        ``NUM_TO_STR``.  ``y_pred_sec`` is the label with the second-highest
        raw output for each instance.
    """
    clf = model if classifier is None else classifier

    texts = test_df.text.values.tolist()[0::3]
    y_true_enc = test_df.labels[0::3]
    y_true_sec_enc = test_df.labels[2::3]

    # A single forward pass provides both the hard predictions and the raw
    # outputs; running predict twice only doubled the inference time.
    prediction = clf.predict(texts)
    y_pred_enc = prediction[0]
    raw_outputs = prediction[1]
    # argsort is ascending, so column -2 holds the runner-up class.
    y_pred_sec_enc = np.argsort(raw_outputs)[:, -2]

    def decode(codes):
        return [NUM_TO_STR[code] for code in codes]

    return {
        "y_true": decode(y_true_enc),
        "y_pred": decode(y_pred_enc),
        "y_pred_sec": decode(y_pred_sec_enc),
        "y_true_sec": decode(y_true_sec_enc),
    }

import pandas as pd

# One dictionary per training run is collected here.
results = []

# Both splits are parsed once and reused by every run.
test_df = parse_fasttext_file(test)
train_df = parse_fasttext_file(train)

for i in range(5):
    print("Run ", i + 1, "of 5")
    # The name `model` is significant: eval_model reads it as a global.
    model = train_model(train_df)
    rundict = eval_model(test_df)
    # Tag the run with its training and evaluation setup.
    rundict.update(train="OK", eval="test_dd")
    results.append(rundict)
print(results)


Run  1 of 5


Some weights of the model checkpoint at EMBEDDIA/sloberta were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['roberta.pooler.dense.weight', 'classifier.out_proj.wei

Run  2 of 5


Some weights of the model checkpoint at EMBEDDIA/sloberta were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['roberta.pooler.dense.weight', 'classifier.out_proj.wei

Run  3 of 5


Some weights of the model checkpoint at EMBEDDIA/sloberta were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['roberta.pooler.dense.weight', 'classifier.out_proj.wei

Run  4 of 5


Some weights of the model checkpoint at EMBEDDIA/sloberta were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['roberta.pooler.dense.weight', 'classifier.out_proj.wei

Run  5 of 5


Some weights of the model checkpoint at EMBEDDIA/sloberta were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['roberta.pooler.dense.weight', 'classifier.out_proj.wei

[{'y_true': ['__label__Promotion_of_a_Product', '__label__Information/Explanation', '__label__Opinion/Argumentation', '__label__Promotion_of_Services', '__label__Promotion_of_a_Product', '__label__Opinion/Argumentation', '__label__Correspondence', '__label__Opinion/Argumentation', '__label__Opinionated_News', '__label__List_of_Summaries/Excerpts', '__label__Opinion/Argumentation', '__label__Opinionated_News', '__label__Promotion_of_a_Product', '__label__Information/Explanation', '__label__Opinion/Argumentation', '__label__Promotion_of_a_Product', '__label__Information/Explanation', '__label__Invitation', '__label__Promotion_of_a_Product', '__label__Information/Explanation', '__label__Promotion', '__label__Opinionated_News', '__label__Opinion/Argumentation', '__label__Review', '__label__News/Reporting', '__label__Promotion_of_Services', '__label__Announcement', '__label__Instruction', '__label__Invitation', '__label__Instruction', '__label__News/Reporting', '__label__Information/Explana

In [None]:
import json

# Persist the collected run dictionaries as a single JSON document.
with open("backup_16.txt", "w") as backup_file:
    json.dump(results, backup_file)

# Analysis

In [1]:
import pandas as pd
from ast import literal_eval

# The backup holds one list of per-run dictionaries on its first line.
with open("backup_15.txt", "r") as backup_file:
    content = literal_eval(backup_file.readline())

# Turn the list of run dictionaries into a column -> values mapping, using the
# keys of the first run as the column set.
jsonlikecontent = {
    column: [run[column] for run in content] for column in content[0]
}

df = pd.DataFrame(data=jsonlikecontent)
df.tail()

Unnamed: 0,microF1,macroF1,y_true,y_pred,train,eval
0,0.600677,0.529577,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd
1,0.632826,0.564074,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd
2,0.582064,0.464591,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd
3,0.590525,0.491271,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd
4,0.576988,0.52569,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd


In [11]:
# True and predicted labels of the first stored run.
y_true, y_pred = df["y_true"][0], df["y_pred"][0]

def eval_results(y_pred, y_true, own_F1=False):
    """Compute micro and macro F1 on every third item of both sequences.

    Only the items at positions 0, 3, 6, ... of ``y_pred`` and ``y_true`` are
    scored (the "primary" entries; presumably the sequences hold three entries
    per document -- TODO confirm against how the backup was produced).

    Args:
        y_pred: Sequence of predicted label strings.
        y_true: Sequence of true label strings, aligned with ``y_pred``.
        own_F1: Must be falsy.  No custom F1 variant is implemented.

    Returns:
        Tuple ``(micro, macro)`` of F1 scores computed by
        ``sklearn.metrics.f1_score`` with ``labels=train_labels``.

    Raises:
        NotImplementedError: If ``own_F1`` is truthy.  This used to return
            ``None`` silently, which only failed later when the caller tried
            to unpack the result.
    """
    if own_F1:
        raise NotImplementedError("own_F1=True is not implemented")

    from sklearn.metrics import f1_score

    # The secondary slices were never used (and the true-label one repeated
    # the primary stride by mistake), so only the primary slices remain.
    y_pred_primary = y_pred[0::3]
    y_true_primary = y_true[0::3]

    micro = f1_score(y_true_primary, y_pred_primary, labels=train_labels, average="micro")
    macro = f1_score(y_true_primary, y_pred_primary, labels=train_labels, average="macro")

    return micro, macro
mis, mas = [], []

for y_true, y_pred in zip(df.y_true.values, df.y_pred.values):
    # Score each run once and reuse the result for both the printed line and
    # the collected columns (it used to be computed twice per run).
    scores = eval_results(y_pred, y_true)
    print(scores)
    mi, ma = scores
    mis.append(mi)
    mas.append(ma)

df["only_primary_macro"] = mas
df["only_primary_micro"] = mis

df

(0.6192893401015228, 0.5463083261424044)
(0.649746192893401, 0.5845274468255836)
(0.5888324873096447, 0.4726742579373407)
(0.6091370558375635, 0.5084770373758974)
(0.583756345177665, 0.5450833008978373)


Unnamed: 0,microF1,macroF1,y_true,y_pred,train,eval,only_primary_macro,only_primary_micro
0,0.600677,0.529577,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd,0.546308,0.619289
1,0.632826,0.564074,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd,0.584527,0.649746
2,0.582064,0.464591,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd,0.472674,0.588832
3,0.590525,0.491271,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd,0.508477,0.609137
4,0.576988,0.52569,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",OK,test_dd,0.545083,0.583756


In [12]:
import numpy as np

# Per-run primary-label scores added to `df` by the scoring cell.
micro_scores = df["only_primary_micro"]
macro_scores = df["only_primary_macro"]

# Mean and pandas' default standard deviation across the runs.
mimean, mistd = micro_scores.mean(), micro_scores.std()
mamean, mastd = macro_scores.mean(), macro_scores.std()

print(f"micro F1: {mimean:0.3} +/- {mistd:0.3}")
print(f"macro F1: {mamean:0.3} +/- {mastd:0.3}")

micro F1: 0.61 +/- 0.0265
macro F1: 0.531 +/- 0.0424
