Here's useful code on how to calculate statistical significance.

In [None]:
import os
import parse
import fasttext
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
# Directory holding the fastText-formatted splits. The default is the original
# absolute location; set WEBGENRES_DATA_DIR to run the notebook elsewhere
# without editing this cell.
datadir = os.environ.get(
    "WEBGENRES_DATA_DIR",
    "/home/peterr/macocu/task5_webgenres/data/final/fasttext2",
)

test_full = os.path.join(datadir, "test_onlyprimary_True_dedup_False.fasttext")
# NOTE(review): the variable is called "dd" but the file name says
# "only_keep_True" rather than "dedup_True" -- confirm this is the intended file.
test_dd = os.path.join(datadir, "test_onlyprimary_True_only_keep_True.fasttext")
train_OK = os.path.join(datadir, "train_onlyprimary_True_only_keep_True.fasttext")

# The position of a label in this list is its integer id (see the two maps below),
# so the order must not change between training and evaluation.
train_labels = [
    '__label__Legal/Regulation',
    '__label__Opinionated_News',
    '__label__News/Reporting',
    '__label__Forum',
    '__label__Correspondence',
    '__label__Invitation',
    '__label__Instruction',
    '__label__Recipe',
    '__label__Opinion/Argumentation',
    '__label__Promotion_of_Services',
    '__label__Promotion',
    '__label__List_of_Summaries/Excerpts',
    '__label__Promotion_of_a_Product',
    '__label__Call',
    '__label__Review',
    '__label__Other',
    '__label__Information/Explanation',
    '__label__Interview',
    '__label__Prose',
    '__label__Research_Article',
    '__label__Announcement',
]

# Label string <-> integer id lookups derived from the list order.
STR_TO_NUM = {s: i for i, s in enumerate(train_labels)}
NUM_TO_STR = dict(enumerate(train_labels))

# A duplicated label would silently collapse to a single id in STR_TO_NUM.
assert len(STR_TO_NUM) == len(train_labels), "duplicate entry in train_labels"


def parse_fasttext_file(path: str, encode=True):
    """Read a fastText-formatted file into a DataFrame.

    Every record is one line of the form ``<label> <text>``: the label is
    everything before the first space and the text is the rest of the line.
    The last line is accepted whether or not it ends with a newline.

    Args:
        path: Path of the fastText file to read (decoded as UTF-8).
        encode: If True, replace each label string by its integer id from
            ``STR_TO_NUM``; if False, keep the label strings as read.

    Returns:
        pandas.DataFrame with the columns ``text`` and ``labels``, one row per
        well-formed line, in file order.

    Raises:
        KeyError: If ``encode`` is True and a label is not in ``STR_TO_NUM``.
    """
    import warnings

    labels, texts = [], []
    skipped = 0
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines carry no record; ignore them quietly.
                continue
            # Strip only the line terminator so the text itself is left untouched.
            label, sep, text = line.rstrip("\n").partition(" ")
            if sep and label and text:
                labels.append(label)
                texts.append(text)
            else:
                skipped += 1
    if skipped:
        # Malformed lines are still dropped (best effort), but no longer silently.
        warnings.warn(
            f"{path}: skipped {skipped} line(s) not matching '<label> <text>'"
        )
    if encode:
        labels = [STR_TO_NUM[label] for label in labels]
    return pd.DataFrame(data={"text": texts, "labels": labels})

# Smoke check: parse every input file once with label encoding enabled and
# discard the result. Any exception raised by the parser propagates unchanged.
for filename in [train_OK, test_full, test_dd]:
    parse_fasttext_file(filename)


def train_model(train_df, NUM_EPOCHS=30, use_cuda=True):
    """Build a fresh classifier from "EMBEDDIA/sloberta" and train it.

    Args:
        train_df: Training data handed to ``ClassificationModel.train_model``.
            In this notebook it is the frame returned by
            ``parse_fasttext_file`` (columns ``text`` and ``labels``).
        NUM_EPOCHS: Value used for the ``num_train_epochs`` model argument.
        use_cuda: Forwarded to ``ClassificationModel``; defaults to True as
            before. Pass False to run on a machine without a GPU.

    Returns:
        The trained ``ClassificationModel`` instance.
    """
    # Kept as a function-local import; presumably so the heavy dependency is
    # only loaded when a model is actually trained.
    from simpletransformers.classification import ClassificationModel

    model_args = {
        "num_train_epochs": NUM_EPOCHS,
        "learning_rate": 1e-5,
        "train_batch_size": 32,
        "max_seq_length": 512,
        # Each key appears exactly once ("overwrite_output_dir" used to be listed twice).
        "overwrite_output_dir": True,
        "no_save": True,
        "no_cache": True,
        "save_steps": -1,
        "silent": True,
    }

    model = ClassificationModel(
        "camembert",
        "EMBEDDIA/sloberta",
        # Derived from the label list so the output size cannot drift from it.
        num_labels=len(train_labels),
        use_cuda=use_cuda,
        args=model_args,
    )
    model.train_model(train_df)
    return model

def eval_model(test_df, classifier=None):
    """Predict on ``test_df`` and score the predictions with micro and macro F1.

    Args:
        test_df: Frame with a ``text`` column (inputs) and a ``labels`` column
            holding integer label ids (keys of ``NUM_TO_STR``).
        classifier: Object with a ``predict(list_of_texts)`` method whose first
            return value is the sequence of predicted label ids. When omitted,
            the notebook-global ``model`` is used, so existing one-argument
            calls keep working.

    Returns:
        dict with the keys ``microF1``, ``macroF1`` (floats) and ``y_true``,
        ``y_pred`` (lists of label strings).
    """
    if classifier is None:
        # Fallback to the global for backward compatibility; passing the model
        # explicitly avoids depending on hidden notebook state.
        classifier = model

    y_true_enc = test_df.labels
    # predict() returns several values; only the first one is used here.
    y_pred_enc = classifier.predict(test_df.text.tolist())[0]

    # Score on label strings rather than ids so the result is self-describing.
    y_true = [NUM_TO_STR[i] for i in y_true_enc]
    y_pred = [NUM_TO_STR[i] for i in y_pred_enc]

    # `labels=train_labels` fixes the label set that both averages run over.
    microF1 = f1_score(y_true, y_pred, labels=train_labels, average="micro")
    macroF1 = f1_score(y_true, y_pred, labels=train_labels, average="macro")

    return {
        "microF1": microF1,
        "macroF1": macroF1,
        "y_true": y_true,
        "y_pred": y_pred,
    }

import pandas as pd

results = list()

test_full_df = parse_fasttext_file(test_full)
test_dd_df = parse_fasttext_file(test_dd)
train_OK_df = parse_fasttext_file(train_OK)

# Number of independent training runs.
N_RUNS = 5
# Evaluation splits, in the order their results are appended for every run.
eval_sets = [("test_full", test_full_df), ("test_dd", test_dd_df)]

# Experiment: train on the only-keep training split ("ok"), then evaluate the
# same trained model on both test splits.
for i in range(N_RUNS):
    print("Run ", i + 1, "of", N_RUNS)
    # Must be bound to the global name `model`: eval_model() reads it when no
    # classifier is passed in.
    model = train_model(train_OK_df)
    for eval_name, eval_df in eval_sets:
        rundict = eval_model(eval_df)
        rundict["train"] = "ok"
        rundict["eval"] = eval_name
        results.append(rundict)

# The results are only printed, not written to disk.
print(results)

# Analysis of the data

In [1]:
from ast import literal_eval

import pandas as pd

# The first line of the backup file is a Python literal; it is evaluated into a
# sequence of dicts (one per run, judging by the frame displayed below).
with open("backup_12.txt", "r") as f:
    content = literal_eval(f.readline())

# Turn the sequence of records into a column-name -> list-of-values mapping,
# taking the column names from the first record.
jsonlikecontent = {
    key: [record[key] for record in content]
    for key in content[0].keys()
}

df = pd.DataFrame(data=jsonlikecontent)
df.tail()

Unnamed: 0,microF1,macroF1,y_true,y_pred,train,eval
53,0.609137,0.52159,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",dd,test_dd
54,0.54,0.518452,"[__label__Opinion/Argumentation, __label__Opin...","[__label__Opinionated_News, __label__Opinionat...",dd,test_full
55,0.54,0.425843,"[__label__News/Reporting, __label__News/Report...","[__label__News/Reporting, __label__News/Report...",dd,dev_full
56,0.582915,0.499443,"[__label__News/Reporting, __label__News/Report...","[__label__News/Reporting, __label__News/Report...",dd,dev_dd
57,0.619289,0.567954,"[__label__Promotion_of_a_Product, __label__Pro...","[__label__Promotion_of_a_Product, __label__Pro...",dd,test_dd


In [2]:
# Boolean row masks over the results frame, one per condition of interest.
c_eval_full = df["eval"].eq("test_full")
c_eval_dd = df["eval"].eq("test_dd")
c_train_ok = df["train"].eq("ok")

# Runs trained on "ok" and evaluated on "test_full".
df.loc[c_train_ok & c_eval_full]

Unnamed: 0,microF1,macroF1,y_true,y_pred,train,eval
0,0.58,0.596899,"[__label__Opinion/Argumentation, __label__Opin...","[__label__Opinionated_News, __label__Opinionat...",ok,test_full
2,0.585,0.565865,"[__label__Opinion/Argumentation, __label__Opin...","[__label__Opinionated_News, __label__Opinionat...",ok,test_full
4,0.555,0.493954,"[__label__Opinion/Argumentation, __label__Opin...","[__label__Opinionated_News, __label__Opinionat...",ok,test_full
6,0.565,0.57256,"[__label__Opinion/Argumentation, __label__Opin...","[__label__Opinionated_News, __label__Opinionat...",ok,test_full
8,0.515,0.523911,"[__label__Opinion/Argumentation, __label__Opin...","[__label__Opinionated_News, __label__Opinionat...",ok,test_full


In [3]:
import numpy as np
from IPython.display import display

# Setups to summarise. "train"/"eval" are the values looked up in the grouped
# statistics below; "constraints" holds the equivalent boolean row masks.
evaluation = {
    "full_on_full": {
        "description": "trained on full, evaluated on full",
        "constraints": [df.train == "full", df["eval"] == "test_full"],
        "train": "full",
        "eval": "test_full",
    },
    "dedup_on_dedup": {
        "description": "trained on deduplicated, evaluated on deduplicated",
        "constraints": [df.train == "dd", df["eval"] == "test_dd"],
        "train": "dd",
        "eval": "test_dd",
    },
    "only_keep_on_dedup": {
        "description": "trained on only keep == True, evaluated on deduplicated",
        "constraints": [df.train == "ok", df["eval"] == "test_dd"],
        "train": "ok",
        "eval": "test_dd",
    },
    "only_keep_on_full": {
        "description": "trained on only keep == True, evaluated on full",
        "constraints": [df.train == "ok", df["eval"] == "test_full"],
        "train": "ok",
        "eval": "test_full",
    },
}

METRIC_COLUMNS = ["microF1", "macroF1"]

# Aggregate the score columns only. The frame also carries the list-valued
# y_true / y_pred columns, and recent pandas versions raise when asked to take
# the mean/std of such columns instead of silently dropping them.
runs_by_setup = df.groupby(by=["train", "eval"])[METRIC_COLUMNS]
stds = runs_by_setup.std()
means = runs_by_setup.mean()


def _per_setup(table, metric):
    """Return `metric` from `table` for each setup in `evaluation`, in dict order."""
    return [
        table.loc[(setup["train"], setup["eval"]), metric]
        for setup in evaluation.values()
    ]


descs = [setup.get("description") for setup in evaluation.values()]
mimeans = _per_setup(means, "microF1")
mameans = _per_setup(means, "macroF1")
mistd = _per_setup(stds, "microF1")
mastd = _per_setup(stds, "macroF1")

# One row per setup, each score rendered as "mean +/- std" with 3 significant digits.
anal_summary = pd.DataFrame(data={
    "description": descs,
    "micro F1": [f"{m:0.3} +/- {s:0.3}" for m, s in zip(mimeans, mistd)],
    "macro F1": [f"{m:0.3} +/- {s:0.3}" for m, s in zip(mameans, mastd)],
})
print(anal_summary.to_markdown())


|    | description                                             | micro F1         | macro F1         |
|---:|:--------------------------------------------------------|:-----------------|:-----------------|
|  0 | trained on full, evaluated on full                      | 0.615 +/- 0.0235 | 0.613 +/- 0.0281 |
|  1 | trained on deduplicated, evaluated on deduplicated      | 0.623 +/- 0.0137 | 0.559 +/- 0.0463 |
|  2 | trained on only keep == True, evaluated on deduplicated | 0.673 +/- 0.0189 | 0.65 +/- 0.0299  |
|  3 | trained on only keep == True, evaluated on full         | 0.56 +/- 0.0278  | 0.551 +/- 0.0412 |


## Let's check if `train:ok, test:dd` is better than `train:dd, test:dd`:

In [4]:
import numpy as np
from scipy import stats

# Number of runs per setup that enter the comparison.
N_RUNS_COMPARED = 5

c_eval_full = df["eval"] == "test_full"
c_eval_dd = df["eval"] == "test_dd"
c_train_full = df["train"] == "full"
c_train_dd = df["train"] == "dd"
c_train_ok = df["train"] == "ok"

# `.iloc` makes the "first N runs" selection explicitly positional; the row
# labels of the filtered frame are not consecutive, so a plain [0:5] slice on
# the Series is easy to misread as label-based.
# Candidate: macro F1 of the runs trained on "ok", evaluated on "test_dd".
higher = df.loc[c_train_ok & c_eval_dd, "macroF1"].iloc[:N_RUNS_COMPARED]
# Alternative: macro F1 of the runs trained on "dd", evaluated on "test_dd".
lower = df.loc[c_train_dd & c_eval_dd, "macroF1"].iloc[:N_RUNS_COMPARED]

# Wilcoxon signed-rank is a paired test: the two samples are matched by
# position, so both must have the same length.
wilcoxon_p = stats.wilcoxon(higher, lower, alternative='greater')[1]
print(f"Wilcoxon p value: {wilcoxon_p:0.3}", "\t\t(alternative hypothesis: first is greater than the second)")

mannwhitney_p = stats.mannwhitneyu(higher, lower, alternative='greater')[1]
print(f"MannWhithey p value: {mannwhitney_p:0.3}", "\t\t(alternative hypothesis: first is greater than the second)")

# NOTE: unlike the two tests above, no `alternative` is passed here, so this
# call uses scipy's default for ttest_ind.
student_p = stats.ttest_ind(higher, lower)[1]
print(f"Student p value: {student_p:0.3}", "\t\t(null hypothesis: samples have identical average, equal variance is assumed but not necessary)")

print(f"Higher average: {np.mean(higher):0.4}, lower average: {np.mean(lower):0.4}")

Wilcoxon p value: 0.0312 		(alternative hypothesis: first is greater than the second)
MannWhithey p value: 0.0108 		(alternative hypothesis: first is greater than the second)
Student p value: 0.0137 		(null hypothesis: samples have identical average, equal variance is assumed but not necessary)
Higher average: 0.6498, lower average: 0.5643
