In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Prepare the datasets for training and evaluation with the fastText model

In [3]:
# Load the X-GENRE datasets from Hugging Face
train = load_dataset("TajaKuzman/X-GENRE-text-genre-dataset", "train")


# To open them as Pandas DataFrame:
df_train = pd.DataFrame(train["train"])

print(df_train.shape)

(1772, 4)


In [4]:
# Load the test datasets from the GitHub repositories (access to them is obtained by request to the AGILE repository owner)

en_ginco = pd.read_json("../../datasets/EN-GINCO-test-dataset/EN-GINCO.jsonl", lines=True)
x_ginco = pd.read_json("../../datasets/X-GINCO-test-set/X-GINCO.jsonl", lines=True)

print(en_ginco.shape, x_ginco.shape)

(272, 4) (790, 6)


In [5]:
x_ginco.head(3)

Unnamed: 0,text,labels,language,dataset,text_id,translation
0,"Angelo Chetcuti, se jkun qed jieħu post Bjorn ...",News,Maltese,MaCoCu,macocu.mt.402244,"Angelo Chetcuti, will be replacing Bjorn Vassa..."
1,Poltergeist jirreferi għal fenomeni oħra tal-m...,Opinion/Argumentation,Maltese,MaCoCu,macocu.mt.377203,"Poltergeist refers to other woman's phenomena,..."
2,Chrysler: Brand ta 'lussu jew le? \n\nBrand ji...,Opinion/Argumentation,Maltese,MaCoCu,macocu.mt.109995,Chrysler: Luxury brand or not?\n\nBrand moves ...


In [6]:
df_train.head(3)

Unnamed: 0,text,labels,dataset,language
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English


In [7]:
# Creating FastText train and test files

def fastText_files(df_train, x_ginco, en_ginco):
    """
    This function creates and saves the test and train file(s).
    
    The function returns a list of the following elements:
        - labels - which can be used for prediction and evaluation.
        - train file path
        - test file path
    """
    x_ginco = x_ginco[["text", "labels"]]
    en_ginco = en_ginco[["text", "labels"]]
    
    # Assure that the text contains no new lines
    x_ginco["text"] =  [text.replace("\n", "") for text in x_ginco.text.to_list()]
    en_ginco["text"] =  [text.replace("\n", "") for text in en_ginco.text.to_list()]
    df_train["text"] =  [text.replace("\n", "") for text in df_train.text.to_list()]

    print("The shape of the dataframes:")
    print(df_train.shape, x_ginco.shape, en_ginco.shape)
    
    # Then create CSV files which FastText can read
    
    train_file_content=""

    for labels, text in df_train.loc[:, ["labels", "text"]].values:
        label = f"__label__{labels}"
        train_file_content += f"""{label} {text}\n"""
    
    train_path = ""
    train_path = "data/x-genre-fasttext.train"

    with open(train_path,"w") as train_file:
        train_file.write(train_file_content)
    
    train_example = open(train_path,"r").readlines()
    print("Created train file:")
    print(train_example[:2])
    print("Number of lines: {}".format(len(train_example)))
    
    test_file_content_en=""
    
    for labels, text in en_ginco.loc[:, ["labels", "text"]].values:
        label = f"__label__{labels}"
        test_file_content_en += f"""{label} {text}\n"""
    
    test_path_en = ""
    test_path_en = "data/test-file-fasttext-en-ginco.test"
    
    with open(test_path_en,"w") as test_file_en:
        test_file_en.write(test_file_content_en)
    
    test_example = open(test_path_en,"r").readlines()
    print("Created test file:")
    print(test_example[:2])
    print("Number of lines: {}".format(len(test_example)))

    test_file_content_x=""
    
    for labels, text in x_ginco.loc[:, ["labels", "text"]].values:
        label = f"__label__{labels}"
        test_file_content_x += f"""{label} {text}\n"""
    
    test_path_x = ""
    test_path_x = "data/test-file-fasttext-x-ginco.test"
    
    with open(test_path_x,"w") as test_file_x:
        test_file_x.write(test_file_content_x)
    
    test_example = open(test_path_x,"r").readlines()
    print("Created test file:")
    print(test_example[:2])
    print("Number of lines: {}".format(len(test_example)))

    
    # Finally, create a list of labels which can be used for prediction and evaluation.
    # Let's inspect the labels:
    all_df_labels = df_train["labels"].unique().tolist()
    
    for i in x_ginco["labels"].unique().tolist():
        if i not in all_df_labels:
            all_df_labels.append(i)
    
    for i in en_ginco["labels"].unique().tolist():
        if i not in all_df_labels:
            all_df_labels.append(i)

    print(f"Number of all labels: {len(all_df_labels)}")
    
    # Create a final list of labels in a FastText-appropriate format:
    LABELS = df_train.labels.unique().tolist()
    LABELS = [f"__label__{i}" for i in LABELS]
    
    return_list = [LABELS, train_path, test_path_en, test_path_x]
    print(f"The function returned the following list: {return_list}")
    
    return return_list

fasttext_dict = fastText_files(df_train, x_ginco, en_ginco)

print(fasttext_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_ginco["text"] =  [text.replace("\n", "") for text in x_ginco.text.to_list()]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  en_ginco["text"] =  [text.replace("\n", "") for text in en_ginco.text.to_list()]


The shape of the dataframes:
(1772, 4) (790, 2) (272, 2)
Created train file:
['__label__Other Seeking All Things Brilliant "I want people to become curious about the stories that abound in their neighborhoods and cities. I want to create a \'culture of honour\' where we are all intentional about pulling out the good, the noble, the beautiful around us." I first met Jenn Co on Robson Street where my friend Keela and I interviewed her for a national documentary about church decline. Dressed like a fashion frontrow champ, she was articulate beyond our wildest hopes and savvy to boot. I liked her immediately. Fast forward seven years and she\'s the same, except brighter and bolder, if that is even possible. While Jenn\'s charted a course as a fashion stylist for years, today she\'s producing Anything Brilliant , a pilot television show that celebrates good wherever she can find it. It was a sitdown with footware magnate John Fluevog that led to her current trajectory. "I went to this churc

In [27]:
def prediction_to_label(prediction):
    """Transforms predictions as returned by fasttext into pure labels."""
    return np.array(prediction[0])[:, 0]

# Parsing test file
def parse_test_file(path: str):
    """Reads fasttext formatted file and returns labels, texts."""
    with open(path, "r") as f:
        content = f.readlines()
    pattern = "{label} {text}\n"
    p = parse.compile(pattern)

    labels, texts = list(), list()
    for line in content:
        rez = p.parse(line)
        if rez is not None:
            labels.append(rez["label"])
            texts.append(rez["text"])
        else:
            print("error parsing line ", line)
    return labels, texts

# Train fasttext model   
model = ft.train_supervised(input="data/x-genre-fasttext.train",
                            epoch = 350,
                            wordNgrams=1,
                            verbose = 2
                                        )

Read 1M words
Number of words:  163798
Number of labels: 9
Progress: 100.0% words/sec/thread:  145067 lr:  0.000000 avg.loss:  0.178346 ETA:   0h 0m 0s


In [28]:
# Parse the test files so that labels and texts are separated
y_true, y_texts = parse_test_file("data/test-file-fasttext.test")

# Evaluate the model on test data
y_pred = model.predict(y_texts)
y_pred = prediction_to_label(y_pred)

y_pred = [x.replace("__label__", "") for x in y_pred]

# Add the y_pred to the test_df
df_test["FastText"] = y_pred

## Evaluating the models

In [4]:
# Import the test df with results
df_test = pd.read_csv("data/main_test_file_with_predictions-additional-baseline-classifiers.csv", index_col = 0)
df_test.head(2)

Unnamed: 0,text,true_label,id,dataset,truncated_texts,true_label_forChatGPT,gpt-3.5-turbo,mt0,text_prompts,mt0_xl,X-GENRE,X-GENRE-confidence,dummy-classifier-stratified,nb-complement,logistic-regression,svc,FastText
0,Welcome to KBismarck.org! This is a community ...,Information/Explanation,en_ginco_0,EN-GINCO,Welcome to KBismarck.org! This is a community ...,Information,Forum,Forum,Please classify the following text according t...,Forum,Forum,"[('Forum', 0.9329903455731817), ('Other', 0.05...",Information/Explanation,Instruction,Opinion/Argumentation,Promotion,Promotion
1,Why graft thrives in postconflict zones <p> A ...,News,en_ginco_1,EN-GINCO,Why graft thrives in postconflict zones <p> A ...,News,News,Opinion,Please classify the following text according t...,World politics,News,"[('News', 0.9989983760338668), ('Instruction',...",News,Information/Explanation,News,News,News


In [13]:
df_test.columns

Index(['text', 'true_label', 'id', 'dataset', 'truncated_texts',
       'true_label_forChatGPT', 'gpt-3.5-turbo', 'mt0', 'text_prompts',
       'mt0_xl', 'X-GENRE', 'X-GENRE-confidence',
       'dummy-classifier-stratified', 'nb-complement', 'logistic-regression',
       'svc', 'FastText'],
      dtype='object')

In [5]:
# Let's test the performance on EN-GINCO, X-GENRE and different splits of X-GENRE
def test_splits(model):
    en_ginco_df = df_test[df_test["dataset"] == "EN-GINCO"]
    print("Performance on EN-GINCO")
    print(en_ginco_df.shape)

    y_true = en_ginco_df["true_label"].to_list()
    y_pred = en_ginco_df[model].to_list()
    labels = list(en_ginco_df["true_label"].unique())

    results_en_ginco = testing(y_true, y_pred, labels)

    x_genre_df = df_test[df_test["dataset"] != "EN-GINCO"]
    print("Performance on X-GENRE (entire)")
    print(x_genre_df.shape)

    y_true = x_genre_df["true_label"].to_list()
    y_pred = x_genre_df[model].to_list()
    labels = list(x_genre_df["true_label"].unique())

    results_x_genre = testing(y_true, y_pred, labels)

    x_genre_en_df = x_genre_df[x_genre_df["dataset"] != 'X-GENRE-test-GINCO']
    print("Performance on X-GENRE (English part)")
    print(x_genre_en_df.shape)

    y_true = x_genre_en_df["true_label"].to_list()
    y_pred = x_genre_en_df[model].to_list()
    labels = list(x_genre_en_df["true_label"].unique())

    results_x_genre_en = testing(y_true, y_pred, labels)

    x_genre_sl_df = x_genre_df[x_genre_df["dataset"] == 'X-GENRE-test-GINCO']
    print("Performance on X-GENRE (Slovene part)")
    print(x_genre_sl_df.shape)

    y_true = x_genre_sl_df["true_label"].to_list()
    y_pred = x_genre_sl_df[model].to_list()
    labels = list(x_genre_sl_df["true_label"].unique())

    results_x_genre_sl = testing(y_true, y_pred, labels)

    return {"en-ginco":results_en_ginco, "x-genre":results_x_genre, "x-genre-en":results_x_genre_en, "x-genre-sl": results_x_genre_sl}

In [6]:
en_ginco_all_results = {}
x_genre_all_results = {}
x_genre_en_all_results = {}
x_genre_sl_all_results = {}

In [None]:
for model in ['dummy-classifier-stratified', 'nb-complement', 'logistic-regression','svc', "FastText", "X-GENRE"]:
    print(model)
    result = test_splits(model)
    en_ginco_all_results[model] = result["en-ginco"]
    x_genre_all_results[model] = result["x-genre"]
    x_genre_en_all_results[model] = result["x-genre-en"]
    x_genre_sl_all_results[model] = result["x-genre-sl"]