In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [None]:
# Install and import fastText
!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip v0.9.2.zip


In [10]:
import zipfile
with zipfile.ZipFile("v0.9.2.zip", 'r') as zip_ref:
    zip_ref.extractall("v0.9.2")

In [None]:
%cd v0.9.2
%cd fastText-0.9.2

!pip install .

In [13]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import parse
import fasttext as ft

## Prepare the datasets for training and evaluation with the fastText model

In [3]:
# Load the X-GENRE datasets from Hugging Face
train = load_dataset("TajaKuzman/X-GENRE-text-genre-dataset", "train")


# To open them as Pandas DataFrame:
df_train = pd.DataFrame(train["train"])

print(df_train.shape)

(1772, 4)


In [4]:
# Load the test datasets from the GitHub repositories (access to them is obtained by request to the AGILE repository owner)

en_ginco = pd.read_json("../../datasets/EN-GINCO-test-dataset/EN-GINCO.jsonl", lines=True)
x_ginco = pd.read_json("../../datasets/X-GINCO-test-set/X-GINCO.jsonl", lines=True)

print(en_ginco.shape, x_ginco.shape)

(272, 4) (790, 6)


In [5]:
x_ginco.head(3)

Unnamed: 0,text,labels,language,dataset,text_id,translation
0,"Angelo Chetcuti, se jkun qed jieħu post Bjorn ...",News,Maltese,MaCoCu,macocu.mt.402244,"Angelo Chetcuti, will be replacing Bjorn Vassa..."
1,Poltergeist jirreferi għal fenomeni oħra tal-m...,Opinion/Argumentation,Maltese,MaCoCu,macocu.mt.377203,"Poltergeist refers to other woman's phenomena,..."
2,Chrysler: Brand ta 'lussu jew le? \n\nBrand ji...,Opinion/Argumentation,Maltese,MaCoCu,macocu.mt.109995,Chrysler: Luxury brand or not?\n\nBrand moves ...


In [6]:
df_train.head(3)

Unnamed: 0,text,labels,dataset,language
0,"Seeking All Things Brilliant ""I want people to...",Other,CORE,English
1,Meet Orchid du Bois I first met Hayley Mowday ...,Other,CORE,English
2,Abstract Objective: Reporting bias due to soci...,Information/Explanation,CORE,English


In [None]:
# Creating FastText train and test files

def fastText_files(df_train, x_ginco, en_ginco):
    """
    This function creates and saves the test and train file(s).
    
    The function returns a list of the following elements:
        - labels - which can be used for prediction and evaluation.
        - train file path
        - test file path
    """
    x_ginco = x_ginco[["text", "labels"]]
    en_ginco = en_ginco[["text", "labels"]]
    
    # Assure that the text contains no new lines
    x_ginco["text"] =  [text.replace("\n", "") for text in x_ginco.text.to_list()]
    en_ginco["text"] =  [text.replace("\n", "") for text in en_ginco.text.to_list()]
    df_train["text"] =  [text.replace("\n", "") for text in df_train.text.to_list()]

    print("The shape of the dataframes:")
    print(df_train.shape, x_ginco.shape, en_ginco.shape)
    
    # Then create CSV files which FastText can read
    
    train_file_content=""

    for labels, text in df_train.loc[:, ["labels", "text"]].values:
        label = f"__label__{labels}"
        train_file_content += f"""{label} {text}\n"""
    
    train_path = ""
    train_path = "data/x-genre-fasttext.train"

    with open(train_path,"w") as train_file:
        train_file.write(train_file_content)
    
    train_example = open(train_path,"r").readlines()
    print("Created train file:")
    print(train_example[:2])
    print("Number of lines: {}".format(len(train_example)))
    
    test_file_content_en=""
    
    for labels, text in en_ginco.loc[:, ["labels", "text"]].values:
        label = f"__label__{labels}"
        test_file_content_en += f"""{label} {text}\n"""
    
    test_path_en = ""
    test_path_en = "data/test-file-fasttext-en-ginco.test"
    
    with open(test_path_en,"w") as test_file_en:
        test_file_en.write(test_file_content_en)
    
    test_example = open(test_path_en,"r").readlines()
    print("Created test file:")
    print(test_example[:2])
    print("Number of lines: {}".format(len(test_example)))

    test_file_content_x=""
    
    for labels, text in x_ginco.loc[:, ["labels", "text"]].values:
        label = f"__label__{labels}"
        test_file_content_x += f"""{label} {text}\n"""
    
    test_path_x = ""
    test_path_x = "data/test-file-fasttext-x-ginco.test"
    
    with open(test_path_x,"w") as test_file_x:
        test_file_x.write(test_file_content_x)
    
    test_example = open(test_path_x,"r").readlines()
    print("Created test file:")
    print(test_example[:2])
    print("Number of lines: {}".format(len(test_example)))

    
    # Finally, create a list of labels which can be used for prediction and evaluation.
    # Let's inspect the labels:
    all_df_labels = df_train["labels"].unique().tolist()
    
    for i in x_ginco["labels"].unique().tolist():
        if i not in all_df_labels:
            all_df_labels.append(i)
    
    for i in en_ginco["labels"].unique().tolist():
        if i not in all_df_labels:
            all_df_labels.append(i)

    print(f"Number of all labels: {len(all_df_labels)}")
    
    # Create a final list of labels in a FastText-appropriate format:
    LABELS = df_train.labels.unique().tolist()
    LABELS = [f"__label__{i}" for i in LABELS]
    
    return_list = [LABELS, train_path, test_path_en, test_path_x]
    print(f"The function returned the following list: {return_list}")
    
    return return_list

fasttext_dict = fastText_files(df_train, x_ginco, en_ginco)

print(fasttext_dict)

In [14]:
def prediction_to_label(prediction):
    """Transforms predictions as returned by fasttext into pure labels."""
    return np.array(prediction[0])[:, 0]

# Parsing test file
def parse_test_file(path: str):
    """Reads fasttext formatted file and returns labels, texts."""
    with open(path, "r") as f:
        content = f.readlines()
    pattern = "{label} {text}\n"
    p = parse.compile(pattern)

    labels, texts = list(), list()
    for line in content:
        rez = p.parse(line)
        if rez is not None:
            labels.append(rez["label"])
            texts.append(rez["text"])
        else:
            print("error parsing line ", line)
    return labels, texts

Train the model

In [19]:
# Train fasttext model   
model = ft.train_supervised(input="data/x-genre-fasttext.train",
                            epoch = 350,
                            wordNgrams=1,
                            verbose = 2
                                        )

Read 1M words
Number of words:  163798
Number of labels: 9
Progress: 100.0% words/sec/thread:  146863 lr:  0.000000 avg.loss:  0.247710 ETA:   0h 0m 0s  3.5% words/sec/thread:  149828 lr:  0.096489 avg.loss:  2.010013 ETA:   0h 0m17s


Apply the model

In [20]:
def predict(df_test_name):
	import json

	test_file_paths = {
		"en-ginco": "data/test-file-fasttext-en-ginco.test",
		"x-ginco": "data/test-file-fasttext-x-ginco.test"
	}
	
	# Parse the test files so that labels and texts are separated
	y_true, y_texts = parse_test_file(test_file_paths[df_test_name])

	# Evaluate the model on test data
	y_pred = model.predict(y_texts)
	y_pred = prediction_to_label(y_pred)

	y_pred = [x.replace("__label__", "") for x in y_pred]

	# Create a json with results

	current_results = {
		"system": "fastText",
		"predictions": [
			{
			"train": "X-GENRE (train split)",
			"test": "{}".format(df_test_name),
			"predictions": y_pred,
			}
		]
		}

	# Save the results as a new json
	with open("submissions/submission-{}-{}.json".format("fastText", df_test_name), "w") as file:
		json.dump(current_results, file)

	print("Classification with {} on {} finished.".format("fastText", df_test_name))

In [21]:
predict("x-ginco")

Classification with fastText on x-ginco finished.


In [22]:
predict("en-ginco")

Classification with fastText on en-ginco finished.
