# Learning on various text representations - shortened

This notebook was used to run all of the experiments, based on the functions defined in the utility cell at the beginning of the file. To inspect the step-by-step procedure, see the *3-Learning-On-Various_Representations.ipynb* file.

## Preparing the dataset for FastText

Importing the necessary libraries

In [1]:
!pip install parse

In [2]:
import json
import re
from copy import deepcopy
from string import digits, punctuation

import fasttext as ft
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import parse
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from tqdm import tqdm

Add all utility scripts:

In [3]:
# Creating FastText train and test files


def fastText_files(representation):
    """
    Create and save the FastText train and test files for one text representation.

    The function reads the module-level ``train`` and ``test`` splits (the dev
    split is not used here), pairs the chosen text representation with the
    "primary_level_3" label and writes one example per line in the form
    ``__label__<label> <text>``. A preview (first 1000 characters) of each
    created file is printed.

    Possible representations: 'baseline_text', 'lemmas',
    'upos', 'xpos', 'ner', 'dependency', 'lowercase', 'lowercase_nopunctuation'

    Args:
        representation (str): the name of the key (from the dataset)
                                of the text representation we want to use

    Returns:
        list: a list of the following elements:
            - labels - FastText-formatted labels ("__label__<label>") that occur
              in the train split; they can be used for prediction and evaluation
            - train file path ("<representation>-fasttext.train")
            - test file path ("<representation>-fasttext.test")
    """
    # First create the dataframes from each split:
    train_df = _split_to_dataframe(train, representation)
    test_df = _split_to_dataframe(test, representation)

    print("The shape of the dataframes:")
    print(train_df.shape, test_df.shape)

    # Then create the files which FastText can read
    train_path = representation + "-fasttext.train"
    _write_fasttext_file(train_df, train_path)
    print("Created train file:")
    print(_read_preview(train_path))

    test_path = representation + "-fasttext.test"
    _write_fasttext_file(test_df, test_path)
    print("Created test file:")
    print(_read_preview(test_path))

    # Finally, create a list of labels which can be used for prediction and evaluation.
    train_labels = train_df["labels"].unique().tolist()

    # Count the labels of both splits (reported only; the returned list is train-based).
    all_df_labels = list(train_labels)
    for label in test_df["labels"].unique().tolist():
        if label not in all_df_labels:
            all_df_labels.append(label)

    print(f"Number of all labels: {len(all_df_labels)}")

    # Create a final list of labels in a FastText-appropriate format:
    LABELS = [f"__label__{label}" for label in train_labels]

    return_list = [LABELS, train_path, test_path]
    print(f"The function returned the following list: {return_list}")

    return return_list


def _split_to_dataframe(split, representation):
    """Build a two-column dataframe (`text`, `labels`) from a list of instances."""
    frame = pd.DataFrame(data=split, columns=[representation, "primary_level_3"])
    frame.columns = ["text", "labels"]
    return frame


def _write_fasttext_file(frame, path):
    """Write one `__label__<label> <text>` line per row of `frame` to `path`."""
    lines = [
        f"__label__{label} {text}\n"
        for label, text in frame.loc[:, ["labels", "text"]].values
    ]
    # Write explicitly as UTF-8 so the result does not depend on the platform default.
    with open(path, "w", encoding="utf-8") as fasttext_file:
        fasttext_file.writelines(lines)


def _read_preview(path, size=1000):
    """Return the first `size` characters of the file, closing it afterwards."""
    with open(path, "r", encoding="utf-8") as fasttext_file:
        return fasttext_file.read(size)

# Parsing test file
def parse_test_file(path: str):
    """Read a FastText-formatted file and return its labels and texts.

    Every line is matched against the pattern ``{label} {text}`` followed by
    a newline. Lines that do not match are reported with a printed message
    and skipped.

    Args:
        path (str): path to the FastText-formatted file.

    Returns:
        tuple: ``(labels, texts)`` - two lists of strings of equal length.
    """
    pattern = "{label} {text}\n"
    p = parse.compile(pattern)

    labels, texts = [], []
    # Read explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # The pattern requires a trailing newline, which the last line
            # of a file may lack; add it so that line is not rejected.
            if not line.endswith("\n"):
                line += "\n"
            rez = p.parse(line)
            if rez is not None:
                labels.append(rez["label"])
                texts.append(rez["text"])
            else:
                print("error parsing line ", line)
    return labels, texts

def prediction_to_label(prediction):
    """Return the first label of every prediction as a one-dimensional array.

    Only ``prediction[0]`` (the label sequences) is used; it is converted to
    an array and its first column is returned.
    """
    label_matrix = np.array(prediction[0])
    first_labels = label_matrix[:, 0]
    return first_labels

def train_FastText(representation):
    """
    Perform five runs of training and evaluation of a FastText model.

    The function relies on module-level names that must exist before the call:
    FT_train_file and FT_test_file (paths of the FastText files), LABELS (the
    list of FastText-formatted labels) and final_results (a list to which the
    result dictionary of every run is appended).

    For each run it trains a model, predicts the texts of the test file,
    prints the F1 score per label, and plots a confusion matrix which is
    shown and also saved under the plot title as the file name. After the
    runs it prints the mean and standard deviation of the micro and macro F1.

    Args:
        representation (str): the name of the key (from the dataset)
                                of the text representation we want to use;
                                used in the plot title and in the results.
    """
    results = []

    # Parse the test file so that labels and texts are separated. The file does
    # not change between runs, so this is done once instead of in every run.
    y_true, y_texts = parse_test_file(FT_test_file)

    for i in range(5):
        model = ft.train_supervised(input=FT_train_file,
                                    epoch = 350,
                                    lr = 0.7,
                                    wordNgrams=1,
                                    verbose = 2,
                                    # For the final experiments, we set the ws (context window to 0)
                                    #ws = 0
                                   )

        # Evaluate the model on test data
        y_pred = prediction_to_label(model.predict(y_texts))

        # Plot the confusion matrix:
        cm = confusion_matrix(y_true, y_pred, labels=LABELS)
        fig = plt.figure(figsize=(9, 9))
        plt.imshow(cm, cmap="Oranges")
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        tick_marks = np.arange(len(LABELS))
        plt.xticks(tick_marks, LABELS, rotation=90)
        plt.yticks(tick_marks, LABELS)

        m = f1_score(y_true, y_pred, labels=LABELS, average ="micro")
        M = f1_score(y_true, y_pred, labels=LABELS, average ="macro")
        score_per_label = f1_score(y_true, y_pred, labels=LABELS, average=None)

        # Scores are returned in the order of LABELS, so pair them up directly.
        dict_score_per_label = dict(zip(LABELS, score_per_label))

        print(f"Score per labels: {dict_score_per_label}")

        metrics = f"{m:0.4}, {M:0.4}"
        title = f"Representation: {representation}, Run: {i}"
        plt.title(title +";\n" + metrics)
        plt.tight_layout()
        plt.savefig(title)
        plt.show()
        # Release the figure so that repeated runs do not accumulate open figures.
        plt.close(fig)

        rezdict = {
            "microF1": m,
            "macroF1": M,
            "label_scores": dict_score_per_label,
            "run": i,
            "experiment": f"{representation}",
        }
        results.append(rezdict)
        final_results.append(rezdict)

    # Calculate the average micro and macro F1 for the 5 runs:
    mi = np.array([run["microF1"] for run in results])
    ma = np.array([run["macroF1"] for run in results])

    print(f"micro F1: {mi.mean():0.03} +/- {mi.std():0.02}")
    print(f"macro F1: {ma.mean():0.03} +/- {ma.std():0.02}")

In [4]:
# Import the file with additional text representations (only the paragraphs marked to be kept
# in the original corpus are included)

# Read the JSON file explicitly as UTF-8 instead of relying on the platform default encoding.
with open("/kaggle/input/ginco-with-additional-text-representations/Language-Processed-GINCO.json", encoding="utf-8") as f:
    dataset = json.load(f)

In [5]:
# Inspect the keys of the first instance to see what is stored for each text.
dataset[0].keys()

Let's create a preprocessed representation that also has the stopwords removed.

In [26]:
# Translation table that deletes punctuation and digits. It does not depend on
# the instance, so it is built once instead of in every loop iteration.
# NOTE(review): string.punctuation contains only ASCII punctuation, so other
# marks (e.g. typographic quotes or dashes) stay in the text - confirm this is intended.
chars_to_remove = str.maketrans('', '', punctuation + digits)

for instance in tqdm(dataset):
    # split text into tokens by white space and remove punctuation and digits
    # from each token (a token that becomes empty still takes part in the join)
    cleaned_tokens = [word.translate(chars_to_remove) for word in instance['lowercase'].split()]

    # add a new key with lowercase text with punctuation and digits removed
    instance["lowercase_nopunct_nodigits"] = " ".join(cleaned_tokens)

dataset[-1]

In [48]:
# Import the file with stopwords and create a list from them (one stopword per line).
# The file is read explicitly as UTF-8 so that non-ASCII characters in the stopwords
# are decoded the same way on every platform.
with open("/kaggle/input/slovene-stopwords/Slovene-stopwords.txt", "r", encoding="utf-8") as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

# A set gives fast membership tests when the texts are filtered.
stopwords_set = set(stopwords)

print(stopwords_set)
# The two numbers differ if the file contains duplicate entries.
print(len(stopwords), len(stopwords_set))

In [60]:
# Remove stopwords

for instance in tqdm(dataset):
    # go through the white-space separated tokens of the cleaned text
    # and keep only those that are not in the stopword set
    kept_words = []
    for word in instance['lowercase_nopunct_nodigits'].split():
        if word in stopwords_set:
            continue
        kept_words.append(word)

    # add a new key with preprocessed text
    instance["preprocessed_text"] = " ".join(kept_words)

dataset[-1]

### Pre-processing dataset

Here, we'll create additional representations which consist of lemmas of selected PoS classes only.

In [6]:
def only_specific_word_types(key_name, tag):
    """
    Add a representation (of lemmas) that contains just the desired word types.

    For every instance of the module-level ``dataset`` the function goes through
    ``instance["representation_list"]``: the first element of a token entry is
    kept if the second element is one of the requested tags, otherwise the
    token is replaced by the placeholder "O". The result is joined with spaces
    and stored in the instance under ``key_name``.

    Args:
        key_name (string) = name of the new key with the created representation
        tag (str or list) = specify in which tag you're interested in (e.g. NOUN, PUNCT, ADJ ...)
            If you're interested in multiple tags, tag can be a list: ["NOUN", "PUNCT"]
    """
    # A single tag given as a string has to be compared as a whole:
    # `token[1] in "AUX"` would be a substring test and would also accept "X".
    wanted_tags = (tag,) if isinstance(tag, str) else tuple(tag)

    for instance in tqdm(dataset):
        list_new_representation = [
            token[0] if token[1] in wanted_tags else "O"
            for token in instance["representation_list"]
        ]

        instance[key_name] = " ".join(list_new_representation)

    print("New representation added :)")

In [7]:
# NOTE(review): this call passes exactly the same tag list as the "only_stopwords" call,
# and only_specific_word_types keeps the tokens whose tag is in the list, so the
# "stopwords_removed" key ends up identical to "only_stopwords" - confirm whether the
# complementary set of tags was intended here.
only_specific_word_types("stopwords_removed", ["ADP", "AUX", "CCONJ", "SCONJ", "DET", "NUM", "PART", "PRON"])

In [8]:
# Keep only tokens tagged with one of the listed classes; every other token becomes "O".
only_specific_word_types("only_stopwords", ["ADP", "AUX", "CCONJ", "SCONJ", "DET", "NUM", "PART", "PRON"])

In [9]:
# Keep only adjectives, adverbs and particles; every other token becomes "O".
only_specific_word_types("only_subjective_classes", ["ADJ", "ADV", "PART"])

In [10]:
# Keep only proper nouns, nouns and verbs; every other token becomes "O".
only_specific_word_types("only_nouns_verbs", ["PROPN", "NOUN", "VERB"])

### Downcasting number of labels

In these experiments, we will not use all of the texts but only texts from 5 main categories, meaning that some categories will be merged into them, whereas some categories with a very small frequency will be discarded. Additionally, the texts marked as hard will be discarded (see notebook *1-Preparing_Data_Hyperparameter_Search*).

We will start with a reduced set of labels (primary_level_3), then merge News and Opinionated News, and discard some of the labels.

In [61]:
# merge News and Opinionated News:
# both original labels are replaced by the single label "News" (the dataset is changed in place)
for i in dataset:
    if i["primary_level_3"] in ("Opinionated News", "News/Reporting"):
        i["primary_level_3"] = "News"

Let's create a train:test:dev split that contains only the wanted labels.

In [62]:
downcasted_labels = ['Information/Explanation', 'Promotion', 'News', 'Forum', 'Opinion/Argumentation']


def _select_split(split_name):
    """Return the instances of one split that have a wanted label and are not marked as hard."""
    return [
        i for i in dataset
        if i["split"] == split_name and i["primary_level_3"] in downcasted_labels and not i["hard"]
    ]


train = _select_split("train")
test = _select_split("test")
dev = _select_split("dev")

# The numbers are printed in the order train, test, dev, so the message names them in that order.
print("The train-test-dev splits consist of the following numbers of examples:", len(train), len(test), len(dev))

In [63]:
# Total number of texts that remain in the three splits after filtering.
print(f"Number of all texts is {len(train)+len(test)+len(dev)}")

### Creating FastText texts

Use the function fastText_files(representation) from the utility functions.

This function creates and saves the test and train file
    from the test, train and dev split of the dataset (named test, dev and train),
    using the "primary_level_3" level labels, and the chosen text representation.

In [65]:
# List the keys of the first instance; the representation chosen below must be one of them.
dataset[0].keys()

In [66]:
# Key of the text representation used for the FastText files, the plots and the results below.
current_representation = 'preprocessed_text'

In [67]:
# Collector for the result dictionaries; train_FastText appends one entry per run.
final_results = []

In [68]:
# Create the FastText files; the returned list holds [labels, train file path, test file path].
representation = fastText_files(current_representation)

# Train a fastText model

In [69]:
# Define the label list (first element returned by fastText_files);
# train_FastText reads it as a module-level name.
LABELS = representation[0]

LABELS

Input the data:

In [70]:
# Paths of the created train and test files; train_FastText reads them as module-level names.
FT_train_file = representation[1]
FT_test_file = representation[2]

Use the `train_FastText(representation)` function defined in the utility cell at the beginning of the notebook.

In [71]:
# Run the five training/evaluation runs; the results are appended to final_results.
train_FastText(current_representation)

In [72]:
# Number of collected result dictionaries (one per run).
len(final_results)

In [73]:
# Show the first and the last collected result.
print(final_results[0])
print(final_results[-1])

In [76]:
# Load the results of the previous experiments; the context manager closes
# the file once it has been read.
with open("/kaggle/input/fasttextrepresentationsresults/FastTextExperimentsResults-all-representations-updated.json", encoding="utf-8") as previous_results_file:
    previous_final_results = json.load(previous_results_file)

len(previous_final_results)

In [77]:
# Add the results of the current representation to the previously collected ones.
# NOTE: running this cell again appends the same results a second time.
previous_final_results.extend(final_results)

len(previous_final_results)

In [78]:
def average_label_scores(representation):
    """
    Average the per-label scores of one experiment over all of its runs.

    The function goes through the module-level ``previous_final_results`` list,
    selects the entries whose "experiment" value equals ``representation`` and
    reads their "label_scores" for the five labels.

    Returns:
        list: [averages, stds] - two lists with one value per label, in the order
        Information/Explanation, Promotion, News, Forum, Opinion/Argumentation.
    """
    label_keys = [
        "__label__Information/Explanation",
        "__label__Promotion",
        "__label__News",
        "__label__Forum",
        "__label__Opinion/Argumentation",
    ]

    # All runs that belong to the chosen representation.
    matching_runs = [run for run in previous_final_results if run["experiment"] == representation]

    # One array of scores (one value per run) for every label.
    scores_per_label = [
        np.array([run["label_scores"][key] for run in matching_runs])
        for key in label_keys
    ]

    list_of_averages = [scores.mean() for scores in scores_per_label]
    list_of_stds = [scores.std() for scores in scores_per_label]

    return [list_of_averages, list_of_stds]

In [80]:
# Per-label averages and standard deviations of the runs stored under "baseline_text".
baseline_label_scores = average_label_scores("baseline_text")
baseline_label_scores

In [81]:
# The same statistics for the representation used in this notebook run.
current_representation_label_scores = average_label_scores(current_representation)
current_representation_label_scores

In [82]:
# Label names for the x axis; the order matches the order of the values
# returned by average_label_scores.
labels_names = ['Information/Explanation', 'Promotion', 'News', 'Forum', 'Opinion/Argumentation']

fig, ax = plt.subplots(figsize=(4,4), dpi=200)
# Mean score per label with the standard deviation as the error bar,
# once for the baseline and once for the current representation.
ax.errorbar(labels_names, baseline_label_scores[0], yerr=baseline_label_scores[1], label="baseline", capsize=3)
ax.errorbar(labels_names, current_representation_label_scores[0], yerr=current_representation_label_scores[1], label=current_representation, capsize=3)
# Small font so that the label names fit on the x axis.
plt.xticks(fontsize=6)
ax.set_xlabel('Labels')
ax.set_ylabel('F1 Scores')
ax.legend(loc="lower right")
# Save the figure before showing it.
plt.savefig(f"baseline-versus-{current_representation}-label-scores.png")
plt.show()

In [83]:
# Save the file with updated results (previous results plus the runs of this notebook).
# indent="" puts every item on its own line without any indentation.
with open("FastTextExperimentsResults-all-representations-updated_stopwords_added.json", "w") as results_file:
    json.dump(previous_final_results,results_file, indent= "")