# Preparing the dataset

Importing the necessary libraries

In [1]:
import json
import pandas as pd

In [2]:
# Import the file

with open("G:\My Drive\GitHub\Text-Representations-in-FastText\data\GINCO-1.0-suitable.json") as f:
    dataset = json.load(f)

dataset[0]

{'id': '3949',
 'url': 'http://www.pomurje.si/aktualno/sport/zimska-liga-malega-nogometa/',
 'crawled': '2014',
 'hard': False,
 'paragraphs': [{'text': 'Šport', 'duplicate': False, 'keep': True},
  {'text': 'Zimska liga malega nogometa sobota, 12.02.2011',
   'duplicate': False,
   'keep': True},
  {'text': 'avtor: Tonček Gider', 'duplicate': False, 'keep': True},
  {'text': "V 7. krogu zimske lige v malem nogometu v Križevcih pri Ljutomeru je v prvi ligi vodilni 100 plus iz Križevec izgubil s tretjo ekipo na lestvici Rock'n roll iz Križevec z rezultatom 1:2, druga na lestvici Top Finedika iz Križevec je bila poražena z ekipo Bar Milene iz Ključarovec z rezultatom 7:8. V drugi križevski ligi je vodilni Cafe del Mar iz Vučje vasi premagal Montažo Vrbnjak iz Stare Nove vasi z rezultatom 3:2.",
   'duplicate': False,
   'keep': True},
  {'text': 'oglasno sporočilo', 'duplicate': False, 'keep': True},
  {'text': 'Ocena', 'duplicate': False, 'keep': True},
  {'text': 'Komentiraj Za komenti

 ## Extract text from paragraphs into one string

 We'll create an additional parameter for each text: "dedup_text" with text from the deduplicated paragraphs only (no near-duplicates).

In [3]:
for instance in dataset:
    paragraphs = instance["paragraphs"]
    # Removing duplicates:
    paragraphs = [p for p in paragraphs if not p["duplicate"]]

    # Joining texts:
    instance_dedup_text = " <p/> ".join([p["text"] for p in paragraphs])

    # Assigning texts to a new field:
    instance["dedup_text"] = instance_dedup_text

dataset[0]

{'id': '3949',
 'url': 'http://www.pomurje.si/aktualno/sport/zimska-liga-malega-nogometa/',
 'crawled': '2014',
 'hard': False,
 'paragraphs': [{'text': 'Šport', 'duplicate': False, 'keep': True},
  {'text': 'Zimska liga malega nogometa sobota, 12.02.2011',
   'duplicate': False,
   'keep': True},
  {'text': 'avtor: Tonček Gider', 'duplicate': False, 'keep': True},
  {'text': "V 7. krogu zimske lige v malem nogometu v Križevcih pri Ljutomeru je v prvi ligi vodilni 100 plus iz Križevec izgubil s tretjo ekipo na lestvici Rock'n roll iz Križevec z rezultatom 1:2, druga na lestvici Top Finedika iz Križevec je bila poražena z ekipo Bar Milene iz Ključarovec z rezultatom 7:8. V drugi križevski ligi je vodilni Cafe del Mar iz Vučje vasi premagal Montažo Vrbnjak iz Stare Nove vasi z rezultatom 3:2.",
   'duplicate': False,
   'keep': True},
  {'text': 'oglasno sporočilo', 'duplicate': False, 'keep': True},
  {'text': 'Ocena', 'duplicate': False, 'keep': True},
  {'text': 'Komentiraj Za komenti

## Create the test-train-dev split

Here, we remove instances which don't have any text which was the result of using only the deduplicated text.

In [28]:
train = [i for i in dataset if i["split"] == "train" and len(i["dedup_text"]) != 0]
test = [i for i in dataset if i["split"] == "test" and len(i["dedup_text"]) != 0]
dev = [i for i in dataset if i["split"] == "dev" and len(i["dedup_text"]) != 0]

print("The train-dev-test splits consist of the following numbers of examples:", len(train), len(test), len(dev))

The train-dev-test splits consist of the following numbers of examples: 587 197 199


## Transform the dataset in tabular form

For labels we will first use the primary_level_2 label and for the text, we'll use the deduplicated text, as done in the initial experiments in the previous article.

In [31]:
train_df = pd.DataFrame(data=train, columns=["dedup_text", "primary_level_2"])
# Renaming columns to `text` and `labels`
train_df.columns = ["text", "labels"]

# Let's look at the beginning of the train dataframe

train_df.head()

Unnamed: 0,text,labels
0,JEDILNIK <p/> Iskalnik <p/> Poglavitni cilj pr...,Information/Explanation
1,Projekt INNOVAge in zavod Oreli <p/> Zavod Ore...,Promotion of Services
2,"V novembru, mesecu preprečevanja odvisnosti, b...",News/Reporting
3,Uvajanje moderne tehnologije in sledenje hitre...,Invitation
4,Meliso dodajajo čajem in pripravkom proti mela...,Promotion of a Product


Let's investigate the labels:

In [32]:
LABELS = train_df.labels.unique().tolist()
print(LABELS)

['Information/Explanation', 'Promotion of Services', 'News/Reporting', 'Invitation', 'Promotion of a Product', 'Forum', 'Opinion/Argumentation', 'Opinionated News', 'Instruction', 'List of Summaries/Excerpts', 'Legal/Regulation', 'Promotion', 'Other', 'Review', 'Prose', 'Announcement', 'Call', 'Recipe', 'Correspondence', 'Research Article', 'Interview']


Repeat the process with the test and dev split:

In [33]:
test_df = pd.DataFrame(data=test, columns=["dedup_text", "primary_level_2"])
test_df.columns = ["text", "labels"]
test_df.head()

Unnamed: 0,text,labels
0,"Šport <p/> Zimska liga malega nogometa sobota,...",News/Reporting
1,Selena Gomez ponudila v poslušanje novi album ...,Opinionated News
2,"Razstava,, beli šport ob zeleni reki,, <p/> Ra...",Invitation
3,Naročila sprejemam na elektronsko naslov emobi...,Other
4,V diplomskem delu je predstavljena nova vpenja...,Research Article


In [34]:
dev_df = pd.DataFrame(data=dev, columns=["dedup_text", "primary_level_2"])
dev_df.columns = ["text", "labels"]
dev_df.head()

Unnamed: 0,text,labels
0,Razno <p/> Letos so Švicarji že tridesetič pri...,Promotion of a Product
1,OBRAZLOŽITEV: (1) S tožbo terjani zneski se na...,Legal/Regulation
2,ODPLAVLJEN <p/> Z nihaji spoznanj se umirja vi...,Other
3,"Krško, 14. januar 2013 – Snežne padavine so za...",Announcement
4,Almodovar v mestu <p/> 16. 5. 2013 <p/> Zadnji...,Promotion of a Product


## Prepare the data in fastText format

For fasttext current labels won't do. Spaces will be replaced with underscores.

In [35]:
train_df["labels"] = train_df.labels.str.replace(" ", "_")
test_df["labels"] = test_df.labels.str.replace(" ", "_")
dev_df["labels"] = dev_df.labels.str.replace(" ", "_")

print(train_df["labels"].unique(), len(train_df["labels"].unique()))
print(test_df["labels"].unique(), len(test_df["labels"].unique()))
print(dev_df["labels"].unique(), len(dev_df["labels"].unique()))

['Information/Explanation' 'Promotion_of_Services' 'News/Reporting'
 'Invitation' 'Promotion_of_a_Product' 'Forum' 'Opinion/Argumentation'
 'Opinionated_News' 'Instruction' 'List_of_Summaries/Excerpts'
 'Legal/Regulation' 'Promotion' 'Other' 'Review' 'Prose' 'Announcement'
 'Call' 'Recipe' 'Correspondence' 'Research_Article' 'Interview'] 21
['News/Reporting' 'Opinionated_News' 'Invitation' 'Other'
 'Research_Article' 'Opinion/Argumentation' 'Forum' 'Correspondence'
 'Information/Explanation' 'Promotion' 'Instruction'
 'Promotion_of_a_Product' 'Interview' 'List_of_Summaries/Excerpts'
 'Promotion_of_Services' 'Review' 'Prose' 'Announcement'
 'Legal/Regulation' 'Call' 'Recipe'] 21
['Promotion_of_a_Product' 'Legal/Regulation' 'Other' 'Announcement'
 'Opinion/Argumentation' 'Opinionated_News' 'News/Reporting'
 'List_of_Summaries/Excerpts' 'Promotion_of_Services' 'Forum' 'Invitation'
 'Information/Explanation' 'Promotion' 'Recipe' 'Call' 'Review' 'Prose'
 'Research_Article' 'Interview' 'Corr

Then we create csv files with labels in one column and text in another which fastText can read.

In [36]:
train_file_content=""

for labels, text in train_df.loc[:, ["labels", "text"]].values:
  label = f"__label__{labels}"
  train_file_content += f"""{label} {text}\n"""

with open("train_fasttext.train","w") as train_file:
  train_file.write(train_file_content)

UnicodeEncodeError: 'charmap' codec can't encode character '\u010d' in position 276: character maps to <undefined>

We take a look at the created file.

In [24]:
train_file = open("train_fasttext.train","r").read()

train_file

FileNotFoundError: [Errno 2] No such file or directory: 'train_fasttext.txt'

We repeat this process to create test and dev file.

In [25]:
test_file_content=""

for labels, text in test_df.loc[:, ["labels", "text"]].values:
  label = f"__label__{labels}"
  test_file_content += f"""{label} {text}\n"""

with open("test_fasttext.test","w") as test_file:
  test_file.write(test_file_content)

dev_file_content=""

for labels, text in dev_df.loc[:, ["labels", "text"]].values:
  label = f"__label__{labels}"
  dev_file_content += f"""{label} {text}\n"""

with open("dev_fasttext.valid","w") as dev_file:
  dev_file.write(dev_file_content)

UnicodeEncodeError: 'charmap' codec can't encode character '\u010d' in position 97: character maps to <undefined>

# Train a fastText model on GINCO

Resources:
* the Text Classification with Fasttext tutorial (https://github.com/mpuig/textclassification/blob/master/notebook.ipynb)
* Peter's code (https://github.com/5roop/task5_webgenres/)

Importing the necessary libraries

In [None]:
!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip v0.9.2.zip

In [None]:
%cd fastText-0.9.2

!pip install .

In [None]:
import fasttext as ft
import parse
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

## Input the data:

In [None]:
FT_train_file = ""
FT_test_file = ""
FT_dev_file = ""

## Training

In our initial article (LREC), the following hyperparameters were used:
* 200 epochs

We reported the following results (on 21 labels, deduplicated data, 200 epochs, no embeddings): micro 0.352, macro 0.217

In [None]:
def parse_test_file(path: str):
    """Reads fasttext formatted file and returns labels, texts."""
    with open(path, "r") as f:
        content = f.readlines()
    pattern = "{label} {text}\n"
    p = parse.compile(pattern)

    labels, texts = list(), list()
    for line in content:
        rez = p.parse(line)
        if rez is not None:
            labels.append(rez["label"])
            texts.append(rez["text"])
        else:
            print("error parsing line ", line)
    return labels, texts

def prediction_to_label(prediction):
    """Transforms predictions as returned by fasttext into pure labels."""
    return np.array(prediction[0])[:, 0]

def plot_cm(y_true, y_pred, labels, save=False, title=None):
    cm = confusion_matrix(y_true, y_pred, labels=labels, )
    plt.figure(figsize=(9, 9))
    plt.imshow(cm, cmap="Oranges")
    classNames = labels
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classNames))
    plt.xticks(tick_marks, classNames, rotation=90)
    plt.yticks(tick_marks, classNames)
    microF1 = f1_score(y_true, y_pred, labels=labels, average ="micro")
    macroF1 = f1_score(y_true, y_pred, labels=labels, average ="macro")

    print(f"{microF1=:0.4}")
    print(f"{macroF1=:0.4}")

    metrics = f"{microF1=:0.4}, {macroF1=:0.4}"
    if title:
        plt.title(title +";\n" + metrics)
    else:
        plt.title(metrics)
    plt.tight_layout()
    if save:
        plt.savefig(save)
    plt.show()
    return microF1, macroF1

### Finding optimal hyperparameters

#### Training
Let's experiment with the following parameters (testing on dev file):
* epochs (default: 5, standard range [5 - 50])
* lr (Good values of the learning rate are in the range 0.1 - 1.0)
* using ngrams (standard range [1 - 5]): wordNgrams=2 (3,4, ...)

In [None]:
for i in range(5):
    # Training:
    model = fasttext.train_supervised(input=FT_train_file,
                                            epoch=200,
                                            #lr=
                                            #ngrams=
                                            )

    # Testing:

    # Parse the test files so that labels and texts are separated
    y_true, y_texts = parse_test_file(FT_dev_file)

    # Evaluate te model on test data
    y_pred = model.predict(y_texts)
    y_pred = prediction_to_label(y_pred)

    # Plot the confusion matrix:
    m, M = plot_cm(y_true, y_pred, LABELS, save=False, title=f"Unrestricted dataset, run {i}")

    # Record the results:
    rezdict = dict(
        microF1=m,
        macroF1=M,
        y_true= y_true,
        y_pred=y_pred,
        train="fasttext_dd_lemma_vec"
    )
    results.append(rezdict)

    # Record the micro and macro F1 scores:
    #micros.append(m)
    #macros.append(M)

### Autotuning

To find the optimal hyperparameters, I will use the automatic hyperparameter optimization, provided by FastText, done on the dev file.

#### Finding the optimal duration for hyperparameter optimisation

 By default, the search will take 5 minutes. You can set the timeout in seconds with the -autotune-duration argument. While autotuning, fastText displays the best f1-score found so far. If we decide to stop the tuning before the time limit, we can send one SIGINT signal (via CTLR-C for example). FastText will then finish the current training, and retrain with the best parameters found so far.

In [None]:
model = fasttext.train_supervised(input=FT_train_file, 
                                            autotuneValidationFile=FT_dev_file,
                                            #Searching for the best hyperparameters for 20 minutes: 
                                            autotuneDuration=60*15
                                            )

#### Training

In [None]:
micros, macros = list(), list()
results = list()

for i in range(5):
    # Training:
    model = fasttext.train_supervised(input=FT_train_file, 
                                            autotuneValidationFile=FT_dev_file,
                                            #Searching for the best hyperparameters for 15 minutes: 
                                            autotuneDuration=60*15,
                                            epoch=200,
                                            #dim=100
                                            )

    # Testing:

    # Parse the test files so that labels and texts are separated
    y_true, y_texts = parse_test_file(FT_test_file)

    # Evaluate te model on test data
    y_pred = model.predict(y_texts)
    y_pred = prediction_to_label(y_pred)

    # Plot the confusion matrix:
    m, M = plot_cm(y_true, y_pred, LABELS, save=False, title=f"Unrestricted dataset, run {i}")

    # Record the results:
    rezdict = dict(
        microF1=m,
        macroF1=M,
        y_true= y_true,
        y_pred=y_pred,
        train="fasttext_dd_lemma_vec"
    )
    results.append(rezdict)

    # Record the micro and macro F1 scores:
    #micros.append(m)
    #macros.append(M)

np.average(micros)

Save the results:

with open("backup_24_2.txt", "w") as f:
    import json
    for i in results:
        i["y_pred"] = i["y_pred"].tolist()
    json.dump(results, f)

Info to save the model:
model_dir = "/content/text_classification/models"
GINCO_output = "/content/text_classification/data"

GINCO_model = ft.train_supervised(GINCO_input, lr=1.0, epoch=10)

Code for using only lowercase:
splitted_chunks = data.split()
lowered_chunks = (item.lower() for item in splitted_chunks)
return " ".join(list(chunks_without_punctuation))