# Preparing the dataset

Importing the necessary libraries

In [1]:
import json
import pandas as pd

In [2]:
# Import the file

# JSON text is UTF-8 by specification (RFC 8259), so the encoding is pinned here
# instead of relying on the platform's locale default.
with open(
    "/kaggle/input/genre-identification-corpus-ginco-10/GINCO-1.0-suitable.json",
    encoding="utf-8",
) as f:
    dataset = json.load(f)

# Show the first instance to inspect the available fields.
dataset[0]

 ## Extract text from paragraphs into one string

We'll add a new field to each text, "keep_text", which joins its paragraphs into one string. It contains the non-duplicate paragraphs as well as the duplicate paragraphs that are marked as "keep" (duplicates that are nevertheless useful for genre identification).

We'll use the "keep" texts instead of the deduplicated ones, so that more text is available for further analysis.

In [3]:
for instance in dataset:
    # Collect the text of every paragraph whose "keep" flag is truthy,
    # preserving the original paragraph order.
    kept_texts = [par["text"] for par in instance["paragraphs"] if par["keep"]]

    # Store the kept paragraphs as one string, separated by the " <p/> " marker.
    instance["keep_text"] = " <p/> ".join(kept_texts)

# Inspect the first instance, which now carries the new "keep_text" field.
dataset[0]

## Transform the dataset in tabular form

To get better results, we'll use a smaller number of labels - primary_level_3, which uses the downcasted set of 12 labels.

In [4]:
# Build a two-column table from the instance dictionaries and rename the
# columns to `text` and `labels` in the same expression.
dataframe = pd.DataFrame(
    data=dataset, columns=["keep_text", "primary_level_3"]
).rename(columns={"keep_text": "text", "primary_level_3": "labels"})

# (number of texts, number of columns)
dataframe.shape

We can see that by using the keep parameter, we keep all texts.

Let's analyse the distribution of labels.

In [5]:
# Group the texts by label; the size of each group is the number of texts
# carrying that label.
df_labels = dataframe.groupby(by="labels")
label_counts = df_labels.size()
label_counts

Let's discard categories that are not present enough: Interview (8), Legal/Regulation (17), Announcement (17), Instruction (44).

Let's also discard Other (75) and List of Summaries/Excerpts (106) which have characteristics of multiple genres.

Let's merge Opinionated News and News/Reporting into News.

In [6]:
# Both news-like categories are collapsed into the single label "News".
for i in dataset:
    if i["primary_level_3"] in ("Opinionated News", "News/Reporting"):
        i["primary_level_3"] = "News"

Let's also discard texts that are marked as "Hard".

In [7]:
# Labels that are retained; instances with any other label are dropped.
downcasted_labels = ['Information/Explanation', 'Promotion', 'News', 'Forum', 'Opinion/Argumentation']


def _select_split(split_name):
    """Return the instances of one split that have a retained label and are not marked hard."""
    return [
        i for i in dataset
        if i["split"] == split_name and i["primary_level_3"] in downcasted_labels and not i["hard"]
    ]


train_downcasted = _select_split("train")
test_downcasted = _select_split("test")
dev_downcasted = _select_split("dev")

# The counts are listed in the order the message announces: train, dev, test.
print("The train-dev-test splits consist of the following numbers of examples:", len(train_downcasted), len(dev_downcasted), len(test_downcasted))

In [8]:
# Sum the sizes of the three filtered splits.
total_texts = sum(len(split) for split in (train_downcasted, test_downcasted, dev_downcasted))
print(f"Number of all texts is {total_texts}")

In [9]:
# Keep only the text and label fields of the train instances and name the
# columns `text` and `labels`.
train_df = pd.DataFrame(
    data=train_downcasted, columns=["keep_text", "primary_level_3"]
).rename(columns={"keep_text": "text", "primary_level_3": "labels"})

# Preview the first rows of the train dataframe.
train_df.head()

Repeat the process with the test and dev split:

In [10]:
# Keep only the text and label fields of the test instances and name the
# columns `text` and `labels`.
test_df = pd.DataFrame(
    data=test_downcasted, columns=["keep_text", "primary_level_3"]
).rename(columns={"keep_text": "text", "primary_level_3": "labels"})

# Preview the first rows of the test dataframe.
test_df.head()

In [20]:
# Keep only the text and label fields of the dev instances and name the
# columns `text` and `labels`.
dev_df = pd.DataFrame(
    data=dev_downcasted, columns=["keep_text", "primary_level_3"]
).rename(columns={"keep_text": "text", "primary_level_3": "labels"})

# Preview the first rows of the dev dataframe.
dev_df.head()

In [12]:
# Shapes are printed in the order train, dev, test.
print(*(frame.shape for frame in (train_df, dev_df, test_df)))

Let's inspect the labels:

In [21]:
# Start from the labels seen in the train split, then append any label that
# only occurs in the test or dev split, in that order of discovery.
all_df_labels = train_df["labels"].unique().tolist()
for split_df in (test_df, dev_df):
    for split_label in split_df["labels"].unique().tolist():
        if split_label not in all_df_labels:
            all_df_labels.append(split_label)

print(all_df_labels, len(all_df_labels))

## Prepare the data in fastText format

We create plain-text files in which every line holds one example: the label, prefixed with `__label__`, followed by a space and the text. This is the input format that fastText reads.

In [14]:
# The file holds one example per line, so line breaks inside a text are replaced
# with spaces; otherwise the remainder of the text would end up on a separate
# line without a label.
train_lines = [
    f"__label__{labels} " + str(text).replace("\r", " ").replace("\n", " ")
    for labels, text in train_df.loc[:, ["labels", "text"]].values
]

# Joining once avoids repeated string concatenation; every line, including the
# last one, is terminated with "\n".
train_file_content = "".join(line + "\n" for line in train_lines)

with open("train_fasttext.train", "w") as train_file:
    train_file.write(train_file_content)

We take a look at the created file.

In [15]:
# Read only the first 1000 characters for a preview; the context manager makes
# sure the file handle is closed right after reading.
with open("train_fasttext.train", "r") as preview_handle:
    train_file = preview_handle.read(1000)

train_file

We repeat this process to create the test and dev files.

In [16]:
# The file holds one example per line, so line breaks inside a text are replaced
# with spaces; otherwise the remainder of the text would end up on a separate
# line without a label.
test_lines = [
    f"__label__{labels} " + str(text).replace("\r", " ").replace("\n", " ")
    for labels, text in test_df.loc[:, ["labels", "text"]].values
]

# Joining once avoids repeated string concatenation; every line, including the
# last one, is terminated with "\n".
test_file_content = "".join(line + "\n" for line in test_lines)

with open("test_fasttext.test", "w") as test_file:
    test_file.write(test_file_content)

In [17]:
# Read only the first 1000 characters for a preview; the context manager makes
# sure the file handle is closed right after reading.
with open("test_fasttext.test", "r") as preview_handle:
    test_file = preview_handle.read(1000)

test_file

In [23]:
# The file holds one example per line, so line breaks inside a text are replaced
# with spaces; otherwise the remainder of the text would end up on a separate
# line without a label.
dev_lines = [
    f"__label__{labels} " + str(text).replace("\r", " ").replace("\n", " ")
    for labels, text in dev_df.loc[:, ["labels", "text"]].values
]

# Joining once avoids repeated string concatenation; every line, including the
# last one, is terminated with "\n".
dev_file_content = "".join(line + "\n" for line in dev_lines)

with open("dev_fasttext.valid", "w") as dev_file:
    dev_file.write(dev_file_content)

Transform labels into labels appropriate for FastText:

In [24]:
# Distinct labels of the train split, in order of first appearance.
LABELS = train_df["labels"].unique().tolist()
print(LABELS)

In [25]:
# Derive the prefixed labels directly from the train dataframe rather than from
# LABELS itself, so re-running this cell cannot stack a second "__label__" prefix.
LABELS = [f"__label__{label}" for label in train_df.labels.unique().tolist()]
LABELS

# Train a fastText model on GINCO

Resources:
* the Text Classification with Fasttext tutorial (https://github.com/mpuig/textclassification/blob/master/notebook.ipynb)
* Peter's code (https://github.com/5roop/task5_webgenres/)

Importing the necessary libraries

In [26]:
!pip install parse

In [27]:
import fasttext as ft
import parse
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

## Input the data:

In [28]:
# Paths of the fastText-formatted train, test and dev files.
FT_train_file, FT_test_file, FT_dev_file = (
    "train_fasttext.train",
    "test_fasttext.test",
    "dev_fasttext.valid",
)

## Training

In [29]:
def parse_test_file(path: str):
    """Read a fastText-formatted file and split every line into label and text.

    Each line is expected to look like ``<label> <text>``. The label is
    everything before the first space that is not the very first character of
    the line; the text is everything after that space.

    Arguments:
        path: path of the fastText-formatted file to read.

    Returns:
        A ``(labels, texts)`` tuple of two equally long lists of strings, in
        file order. Lines that cannot be split (no separating space, or nothing
        after it) are reported with ``print`` and left out of both lists.
    """
    labels, texts = list(), list()
    with open(path, "r") as f:
        for raw_line in f:
            # Drop only the line terminator, so that a final line without a
            # trailing newline is parsed instead of being reported as an error.
            line = raw_line[:-1] if raw_line.endswith("\n") else raw_line

            # Searching from index 1 keeps the label at least one character long.
            split_at = line.find(" ", 1)
            if split_at == -1 or split_at == len(line) - 1:
                # No separator, or no text after it.
                print("error parsing line ", raw_line)
                continue

            labels.append(line[:split_at])
            texts.append(line[split_at + 1:])
    return labels, texts

def prediction_to_label(prediction):
    """Return the first label of every prediction as a one-dimensional array.

    ``prediction[0]`` is converted to a NumPy array and its first column is
    returned; the remaining elements of ``prediction`` are not used.
    """
    predicted_labels = prediction[0]
    label_matrix = np.array(predicted_labels)
    return label_matrix[:, 0]

def plot_cm(save=False, title=None):
    """
    Draws the confusion matrix of the current predictions and reports F1 scores.

    Works on the module-level variables y_true (true labels), y_pred
    (predicted labels) and LABELS (label order used for the matrix, the axis
    ticks and the scores). Both scores are printed and shown in the title.

    Arguments:
        save: target passed to plt.savefig when truthy; nothing is saved
            when it is falsy. Defaults to False.
        title: optional heading; the scores are appended to it. Defaults to None.

    Returns:
        The tuple (microF1, macroF1).
    """
    # The scores do not depend on the figure, so they are computed first.
    microF1 = f1_score(y_true, y_pred, labels=LABELS, average="micro")
    macroF1 = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    print(f"{microF1:0.4}")
    print(f"{macroF1:0.4}")

    # Rows are true labels, columns are predicted labels, both ordered as LABELS.
    matrix = confusion_matrix(y_true, y_pred, labels=LABELS)
    plt.figure(figsize=(9, 9))
    plt.imshow(matrix, cmap="Oranges")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_positions = np.arange(len(LABELS))
    plt.xticks(tick_positions, LABELS, rotation=90)
    plt.yticks(tick_positions, LABELS)

    score_summary = f"{microF1:0.4}, {macroF1:0.4}"
    plt.title(title + ";\n" + score_summary if title else score_summary)
    plt.tight_layout()
    if save:
        plt.savefig(save)
    plt.show()
    return microF1, macroF1

## Finding optimal hyperparameters

### Using the automatic hyperparameter optimisation

In [None]:
results_automatic_optimisation = []

# The dev file does not change between runs, so it is parsed once up front.
# y_true is read by plot_cm as a module-level variable.
y_true, y_texts = parse_test_file(FT_dev_file)

for i in range(3):
    # Let fastText search for hyperparameters, validating on the dev file.
    # autotuneDuration is 60*5 — presumably seconds; confirm in the fastText docs.
    model = ft.train_supervised(input=FT_train_file,
                                autotuneValidationFile=FT_dev_file,
                                autotuneDuration=60*5,
                                verbose=2)

    # Evaluate the model on the dev data; y_pred is read by plot_cm as a
    # module-level variable.
    y_pred = model.predict(y_texts)
    y_pred = prediction_to_label(y_pred)

    # Plot the confusion matrix and collect the scores of this run:
    m, M = plot_cm(save=False, title=f"Automatic Hyperparameter Optimisation, run {i}")
    results_automatic_optimisation.append(dict(microF1=m, macroF1=M, run=i))

    # Print out every non-dunder attribute of the model's argument object,
    # i.e. the hyperparameters:
    args_obj = model.f.getArgs()
    for hparam in dir(args_obj):
        if not hparam.startswith('__'):
            print(f"{hparam} -> {getattr(args_obj, hparam)}")

### Finding optimal hyperparameters with experiments

In [76]:
results = list()

# Values of wordNgrams to try.
exp_range = [1,2,3,4,5]

# The dev file does not change between experiments, so it is parsed once up
# front. y_true is read by plot_cm as a module-level variable.
y_true, y_texts = parse_test_file(FT_dev_file)

for i in exp_range:
    # Only wordNgrams varies; epoch and lr stay fixed across the experiments.
    model = ft.train_supervised(input=FT_train_file,
                                epoch=350,
                                lr=0.7,
                                wordNgrams=i,
                                verbose=2)

    # Evaluate the model on the dev data; y_pred is read by plot_cm as a
    # module-level variable.
    y_pred = model.predict(y_texts)
    y_pred = prediction_to_label(y_pred)

    # Plot the confusion matrix and collect the scores for this n-gram size:
    m, M = plot_cm(save=False, title=f"NGram: {i}")
    results.append(dict(microF1=m, macroF1=M, ngram=i))

In [77]:
# Display the micro and macro F1 scores collected for each word n-gram setting.
results

In [78]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

plt.style.use('_mpl-gallery')

# Gather the series to plot: the n-gram size on the x axis and both F1 scores
# on the y axis.
x = [entry['ngram'] for entry in results]
mi = [entry['microF1'] for entry in results]
ma = [entry["macroF1"] for entry in results]

fig, ax = plt.subplots(figsize=(6,3), dpi=100)

# One line per score type.
for scores, legend_name in ((mi, "Micro F1"), (ma, "Macro F1")):
    ax.plot(x, scores, linewidth=2.0, label=legend_name)

# NOTE(review): the y range is fixed to 0.5-0.7, so scores outside it are not
# visible in the figure.
ax.set(xlim=(min(exp_range), max(exp_range)),
       ylim=(0.5, 0.7),xticks=exp_range)
ax.set(xlabel='Word n-Grams', ylabel='F1 Score')
ax.legend(loc="lower right")

plt.tight_layout()
plt.savefig("N-grams.png")
plt.show()

In [79]:
# Mean and standard deviation of each score over all n-gram settings.
micro_scores = np.array(mi)
macro_scores = np.array(ma)
print(f"micro F1: {micro_scores.mean():0.03} +/- {micro_scores.std():0.02}")
print(f"macro F1: {macro_scores.mean():0.03} +/- {macro_scores.std():0.02}")