In [None]:
# Select the GPU to use on the GPU machine: order devices by PCI bus id and
# make only device 1 visible to CUDA
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

import json
import pandas as pd
import random
import regex
# Import the libraries necessary for data wrangling, prediction and result analysis
import json
import numpy as np
import pandas as pd
import logging
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score,precision_score, recall_score
import torch
from numba import cuda
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from simpletransformers.classification import ClassificationModel
import wandb
import os
# Switch off tokenizer parallelism (environment variable read by the tokenizers library)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Log in to Weights & Biases (wandb) so that training runs can be tracked
wandb.login()

# Train the model

In [None]:
# X-GENRE-2: read the train, dev and test splits (first CSV column is the index)
train_df, dev_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "/kaggle/input/xgenre2/X-GENRE-train.csv-2.csv",
        "/kaggle/input/xgenre2/X-GENRE-dev.csv-2.csv",
        "/kaggle/input/xgenre2/X-GENRE-test.csv-2.csv",
    )
)

# Report the size of each split
print(f"X-GENRE-2 train shape: {train_df.shape}, Dev shape: {dev_df.shape}, Test shape: {test_df.shape}.")

In [None]:
# Label inventory: the distinct values of the "labels" column of the train
# split, in order of first appearance
LABELS = train_df["labels"].unique().tolist()
print(LABELS)

In [None]:
# Start the Weights & Biases run under which the X-GENRE-2 training is logged
wandb.init(
    project="X-GENRE classifiers",
    entity="tajak",
    name="X-GENRE-2-training",
)

In [None]:
# Number of training steps in one epoch:
#   steps per epoch = training samples / batch size (rounded down)
# NOTE(review): 1562 is presumably the number of training samples and 8 the
# train batch size used for the model — confirm against train_df.shape.
steps_per_epoch = 1562 // 8
steps_per_epoch

In [None]:
# Build the XLM-RoBERTa classifier for the run that is evaluated during training
roberta_base_model = ClassificationModel(
    "xlmroberta",
    "xlm-roberta-base",
    num_labels=len(LABELS),
    use_cuda=True,
    args={
        # --- optimisation ---
        "num_train_epochs": 30,
        "train_batch_size": 8,
        "learning_rate": 1e-5,
        "max_seq_length": 512,
        "labels_list": LABELS,
        # --- evaluation during training (every 10 epochs' worth of steps) ---
        "evaluate_during_training": True,
        "evaluate_during_training_steps": steps_per_epoch*10,
        "evaluate_during_training_verbose": True,
        "use_cached_eval_features": True,
        "reprocess_input_data": True,
        # --- caching / saving ---
        # no_cache and no_save are switched on for this run;
        # remove them if you want to save the model
        "overwrite_output_dir": True,
        "no_cache": True,
        "no_save": True,
        "save_steps": -1,
        # No per-epoch model saving - to prevent filling all of the space
        "save_model_every_epoch": False,
        # --- logging ---
        "wandb_project": "X-GENRE classifiers",
        "silent": True,
    },
)

In [None]:
# Fine-tune the model on the train split; dev_df is passed as the evaluation
# data for the evaluate-during-training settings configured on the model
roberta_base_model.train_model(train_df, eval_df = dev_df)

In [None]:
def testing(test_df, test_name, epoch):
    """
    Evaluate the trained model on a labelled dataset.

    Predicts a label for every text in `test_df` with the global
    `roberta_base_model`, prints the macro and micro F1 scores, shows the
    confusion matrix and writes the results to
    `backup-results-{test_name}.json`.

    Args:
    - test_df (pandas DataFrame): must have a `text` column (model input)
      and a `labels` column (true labels)
    - test_name (str): name of the experiment; used as the plot title and
      in the name of the backup file
    - epoch: num_train_epochs the model was trained with (only recorded in
      the results, not used for the evaluation itself)

    Returns:
    - dict with the experiment settings, the micro/macro F1 scores and the
      true and predicted labels (the same content that is written to the
      backup file)
    """
    # Get the true labels
    y_true = test_df.labels

    model = roberta_base_model

    # Predict all texts with a single call instead of one call per row, so
    # that the model can batch the inputs. The predictions are re-attached to
    # the index of test_df so they stay aligned with y_true.
    predictions, _ = model.predict(test_df.text.tolist())
    y_pred = pd.Series(predictions, index=test_df.index)

    # Calculate the scores
    macro = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    micro = f1_score(y_true, y_pred, labels=LABELS, average="micro")
    print(f"Macro f1: {macro:0.3}, Micro f1: {micro:0.3}")

    # Plot the confusion matrix (rows: true labels, columns: predicted labels)
    cm = confusion_matrix(y_true, y_pred, labels=LABELS)
    fig, ax = plt.subplots(figsize=(9, 9))
    ax.imshow(cm, cmap="Oranges")
    for (i, j), z in np.ndenumerate(cm):
        ax.text(j, i, '{:d}'.format(z), ha='center', va='center')
    tick_marks = np.arange(len(LABELS))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(LABELS, rotation=90)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(LABELS)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    ax.set_title(f"{test_name}")
    fig.tight_layout()
    # To keep the figure on disk, save it here (before it is shown and closed):
    # fig.savefig(f"Confusion-matrix-{test_name}.png", dpi=100)
    plt.show()
    # Close the figure explicitly: this function is called in a loop and
    # open figures would otherwise accumulate in memory
    plt.close(fig)

    # Collect the results
    rezdict = {
        "experiment": test_name,
        "num_train_epochs": epoch,
        "train_batch_size": 8,
        "learning_rate": 1e-5,
        "microF1": micro,
        "macroF1": macro,
        "y_true": y_true.to_dict(),
        "y_pred": y_pred.to_dict(),
        }

    # Save intermediate results (just in case) as a one-element list
    with open(f"backup-results-{test_name}.json", "w") as backup_file:
        json.dump([rezdict], backup_file, indent="")

    return rezdict

In [None]:
# Train the model for various epochs to find the optimum number
epochs = [2, 5, 8, 10, 15, 20]

# One result dict per tested number of epochs; used afterwards to compare the runs
previous_results = []

for epoch in epochs:
    roberta_base_model = ClassificationModel(
                "xlmroberta", "xlm-roberta-base",
                num_labels=len(LABELS),
                use_cuda=True,
                args= {
                    "overwrite_output_dir": True,
                    "num_train_epochs": epoch,
                    "train_batch_size":8,
                    "learning_rate": 1e-5,
                    "labels_list": LABELS,
                    # no_cache and no_save are switched on during the search;
                    # remove them if you want to save the model
                    "no_cache": True,
                    "no_save": True,
                    "max_seq_length": 512,
                    "save_steps": -1,
                    # No per-epoch model saving - to prevent filling all of the space
                    "save_model_every_epoch":False,
                    "wandb_project": 'X-GENRE classifiers',
                    "silent": True,
                    }
                )

    # Train the model
    roberta_base_model.train_model(train_df)

    # Test the model on dev_df
    test_name = f"X-GENRE-dev-epoch-search:{epoch}"
    testing(dev_df, test_name, epoch)

    # `testing` writes its result dict to a backup file as a JSON list;
    # read it back so that the results of all runs end up in one list
    with open(f"backup-results-{test_name}.json") as backup_file:
        previous_results.extend(json.load(backup_file))

In [None]:
# Compare the results by creating a dataframe from the collected results.
# Expects `previous_results` to be a list with one result dict per tested
# number of epochs (the dicts built in `testing`).
# NOTE(review): confirm that `previous_results` has been populated before
# this cell is run.
results_df = pd.DataFrame(previous_results)

results_df

In [None]:
# Build the final XLM-RoBERTa classifier (8 training epochs)
roberta_base_model = ClassificationModel(
    "xlmroberta",
    "xlm-roberta-base",
    num_labels=len(LABELS),
    use_cuda=True,
    args={
        # --- optimisation ---
        "num_train_epochs": 8,
        "train_batch_size": 8,
        "learning_rate": 1e-5,
        "max_seq_length": 512,
        "labels_list": LABELS,
        # --- caching / saving ---
        # "no_cache": True and "no_save": True are deliberately left out here,
        # because this time the model should be saved
        "overwrite_output_dir": True,
        "save_steps": -1,
        # Only the trained model will be saved - to prevent filling all of the space
        "save_model_every_epoch": False,
        # --- logging ---
        "wandb_project": "X-GENRE classifiers",
        "silent": True,
    },
)

In [None]:
# Fine-tune the final model on the train split
roberta_base_model.train_model(train_df)

## Predicting genres

In [None]:
# Load the final dataset with the manually annotated test sets.
# The encoding is given explicitly so that the multilingual text is decoded
# the same way on every platform instead of with the locale default.
with open("manual-annotations/multilingual-genre-annotated-test-set.json", encoding="utf-8") as main_file:
    main_dict = json.load(main_file)

main_dict.keys()

In [None]:
from transformers import AutoTokenizer
import sys
import torch
import json
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
import argparse
from knockknock import discord_sender

In [None]:
def predict_genre(texts, batch_size=32, device=None):
    """
    Predict a genre label for each text with the X-GENRE classifier.

    Loads the "classla/xlm-roberta-base-multilingual-text-genre-classifier"
    model and tokenizer, runs the texts through the model in batches and
    maps each output to a genre label. A text whose most probable label has
    a softmax probability below 0.8 is labelled 'Mix' instead.

    Args:
    - texts: iterable of strings to classify
    - batch_size (int): number of texts per forward pass; keeps memory use
      bounded for long lists of texts
    - device (str, optional): torch device to run on; defaults to "cuda:0"
      when CUDA is available and to "cpu" otherwise

    Returns:
    - list of str: one label per input text, in input order

    Raises:
    - ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # Nothing to classify - skip loading the model altogether
        return []

    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"

    checkpoint = "classla/xlm-roberta-base-multilingual-text-genre-classifier"
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Position i of a logit vector is paired with labels[i]
    labels = ["Other", "Information/Explanation", "News", "Instruction", "Opinion/Argumentation", "Forum", "Prose/Lyrical", "Legal", "Promotion"]

    def transcode(logit):
        # Most probable label, or 'Mix' if the model is not confident enough
        label, probability = max(zip(labels, softmax(logit)), key=lambda x: x[1])
        return label if probability >= 0.8 else 'Mix'

    prediction_list = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, max_length=512, truncation=True, padding=True, return_tensors="pt").to(device)

        with torch.no_grad():
            logits = model(**inputs).logits

        prediction_list.extend(transcode(logit) for logit in logits.tolist())

    print("Prediction finished.")

    return prediction_list

In [None]:
# Predict a genre for every text variant and store the predictions in a new
# "<column>-pred" column.
# NOTE(review): `df` is not defined in any earlier visible cell - confirm
# where it is loaded before this cell is run.
for column in (
    'translation',
    'shuffled-text',
    'text_no_punct',
    'text_no_capital',
    'text_no_capital_rand',
    'text_no_num',
    'text_no_num_rand',
    'text_no_structure',
):
    text_list = df[column].to_list()
    print(f"Predicting genres to column {column}")
    prediction_list = predict_genre(text_list)
    df[f"{column}-pred"] = prediction_list


df.head(1)

## Analysing predictions

In [None]:
import pandas as pd
from evaluation import testing

In [None]:
# Open the merged test sets used for the adversarial analysis (the first CSV
# column is used as the index) and preview the first two rows
df = pd.read_csv("datasets/adversarial-analysis/merged-test-sets-for-adversarial-analysis.csv", index_col=0)
df.head(2)

In [None]:
def _label_counts(frame, source_column, name):
    """Return the absolute label counts of `source_column` as a Series called `name`."""
    return frame[source_column].value_counts().rename(name)


def _relative_change(table, count_column, reference_column):
    """Return the change of `count_column` relative to `reference_column`, in percent."""
    return (table[count_column] - table[reference_column]) / table[reference_column] * 100


def _label_distribution_table(frame, column, baseline_column, baseline_tag):
    """
    Compare the label distribution of `column` with a baseline column.

    The table has one row per label and holds the proportion and count of
    the label in `column`, its count in `baseline_column`, its count in
    "y_true" (unless `column` is "y_true" itself) and the relative changes
    in percent. Rows are sorted by the change versus the baseline, largest
    first.

    Args:
    - frame (pandas DataFrame): data with the label columns
    - column (str): column whose label distribution is analysed
    - baseline_column (str): column the distribution is compared with
    - baseline_tag (str): short baseline name used in the column headers

    Returns:
    - pandas DataFrame with the comparison table
    """
    count_name = f"count_{column}"
    baseline_name = f"count_{baseline_tag}"
    change_name = f"change vs {baseline_tag} (%)"
    with_y_true = column != "y_true"

    # Name every Series explicitly instead of relying on the default name
    # that value_counts() gives its result (it differs between pandas versions)
    parts = [
        frame[column].value_counts(normalize=True).rename("proportion"),
        _label_counts(frame, column, count_name),
        _label_counts(frame, baseline_column, baseline_name),
    ]
    if with_y_true:
        parts.append(_label_counts(frame, "y_true", "count_y_true"))

    # Labels missing from one of the columns get NaN in that column
    table = pd.concat(parts, axis=1)

    if with_y_true:
        table["change vs y_true (%)"] = _relative_change(table, count_name, "count_y_true")
    table[change_name] = _relative_change(table, count_name, baseline_name)

    return table.sort_values(by=change_name, ascending=False)


# Columns that are compared with a randomised control instead of with the
# predictions on the original text ("y_pred"):
# column -> (baseline column, baseline name used in the table headers)
CONTROL_BASELINES = {
    'text_no_capital-pred': ('text_no_capital_rand-pred', 'no_capital_rand'),
    'text_no_num-pred': ('text_no_num_rand-pred', 'no_num_rand'),
    'text_no_structure-pred': ('text_no_num_rand-pred', 'no_num_rand'),
}

for column in ['y_true', 'translation-pred', 'shuffled-text-pred','text_no_punct-pred', 'text_no_capital-pred','text_no_capital_rand-pred', 'text_no_num-pred','text_no_num_rand-pred', 'text_no_structure-pred', 'text_random_removal_10-pred',
       'text_random_removal_25-pred', 'text_random_removal_50-pred']:
    print(f"{column}\n\n")
    # Every column without a dedicated control is compared with "y_pred".
    # This includes 'text_no_num_rand-pred', which previously matched no
    # branch and only got its header printed.
    baseline_column, baseline_tag = CONTROL_BASELINES.get(column, ("y_pred", "y_pred"))
    merge_stats = _label_distribution_table(df, column, baseline_column, baseline_tag)
    print(merge_stats.to_markdown())
    print("\n --------------------------- \n")


In [None]:
# Macro F1 and per-label F1 for every prediction column: {column: {score name: value}}
results = {}

# The true labels and the label inventory are the same for every column
y_true = df["y_true"].to_list()
labels = list(df["y_true"].unique())

for column in ["y_pred", 'translation-pred', 'shuffled-text-pred', 'text_no_punct-pred', 'text_no_capital-pred', 'text_no_capital_rand-pred', 'text_no_num-pred', 'text_no_num_rand-pred', 'text_no_structure-pred', 'text_random_removal_10-pred', 'text_random_removal_25-pred', 'text_random_removal_50-pred']:
    print(column)
    print("\n\n")
    y_pred = df[column].to_list()

    # NOTE(review): `testing` is assumed to be the function imported from the
    # `evaluation` module, not the one defined earlier in this notebook, whose
    # signature is different - confirm.
    current_result = testing(y_true, y_pred, labels, show_matrix=True)

    print(current_result)

    current_dict = {"macro_F1": current_result["macro F1"]}

    for label in labels:
        try:
            current_dict[f"{label}_F1"] = current_result["report"][label]["f1-score"]
        except KeyError:
            # No entry for this label in the report: leave the score out.
            # Only missing keys are skipped, so that other errors stay visible.
            continue

    results[column] = current_dict
    print("---------------")

In [None]:
# Tabulate the scores: one column per prediction column, one row per score
# (macro F1 and the per-label F1 scores)
results_df = pd.DataFrame(results)

results_df