# Installing dependencies

The following cell assumes you are running this notebook with Python 3.6 on a Linux machine with a CUDA-capable GPU.
If any of these assumptions does not hold, replace the first pip install line in the next cell with the appropriate command from https://pytorch.org/get-started/locally/.

In [10]:

# Install the notebook's dependencies with the pip of the interpreter that runs this
# kernel (sys.executable), then clear the installation logs from the cell output.
print("Installing dependencies...")
from IPython.display import clear_output
import sys
# PyTorch, torchvision and torchaudio pinned to their cu113 builds from the PyTorch wheel index.
!{sys.executable} -m pip install torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html --ignore-installed
# NOTE(review): the packages below are unpinned and installed with --ignore-installed;
# presumably this can also re-install their dependencies (possibly a different torch
# build than the one pinned above) -- TODO confirm the pinned build is still in place afterwards.
!{sys.executable} -m pip install numpy pandas matplotlib tqdm simpletransformers  --ignore-installed
!{sys.executable} -m pip install -U scikit-learn --ignore-installed
# NOTE(review): simpletransformers is already part of the install two lines above; this repeats it.
!{sys.executable} -m pip install simpletransformers --ignore-installed
# Remove the pip output so that only the status messages remain visible.
clear_output()

print("Installing dependencies: done")

Installing dependencies: done


## Importing packages & Setting up CUDA GPU device

In [11]:
print("Importing installed packages...")
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import torch
from collections import Counter
from tqdm import tqdm
tqdm.pandas()

#%%
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device=", device)

# Release cached GPU memory left over from earlier runs in this kernel.
if device.type == "cuda":
    print("Emptying CUDA GPU Cache")
    torch.cuda.empty_cache()

print("Importing installed packages: done")

Importing installed packages...
device= cuda
Emptying CUDA GPU Cache
Importing installed packages: done


### Extract datasets from csvs:

In [12]:
print("Extracting datasets from csvs...")
# Id files for the train split and for the dev split (the dev split is used as test set here).
train_ids = pd.read_csv('./datasets/train_semeval_parids-labels.csv')
test_ids = pd.read_csv('./datasets/dev_semeval_parids-labels.csv')

# Main corpus: tab-separated, first three lines skipped, column names supplied explicitly.
data_pcl = pd.read_csv(
    "./datasets/dontpatronizeme_pcl.tsv",
    sep="\t",
    skiprows=3,
    names=['par_id','art_id','keyword','country_code','text','label'],
)

# Binary PCL flag derived from the annotation score: True when the score is 2 or more.
data_pcl['is_pcl'] = data_pcl['label'].ge(2)

Extracting datasets from csvs...


### Pre-processing

In [13]:

print("Pre-processing dataset")
# Separate train and test frames according to the par_ids listed in train_ids and test_ids.
# The train frame additionally keeps the keyword column.
train_df = data_pcl.loc[
    data_pcl['par_id'].isin(train_ids['par_id']),
    ['par_id','keyword','text', 'is_pcl'],
]
test_df = data_pcl.loc[
    data_pcl['par_id'].isin(test_ids['par_id']),
    ['par_id','text', 'is_pcl'],
]

# Positive (PCL) and negative (non-PCL) training rows.
yes_pcl = train_df[train_df['is_pcl']]
no_pcl = train_df[~train_df['is_pcl']]

Pre-processing dataset


In [14]:
# Separate data frame into train and validation sets
def separate_train_validation(pcl_df, percent_validation=0.10):
    """Split a frame into validation and train parts, keyword by keyword.

    For every distinct value of the `keyword` column, the first
    floor(n * percent_validation) rows of that keyword go to the validation
    part and the remaining rows go to the train part, so both parts keep the
    keyword proportions of the input.

    Args:
        pcl_df: DataFrame with a `keyword` column. Rows whose keyword is
            missing (NaN) end up in neither part.
        percent_validation: fraction of each keyword's rows assigned to the
            validation part; must lie in [0, 1].

    Returns:
        Tuple `(validation_df, train_df)`. Keywords appear in order of first
        occurrence in `pcl_df`, and rows keep their original relative order
        and index labels.
    """
    assert 0 <= percent_validation <= 1, "percent_validation must be within [0, 1]"

    validation_parts = []
    train_parts = []
    # Group on the frame that was passed in (not on a notebook-level frame) so the
    # function is self-contained; sort=False keeps first-occurrence keyword order,
    # which makes the output order reproducible between runs.
    for _, keyword_rows in pcl_df.groupby("keyword", sort=False):
        validation_set_len = int(np.floor(len(keyword_rows) * percent_validation))
        validation_parts.append(keyword_rows.iloc[:validation_set_len])
        train_parts.append(keyword_rows.iloc[validation_set_len:])

    # pd.concat refuses an empty list, so an input without any keyword yields two empty frames.
    if not validation_parts:
        return pcl_df.iloc[:0], pcl_df.iloc[:0]

    return pd.concat(validation_parts), pd.concat(train_parts)

# Split the positive and the negative rows separately so that both the validation
# and the train part contain examples of each class (default validation fraction).
(yes_pcl_validation, yes_pcl_train) = separate_train_validation(pcl_df=yes_pcl)
(no_pcl_validation, no_pcl_train) = separate_train_validation(pcl_df=no_pcl)

In [15]:
# Augment yes_pcl data frame with backtranslated data
def augment_with_translations(yes_pcl_df, no_pcl_df,
                              translations_path="./datasets/data_pcl_translations.csv"):
    """Combine PCL rows, their back-translations and non-PCL rows into one frame.

    Args:
        yes_pcl_df: PCL rows; needs `par_id`, `text` and `is_pcl` columns.
        no_pcl_df: non-PCL rows; needs `text` and `is_pcl` columns.
        translations_path: CSV file with the back-translated paragraphs; it
            must provide `par_id` and `text` columns. Defaults to the file
            used by this notebook.

    Returns:
        DataFrame with the columns `text` and `is_pcl`, holding the rows of
        `yes_pcl_df`, then the matching translations, then `no_pcl_df`.
        The number of positive and negative rows is printed as a side effect.
    """
    translations = pd.read_csv(translations_path)
    # Every back-translated paragraph is used as a positive example.
    translations['is_pcl'] = True
    # Keep only translations of paragraphs present in this split, so a paragraph
    # and its translation never end up on different sides of the train/validation split.
    translations = translations.loc[translations.par_id.isin(yes_pcl_df.par_id)]

    new_df = pd.concat([yes_pcl_df, translations, no_pcl_df])[['text', 'is_pcl']]
    print('nb yes', (new_df['is_pcl'] > .5).sum())
    print('nb no', (new_df['is_pcl'] < .5).sum())
    return new_df

print("Augmenting training dataframe with translations")
# Train and validation parts are augmented independently, each with the
# translations of its own positive paragraphs only.
new_train = augment_with_translations(yes_pcl_train, no_pcl_train)
new_validation = augment_with_translations(yes_pcl_validation, no_pcl_validation)

print("Data augmentation: done")
# Class balance of both parts after augmentation.
print('PCL len(validation) = ', new_validation['is_pcl'].gt(.5).sum())
print('NON-PCL len(validation) = ', new_validation['is_pcl'].lt(.5).sum())
print('PCL len(train) = ', new_train['is_pcl'].gt(.5).sum())
print('NON-PCL len(train) = ', new_train['is_pcl'].lt(.5).sum())

Augmenting training dataframe with translations
nb yes 6471
nb no 6827
nb yes 675
nb no 754
Data augmentation: done
PCL len(validation) =  675
NON-PCL len(validation) =  754
PCL len(train) =  6471
NON-PCL len(train) =  6827


# Fine Tuning Model

In [17]:
print("Creating model...")
# Hyper-parameters of the run; they are referenced by the arguments below and
# batch size / sequence length are also encoded in the output directory name.
batch_size = 64
seq_length = 128
epochs = 10

# NOTE(review): the early_stopping_* settings below presumably only take effect when
# `use_early_stopping=True` is passed as well, which is not the case here -- confirm
# against the Simple Transformers documentation whether early stopping is intended.
task1_model_args = ClassificationArgs(num_train_epochs=epochs,  # was a literal 10, leaving `epochs` unused
                                        no_save=False,
                                        no_cache=False,
                                        overwrite_output_dir=True,
                                        evaluate_during_training=True,
                                        # run-specific directories for checkpoints and for the best model
                                        output_dir=f'./outputs/test_roberta_bs_{batch_size}_seq_{seq_length}',
                                        best_model_dir=f'./outputs/test_roberta_bs_{batch_size}_seq_{seq_length}/best_model',
                                        max_seq_length=seq_length,  # worth checking whether this truncates our texts
                                        save_eval_checkpoints=False,
                                        save_model_every_epoch=True,
                                        save_steps=-1,
                                        evaluate_during_training_verbose=False,
                                        learning_rate=4e-5,
                                        train_batch_size=batch_size,
                                        early_stopping_metric='f1',
                                        early_stopping_metric_minimize=False,
                                        early_stopping_patience=100,
                                        )


# Pre-trained roberta-base with a classification head; runs on the GPU only when CUDA is available.
task1_model = ClassificationModel("roberta", "roberta-base",
                                    args=task1_model_args,
                                    use_cuda=torch.cuda.is_available()
                                    )

print("Pre-Trained Roberta Model Generated.")

Creating model...


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [None]:
# Fine-tune on the augmented training frame; the validation frame is passed as
# evaluation data and `f1_score` is handed over as an additional metric named "f1".
print("Training the model...")
task1_model.train_model(
    new_train,
    show_running_loss=True,
    eval_df=new_validation,
    f1=f1_score,
)
print("Model trained")

In [None]:

# Predict a class for every paragraph of the dev split; the second return value
# of predict() is not needed here and is discarded.
print("Running prediction for dev dataset")
dev_texts = test_df['text'].tolist()
preds_task1 = task1_model.predict(dev_texts)[0]
# Show how many paragraphs were assigned to each predicted class.
print(Counter(preds_task1))

In [None]:

# Evaluate predictions
# Ground truth is the binary `is_pcl` flag: `test_df` only holds par_id/text/is_pcl,
# so there is no `label` column to compare against (and the raw label is a score,
# not a 0/1 target).
predicted_positive = np.asarray(preds_task1) == 1
actual_positive = test_df.is_pcl.to_numpy(dtype=bool)

# Confusion-matrix cells as raw counts (not ratios), so they can be combined below.
true_positive = int((predicted_positive & actual_positive).sum())
false_positive = int((predicted_positive & ~actual_positive).sum())
true_negative = int((~predicted_positive & ~actual_positive).sum())
false_negative = int((~predicted_positive & actual_positive).sum())

# precision = TP / (TP + FP), recall = TP / (TP + FN); fall back to 0.0 when a
# denominator is empty (e.g. the model never predicts the positive class).
predicted_positive_count = true_positive + false_positive
actual_positive_count = true_positive + false_negative
precision = true_positive / predicted_positive_count if predicted_positive_count else 0.0
recall = true_positive / actual_positive_count if actual_positive_count else 0.0
accuracy = float((predicted_positive == actual_positive).mean())

# Stored as `f1` so that the `f1_score` function imported from sklearn (and passed to
# train_model as a metric) is not overwritten when this cell runs.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)