In [1]:
from back2back import Back2BackTranslator
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import torch
from collections import Counter
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
def proprocess_data(validation_proportion=0.3, seed=1234, data_dir="./datasets"):
    """Load the PCL paragraphs and build the train / validation / test DataFrames.

    The paragraphs are read from ``dontpatronizeme_pcl.tsv`` (its first three
    lines are skipped) and split according to the ``par_id`` values listed in
    ``train_semeval_parids-labels.csv`` and ``dev_semeval_parids-labels.csv``.
    A paragraph is treated as positive when its ``label`` is greater than 1.5.

    The paragraphs selected by the train ids are split per class, so that the
    validation set keeps the original class proportions, while the positive
    rows left for training are repeated ``len(negatives) // len(positives)``
    times (at least once) to roughly balance the two classes.

    Parameters
    ----------
    validation_proportion : float, default 0.3
        Fraction (between 0 and 1) of each class moved to the validation set.
    seed : int, default 1234
        Seed of the private random generator used for the split and the final
        shuffle. The global NumPy random state is left untouched.
    data_dir : str, default "./datasets"
        Directory containing the three input files.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``(train_set, validation_set, dev_df)``, each with the columns
        ``text`` and ``labels`` (boolean). ``dev_df`` holds the paragraphs
        listed in the dev ids file and is meant to be used as the test set.

    Raises
    ------
    ValueError
        If ``validation_proportion`` is outside ``[0, 1]`` or the training
        paragraphs contain no positive example.
    """
    if not 0 <= validation_proportion <= 1:
        raise ValueError(
            f"validation_proportion must be between 0 and 1, got {validation_proportion!r}"
        )

    data_pcl = pd.read_csv(f"{data_dir}/dontpatronizeme_pcl.tsv", sep="\t", skiprows=3,
                           names=['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label'])
    dev_ids = pd.read_csv(f'{data_dir}/dev_semeval_parids-labels.csv')
    train_ids = pd.read_csv(f'{data_dir}/train_semeval_parids-labels.csv')

    # Binary labels
    data_pcl['labels'] = data_pcl.label > 1.5

    # Select train and test examples according to train_ids and dev_ids
    train_df = data_pcl.loc[data_pcl.par_id.isin(train_ids.par_id)][['text', 'labels']]
    dev_df = data_pcl.loc[data_pcl.par_id.isin(dev_ids.par_id)][['text', 'labels']]

    print('train_df.shape =', train_df.shape)
    print('dev_df.shape =', dev_df.shape)

    # Is pcl and is not pcl
    yes_pcl = train_df.loc[train_df.labels]
    no_pcl = train_df.loc[~train_df.labels]

    print('yes_pcl.shape =', yes_pcl.shape)
    print('no_pcl.shape =', no_pcl.shape)

    # Without positives the ratio below is undefined and there is nothing to oversample.
    if len(yes_pcl) == 0:
        raise ValueError('the training paragraphs contain no positive example')

    print('proportion_no_over_yes =', len(no_pcl) / len(yes_pcl))

    # Separate train and validation sets randomly with equal proportion of yes-no labels.
    # A private generator keeps the split reproducible without reseeding the global
    # NumPy state; it produces the same permutations as np.random.seed(seed) would.
    rng = np.random.RandomState(seed)
    yes_ids = rng.permutation(len(yes_pcl))
    no_ids = rng.permutation(len(no_pcl))

    n_yes_val = int(validation_proportion * len(yes_pcl))
    n_no_val = int(validation_proportion * len(no_pcl))

    # Keep at least one copy so the positives are never dropped when they
    # outnumber the negatives.
    n_yes_copies = max(1, len(no_pcl) // len(yes_pcl))

    yes_train = yes_pcl.iloc[yes_ids[n_yes_val:]]
    no_train = no_pcl.iloc[no_ids[n_no_val:]]

    validation_set = pd.concat((yes_pcl.iloc[yes_ids[:n_yes_val]], no_pcl.iloc[no_ids[:n_no_val]]))
    train_set = pd.concat([yes_train] * n_yes_copies + [no_train])

    print('n_yes_val =', (validation_set['labels'] > .5).sum())
    print('n_no_val =', (validation_set['labels'] < .5).sum())
    print('n_validation =', len(validation_set))

    print('n_yes_train =', (train_set['labels'] > .5).sum())
    print('n_no_train =', (train_set['labels'] < .5).sum())
    print('n_train =', len(train_set))

    # Shuffle the training set so the repeated positives are not grouped at the top.
    train_set = train_set.iloc[rng.permutation(len(train_set))]

    return train_set, validation_set, dev_df


# Build the three splits used by the rest of the notebook; the third value returned
# (the paragraphs selected by the dev ids) is used as the test set.
train_set, validation_set, test_set = proprocess_data(validation_proportion=0.3)

train_df.shape = (8375, 2)
dev_df.shape = (2094, 2)
yes_pcl.shape = (794, 2)
no_pcl.shape = (7581, 2)
proportion_no_over_yes = 9.547858942065492
n_yes_val = 238
n_no_val = 2274
n_validation = 2512
n_yes_train = 5004
n_no_train = 5307
n_train = 10311


In [3]:
# Show the shuffled training frame (pandas truncates the middle rows in the rendering).
train_set

Unnamed: 0,text,labels
8406,"And like temporary refugees , they became the ...",True
6189,Due to cultural factors and the government 's ...,False
239,"Stefanovic said immigrants "" from faraway land...",False
3137,The spirit the Rwandese have shown to accommod...,True
2700,Minnis told legislators that migrants who are ...,False
...,...,...
278,A submission from the Irish Women 's Council o...,False
333,The government has agreed to pay pensions to d...,False
6678,PIE replaces the common law action whereby own...,False
3548,Aside from the subdivision for urban poor fami...,False


In [4]:
def augmentation_b2b_translation(dataset, languages):
    """Return ``dataset`` followed by one back-translated copy per language.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``text`` column; it is not modified.
    languages : iterable of str
        Language identifiers, each forwarded as the first argument of
        ``Back2BackTranslator.translate_back2back``.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the translated copies, in the order of
        ``languages``. Row indices are kept as-is, so each original index
        value appears once per copy.

    Notes
    -----
    Uses ``Series.progress_apply``, which is only available after
    ``tqdm.pandas()`` has been called.
    """
    translator = Back2BackTranslator()

    def _translated_copy(pivot):
        # Work on a copy so the caller's frame keeps its original text.
        copy_df = dataset.copy()
        copy_df['text'] = copy_df['text'].progress_apply(
            lambda sentence: translator.translate_back2back(pivot, sentence)
        )
        return copy_df

    frames = [dataset]
    for pivot in languages:
        frames.append(_translated_copy(pivot))
    return pd.concat(frames)

In [7]:
# Pivot languages for the back-to-back translation; each identifier is handed unchanged
# to Back2BackTranslator.translate_back2back.
# NOTE(review): 'cn' and 'jp' look like country codes rather than ISO 639-1 language
# codes ('zh', 'ja') — confirm these are the identifiers the back2back module expects.
languages = ('pt', 'fr', 'cn', 'jp')

# Translation is too slow on a CPU-only machine, so only the first N_DEBUG_EXAMPLES rows
# are augmented. Set N_DEBUG_EXAMPLES = None to augment the whole training set
# (should be fine with a GPU).
N_DEBUG_EXAMPLES = 10
train_subset = train_set if N_DEBUG_EXAMPLES is None else train_set.iloc[:N_DEBUG_EXAMPLES]
new_train_set = augmentation_b2b_translation(train_subset, languages)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
print(new_train_set.shape) # In my case: 10 datapoints * (1 original + 4 back-translated copies) = 50 datapoints

(50, 2)


In [19]:
# Query CUDA availability once; the same flag configures the model and is echoed below.
gpu_available = torch.cuda.is_available()

model_args = ClassificationArgs(
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=4e-5,
    train_batch_size=16,  # was 8
    max_seq_length=256,  # was 128 by default
    # --- evaluation while training ---
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
    # --- output locations ---
    output_dir='./output1',
    best_model_dir='./output1/best_model',
    overwrite_output_dir=True,
    # --- saving, caching and logging ---
    no_save=False,
    no_cache=False,
    save_eval_checkpoints=True,
    save_model_every_epoch=True,
    save_steps=100_000,
    logging_steps=2,
)

# RoBERTa-base classifier configured with the arguments above.
model = ClassificationModel("roberta", "roberta-base", args=model_args, use_cuda=gpu_available)

print(gpu_available)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

False


In [21]:
# Smoke-test training on the mini set (no GPU is available for this notebook).
# validation_set lists its positive rows before its negative ones, so the first three and
# the last three positions give three examples of each class.
# Pass eval_df=validation_set to evaluate on the full validation set.
eval_positions = [0, 1, 2, -3, -2, -1]
model.train_model(
    new_train_set,
    eval_df=validation_set.iloc[eval_positions],
    show_running_loss=True,
    f1=f1_score,
)

  0%|          | 0/50 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/4 [00:00<?, ?it/s]



  0%|          | 0/6 [00:00<?, ?it/s]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Running Epoch 1 of 2:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

(8,
 defaultdict(list,
             {'global_step': [4, 8],
              'train_loss': [0.7654550075531006, 0.5815525054931641],
              'mcc': [0.0, 0.0],
              'tp': [0, 0],
              'tn': [3, 3],
              'fp': [0, 0],
              'fn': [3, 3],
              'auroc': [0.5555555555555556, 0.5555555555555556],
              'auprc': [0.5888888888888889, 0.5888888888888889],
              'f1': [0.0, 0.0],
              'eval_loss': [0.6934757232666016, 0.6939296126365662]}))

In [22]:
# Load the per-evaluation scores stored under the run's output directory ('./output1');
# presumably this file is written during training because evaluate_during_training is on.
out1 = pd.read_csv('./output1/training_progress_scores.csv')
out1

Unnamed: 0,global_step,train_loss,mcc,tp,tn,fp,fn,auroc,auprc,f1,eval_loss
0,4,0.765455,0.0,0,3,0,3,0.555556,0.588889,0.0,0.693476
1,8,0.581553,0.0,0,3,0,3,0.555556,0.588889,0.0,0.69393
