# Main imports and code

In [1]:
# check which gpu we're using
!nvidia-smi

Thu Mar  3 22:23:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
|  0%   49C    P8    42W / 370W |   3684MiB / 24265MiB |     24%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# !pip install simpletransformers
# !pip install tensorboardx

In [3]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval

In [4]:
# prepare logger: configure the root logger to show INFO-level messages
logging.basicConfig(level=logging.INFO)

# keep the "transformers" logger quieter than the root logger (WARNING and above only)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu: this flag is later passed as `use_cuda` when the models are created
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [5]:
# NOTE(review): `tf` is not referenced by any active code visible in this notebook; the import
# looks like a leftover from the commented-out device check below. Confirm whether TensorFlow
# is still needed before keeping this dependency.
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
#   device_name = tf.test.gpu_device_name()
#   # The device name should look like the following:
#   if device_name == '/device:GPU:0':
#       print('Found GPU at: {}'.format(device_name))
#   else:
#       raise SystemError('GPU device not found')

# Fetch Don't Patronize Me! data manager module

In [6]:
# module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
# module_name = module_url.split('/')[-1]
# print(f'Fetching {module_url}')
# #with open("file_1.txt") as f1, open("file_2.txt") as f2
# with request.urlopen(module_url) as f, open(module_name,'w') as outf:
#   a = f.read()
#   outf.write(a.decode('utf-8'))

In [7]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
    """Write predictions to `outf_path`, one prediction per line.

    Args:
        p: iterable of predictions; each prediction is itself an iterable of
            values, which are converted with `str` and joined with commas.
        outf_path: path of the output file (created or overwritten).
    """
    with open(outf_path, 'w') as outf:
        outf.writelines(','.join(map(str, prediction)) + '\n' for prediction in p)

In [8]:
from dont_patronize_me import DontPatronizeMe

In [9]:
dpm = DontPatronizeMe('data/', 'data/task4_test.tsv')

In [10]:
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


# Load paragraph IDs

In [11]:
trids = pd.read_csv('data/train_semeval_parids-labels.csv')
teids = pd.read_csv('data/dev_semeval_parids-labels.csv')

In [12]:
trids.head()

Unnamed: 0,par_id,label
0,4341,"[1, 0, 0, 1, 0, 0, 0]"
1,4136,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,"[1, 0, 0, 1, 1, 1, 0]"


In [13]:
# Cast the paragraph ids of both id tables to str; they are later compared against the
# dataset's own par_id column (presumably stored as strings — verify against the data manager).
for ids_frame in (trids, teids):
    ids_frame['par_id'] = ids_frame['par_id'].astype(str)

# Rebuild training set (Task 1)

In [14]:
def rebuild_raw_dataset(dpm, ids):
    """Rebuild a dataset split by looking up each paragraph id in the full dataset.

    Args:
        dpm: DataFrame holding the full dataset. Despite the name this is a
            DataFrame (the notebook passes `dpm.train_task1_df`), and it must
            provide the columns `par_id`, `text`, `label`, `orig_label`,
            `art_id`, `keyword` and `country`.
        ids: DataFrame with a `par_id` column listing the paragraphs to select.
            Rows are processed in positional order, whatever index `ids` has.

    Returns:
        A new DataFrame with one row per entry of `ids.par_id` (same order) and
        the columns `par_id`, `text`, `label`, `orig_label`, `art_id`,
        `keyword`, `country`. When a par_id occurs several times in `dpm`, the
        first occurrence is used.

    Raises:
        IndexError: if a par_id from `ids` is not present in `dpm`.
    """
    copied_columns = ['text', 'label', 'orig_label', 'art_id', 'keyword', 'country']
    rows = []  # will contain par_id plus the copied columns
    # Iterate over the values directly: positional-by-label access (`ids.par_id[idx]`)
    # breaks as soon as `ids` does not carry a default 0..n-1 index.
    for parid in ids.par_id:
        # select the matching row(s) from the original dataset once per id
        matches = dpm.loc[dpm.par_id == parid]
        if matches.empty:
            raise IndexError(f'par_id {parid!r} not found in the source dataset')
        row = {'par_id': parid}
        for column in copied_columns:
            row[column] = matches[column].values[0]
        rows.append(row)
    return pd.DataFrame(rows)

In [15]:
trdf1 = rebuild_raw_dataset(dpm.train_task1_df, trids)

# Rebuild test set (Task 1)

In [16]:
# rows = [] # will contain par_id, label and text
# for idx in range(len(teids)):  
#   parid = teids.par_id[idx]
#   #print(parid)
#   # select row from original dataset
#   text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
#   label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
#   rows.append({
#       'par_id':parid,
#       'text':text,
#       'label':label
#   })

tedf1 = rebuild_raw_dataset(dpm.train_task1_df, teids)
  

In [17]:
len(tedf1)

2094

In [18]:
# tedf1 = pd.DataFrame(rows)

# RoBERTa Baseline for Task 1

In [19]:
# downsample negative instances: keep every positive paragraph (label == 1) and only
# the first 2 * npos negative paragraphs (label == 0), in their existing order
is_positive = trdf1.label == 1
pcldf = trdf1[is_positive]
npos = len(pcldf)

negatives = trdf1[trdf1.label == 0]
training_set1 = pd.concat([pcldf, negatives.iloc[:npos * 2]])

In [20]:
training_set1

Unnamed: 0,par_id,text,label,orig_label,art_id,keyword,country
0,4341,"The scheme saw an estimated 150,000 children f...",1,4,@@17139403,poor-families,gb
1,4136,Durban 's homeless communities reconciliation ...,1,2,@@22273328,homeless,za
2,10352,The next immediate problem that cropped up was...,1,4,@@21102155,poor-families,lk
3,8279,Far more important than the implications for t...,1,2,@@21220476,vulnerable,nz
4,1164,To strengthen child-sensitive social protectio...,1,4,@@14727121,poor-families,gh
...,...,...,...,...,...,...,...
2377,1775,Last but not the least element of culpability ...,0,0,@@9622478,refugee,nz
2378,1776,"Then , taking the art of counter-intuitive non...",0,0,@@7919587,refugee,ie
2379,1777,Kagunga village was reported to lack necessary...,0,0,@@1921089,refugee,tz
2380,1778,"""After her parents high-profile divorce after ...",0,1,@@19805601,vulnerable,my


In [21]:
import random, os
import numpy as np
def seed_everything(seed: int):
    """Seed the random number generators used in this notebook for reproducible runs.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch (CPU and
    every CUDA device), and switches cuDNN to deterministic mode.

    Args:
        seed: the seed value applied to every generator.
    """
    random.seed(seed)
    # Hash randomization is fixed at interpreter start-up, so this only takes effect
    # in Python processes launched after this call (e.g. worker subprocesses).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # seed all CUDA devices, not only the current one (a no-op without CUDA)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN auto-tune and pick different algorithms between runs,
    # which defeats reproducibility; it must be off when determinism is requested
    torch.backends.cudnn.benchmark = False

In [22]:
# Seed all RNGs before the model is created so that the run starts from a fixed seed.
seed_everything(42)

# Training configuration: a single epoch, with no_save / no_cache enabled and the
# output directory allowed to be overwritten.
task1_model_args = ClassificationArgs(num_train_epochs=1, 
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)
# Binary classifier (num_labels=2) built on the `roberta-base` checkpoint;
# the GPU is used only when CUDA was detected earlier.
task1_model = ClassificationModel("roberta", 
                                  'roberta-base', 
                                  args = task1_model_args, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model on the downsampled training set, using only the text and label columns
task1_model.train_model(training_set1[['text', 'label']])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

(298, 0.511156686450411)

In [27]:
# analysis: load "test_short.tsv" through a separate data manager, predict with the
# Task 1 model and report F1 / precision / recall against the loaded labels
analysis_dpm = DontPatronizeMe('data/', 'data/task4_test.tsv')
analysis_dpm.load_task1("test_short.tsv")
analysis = analysis_dpm.train_task1_df
print(len(analysis))

analysis_texts = analysis.text.tolist()
preds_task1, _ = task1_model.predict(analysis_texts)

print(Counter(preds_task1))
from datasets import load_metric

metric_f1 = load_metric("f1")
metric_precision = load_metric("precision")
metric_recall = load_metric("recall")

# the reference labels are extracted once and shared by the three metrics
gold_labels = analysis.label.tolist()

f1 = metric_f1.compute(predictions=preds_task1, references=gold_labels)["f1"]
precision = metric_precision.compute(predictions=preds_task1, references=gold_labels)["precision"]
recall = metric_recall.compute(predictions=preds_task1, references=gold_labels)["recall"]
f1, precision, recall

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


1012


  0%|          | 3/1012 [00:00<01:25, 11.78it/s]
100%|██████████| 127/127 [00:02<00:00, 47.16it/s]


Counter({0: 874, 1: 138})


(0.47706422018348627, 0.37681159420289856, 0.65)

In [25]:
# Predict Task 1 labels for the set loaded by `dpm.load_test()` and write them to
# task1.txt, one single-value line per paragraph.
dpm.load_test()
test_texts = dpm.test_set_df.text.tolist()
preds_task1, _ = task1_model.predict(test_texts)
labels2file([[pred] for pred in preds_task1], 'task1.txt')

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
  0%|          | 8/3832 [00:00<07:24,  8.60it/s]
100%|██████████| 479/479 [00:10<00:00, 47.44it/s]


# Rebuild training set (Task 2)

In [None]:
# Build the Task 2 training rows: par_id and label come from `trids`, the paragraph
# text is retrieved from the Task 1 dataframe.
# The par_id -> text lookup is built once (first occurrence wins, like `.values[0]` did)
# instead of scanning the whole dataframe for every paragraph id. An unknown par_id
# raises a KeyError naming the missing id.
text_by_parid = dpm.train_task1_df.drop_duplicates('par_id').set_index('par_id').text
rows2 = [  # will contain par_id, label and text
    {'par_id': parid, 'text': text_by_parid.loc[parid], 'label': label}
    for parid, label in zip(trids.par_id, trids.label)
]
  

In [None]:
trdf2 = pd.DataFrame(rows2)
trdf2

In [None]:
# Parse the stringified label lists (e.g. "[1, 0, 0, 1, 0, 0, 0]") into Python lists.
# Only str values are parsed, so re-running this cell after the column has already been
# converted is a no-op instead of a ValueError raised by literal_eval on a list.
trdf2.label = trdf2.label.apply(lambda raw: literal_eval(raw) if isinstance(raw, str) else raw)

# Rebuild test set (Task 2)

In [None]:
# Build the Task 2 evaluation rows: par_id and label come from `teids`, the paragraph
# text is retrieved from the Task 1 dataframe.
# The par_id -> text lookup is built once (first occurrence wins, like `.values[0]` did)
# instead of scanning the whole dataframe for every paragraph id. An unknown par_id
# raises a KeyError naming the missing id.
text_by_parid = dpm.train_task1_df.drop_duplicates('par_id').set_index('par_id').text
rows2 = [  # will contain par_id, label and text
    {'par_id': parid, 'text': text_by_parid.loc[parid], 'label': label}
    for parid, label in zip(teids.par_id, teids.label)
]
  

In [None]:
tedf2 = pd.DataFrame(rows2)

In [None]:
tedf2

In [None]:
# Parse the stringified label lists into Python lists.
# Only str values are parsed, so re-running this cell after the column has already been
# converted is a no-op instead of a ValueError raised by literal_eval on a list.
tedf2.label = tedf2.label.apply(lambda raw: literal_eval(raw) if isinstance(raw, str) else raw)

# RoBERTa baseline for Task 2

In [None]:
# Split the Task 2 training data by whether any category is set in the label vector,
# then keep every positive row plus half as many negative rows (first ones in order).
label_totals = trdf2.label.apply(lambda onehot: sum(onehot))
all_negs = trdf2[label_totals == 0]
all_pos = trdf2[label_totals > 0]

n_kept_negs = round(len(all_pos)*0.5)
training_set2 = pd.concat([all_pos, all_negs.iloc[:n_kept_negs]])

In [None]:
training_set2

In [None]:
# Training configuration: a single epoch, with no_save / no_cache enabled and the
# output directory allowed to be overwritten.
task2_model_args = MultiLabelClassificationArgs(num_train_epochs=1,
                                                no_save=True, 
                                                no_cache=True, 
                                                overwrite_output_dir=True
                                                )
# Multi-label classifier on the `roberta-base` checkpoint with 7 outputs, matching the
# seven categories in the label map printed when Task 2 was loaded.
task2_model = MultiLabelClassificationModel("roberta", 
                                            'roberta-base', 
                                            num_labels=7,
                                            args = task2_model_args, 
                                            use_cuda=cuda_available)
# train model
task2_model.train_model(training_set2[['text', 'label']])
# run predictions
# NOTE(review): these predictions are made on `tedf2` (rebuilt from the dev par_ids), while
# task1.txt is written from `dpm.test_set_df` — confirm both submission files are meant to
# cover the same split.
preds_task2, _ = task2_model.predict(tedf2.text.tolist())

In [None]:
labels2file(preds_task2, 'task2.txt')

## Prepare submission

In [None]:
!cat task1.txt | head -n 10

In [None]:
!cat task2.txt | head -n 10

In [None]:
!zip submission.zip task1.txt task2.txt