# Setup

In [None]:
import os

# Project root used for the whole notebook: every relative path in later cells
# ('data', 'practice splits/...', 'res/...') is resolved against it.
# Assumes Google Drive is already mounted at /content/drive -- TODO confirm.
PROJECT_DIR = '/content/drive/MyDrive/w266 project/dontpatronizeme/semeval-2022'
os.chdir(PROJECT_DIR)
os.getcwd()

'/content/drive/MyDrive/w266 project/dontpatronizeme/semeval-2022'

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import random

from sklearn.metrics import f1_score
import tensorflow as tf
import transformers
from transformers import BertTokenizer, TFBertModel

import logging

# Only show TensorFlow errors so cell outputs stay readable.
tf.get_logger().setLevel(logging.ERROR)

# Seed every random number generator the notebook imports so that weight
# initialisation and batch shuffling start from the same state on each
# Restart & Run All. GPU kernels may still introduce small run-to-run noise.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Last expression of the cell: lists the GPUs TensorFlow can see
# (an empty list means training will run on CPU).
tf.config.list_physical_devices('GPU')

In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
    """Write predictions to ``outf_path``, one comma-separated row per line.

    p: iterable of rows; every item of a row is converted with ``str`` and
       the items are joined with ','.
    outf_path: destination path; the file is created or overwritten.
    """
    with open(outf_path, 'w') as outf:
        outf.writelines(','.join(map(str, row)) + '\n' for row in p)

# Data

In [None]:
# Project-local loader module (not a pip package); it must be importable from
# the working directory set in the Setup cell.
from dont_patronize_me import DontPatronizeMe
# Presumably (data directory, test-set file) -- TODO confirm against the
# DontPatronizeMe constructor.
dpm = DontPatronizeMe('data', 'TEST/task4_test.tsv')
# Later cells read dpm.train_task1_df (paragraph text + binary label);
# presumably populated by this call.
dpm.load_task1()
# Task 2 labels requested as one-hot vectors over the 7 categories.
dpm.load_task2(return_one_hot=True)
# Later cells read dpm.test_set_df for the final submission; presumably
# populated by this call.
dpm.load_test()

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


In [None]:
# Paragraph ids (+ task-2 label strings) of the practice train / dev splits.
trids = pd.read_csv('practice splits/train_semeval_parids-labels.csv')
teids = pd.read_csv('practice splits/dev_semeval_parids-labels.csv')

# Cast the ids to str so they can be compared with the par_id column of the
# dataset frames in the following cells, then report each split's size.
for split_ids in (trids, teids):
    split_ids['par_id'] = split_ids['par_id'].astype(str)
for split_ids in (trids, teids):
    print(split_ids.shape)

(8375, 2)
(2094, 2)


In [None]:
def _rebuild_task1_split(ids_df, source_df):
    """Return a frame with columns par_id, text, label for the ids in ``ids_df``.

    ids_df: frame with a ``par_id`` column; its order is preserved.
    source_df: frame with ``par_id``, ``text`` and ``label`` columns.

    When ``source_df`` holds several rows for one par_id the first one is
    used. An id that is missing from ``source_df`` raises KeyError.
    """
    # Index the source once instead of scanning the whole frame twice per id.
    first_rows = (source_df
                  .drop_duplicates(subset='par_id', keep='first')
                  .set_index('par_id'))
    par_ids = ids_df.par_id.tolist()
    return pd.DataFrame({
        'par_id': par_ids,
        'text': first_rows.loc[par_ids, 'text'].to_numpy(),
        'label': first_rows.loc[par_ids, 'label'].to_numpy(),
    })


# Rebuild train set for Task 1 (par_id, text, binary label)
trdf1 = _rebuild_task1_split(trids, dpm.train_task1_df)

# Rebuild the held-out dev set for Task 1. Its ids come from the dev split
# file, but the rows are looked up in the same labelled frame as the train set.
tedf1 = _rebuild_task1_split(teids, dpm.train_task1_df)

# Downsample negative instances: keep every positive paragraph and only the
# first 2 * npos negatives (in split order, no shuffling).
pcldf = trdf1[trdf1.label == 1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf, trdf1[trdf1.label == 0][:npos * 2]])
training_set1

Unnamed: 0,par_id,text,label
0,4341,"The scheme saw an estimated 150,000 children f...",1
1,4136,Durban 's homeless communities reconciliation ...,1
2,10352,The next immediate problem that cropped up was...,1
3,8279,Far more important than the implications for t...,1
4,1164,To strengthen child-sensitive social protectio...,1
...,...,...,...
2377,1775,Last but not the least element of culpability ...,0
2378,1776,"Then , taking the art of counter-intuitive non...",0
2379,1777,Kagunga village was reported to lack necessary...,0
2380,1778,"""After her parents high-profile divorce after ...",0


In [None]:
def _rebuild_task2_split(ids_df, source_df):
    """Return a frame with columns par_id, text, label for the ids in ``ids_df``.

    ids_df: frame with ``par_id`` and ``label`` columns, where ``label`` is the
        string form of a Python list (e.g. "[1, 0, 0, 1, 0, 0, 0]"); it is
        parsed with ``ast.literal_eval``. Row order is preserved.
    source_df: frame with ``par_id`` and ``text`` columns.

    When ``source_df`` holds several rows for one par_id the first one is
    used. An id that is missing from ``source_df`` raises KeyError.
    """
    # Index the source once instead of scanning the whole frame per id.
    text_by_id = (source_df
                  .drop_duplicates(subset='par_id', keep='first')
                  .set_index('par_id')['text'])
    par_ids = ids_df.par_id.tolist()
    split_df = pd.DataFrame({
        'par_id': par_ids,
        'text': text_by_id.loc[par_ids].to_numpy(),
        'label': ids_df.label.to_numpy(),
    })
    split_df['label'] = split_df['label'].apply(ast.literal_eval)
    return split_df


# Rebuild train set for task 2 (par_id, text, multi-label vector)
trdf2 = _rebuild_task2_split(trids, dpm.train_task1_df)

# Rebuild the held-out dev set for task 2
tedf2 = _rebuild_task2_split(teids, dpm.train_task1_df)

# Downsample: keep every paragraph with at least one active category and only
# the first len(all_pos) * 0.5 all-zero paragraphs (in split order).
all_negs = trdf2[trdf2.label.apply(lambda x: sum(x) == 0)]
all_pos = trdf2[trdf2.label.apply(lambda x: sum(x) > 0)]

training_set2 = pd.concat([all_pos, all_negs[:round(len(all_pos) * 0.5)]])
training_set2

Unnamed: 0,par_id,text,label
0,4341,"The scheme saw an estimated 150,000 children f...","[1, 0, 0, 1, 0, 0, 0]"
1,4136,Durban 's homeless communities reconciliation ...,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,The next immediate problem that cropped up was...,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,Far more important than the implications for t...,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,To strengthen child-sensitive social protectio...,"[1, 0, 0, 1, 1, 1, 0]"
...,...,...,...
1186,434,""""""" I was absolutely useless at school , hopel...","[0, 0, 0, 0, 0, 0, 0]"
1187,435,I also noticed the change in socio-economic le...,"[0, 0, 0, 0, 0, 0, 0]"
1188,436,"Can Donald Trump win ? It 's possible , but ce...","[0, 0, 0, 0, 0, 0, 0]"
1189,437,He added that any introduction of new law must...,"[0, 0, 0, 0, 0, 0, 0]"


# Modeling

In [None]:
# Tokenizer and encoder are loaded from the same checkpoint name.
BERT_CHECKPOINT = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
max_length = 150

# Same tokenizer settings for both sets: every text is truncated / padded to
# exactly max_length tokens and returned as TensorFlow tensors.
tokenizer_options = dict(max_length=max_length,
                         truncation=True,
                         padding='max_length',
                         return_tensors='tf')

# Downsampled training set
train_texts = list(map(str, training_set1['text'].values))
x_train = tokenizer(train_texts, **tokenizer_options)
y_train = training_set1['label'].values

# Full (not downsampled) dev set
dev_texts = list(map(str, tedf1['text'].values))
x_test = tokenizer(dev_texts, **tokenizer_options)
y_test = tedf1['label'].values

In [None]:
def create_classification_model(hidden_size = 200, 
                                train_layers = -1, 
                                optimizer=None,
                                hidden_activation=None):
    """
    Build and compile a simple binary classifier on top of the module-level
    `bert_model` (no dropout, layer norm, etc.).

    The first-position vector of BERT's first output goes through two Dense
    layers of `hidden_size` units and a single sigmoid unit. The input length
    is taken from the module-level `max_length`.

    Args:
        hidden_size: number of units in each of the two hidden Dense layers.
        train_layers: -1 leaves all BERT weights untouched. Any other value n
            flags as non-trainable every BERT weight whose name does not
            contain one of the codes '_11', '_10', ... (n codes, counting
            down from 11); 0 therefore flags every BERT weight.
        optimizer: Keras optimizer instance. When None, a new Adam() with
            default settings is created on each call, so optimizer state is
            never shared between models.
        hidden_activation: activation of both hidden Dense layers. The default
            None keeps them linear, which matches the previous behaviour
            (note that two stacked linear layers are equivalent to one).

    Returns:
        A compiled tf.keras.Model taking [input_ids, token_type_ids,
        attention_mask] and returning one probability per example.
    """
    # A default such as `optimizer=Adam()` in the signature would be built
    # once at definition time and reused by every model created afterwards.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam()

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # Restrict training to the `train_layers` outer transformer layers.
    if train_layers != -1:
        # Name fragments of the layers to keep trainable: '_11', '_10', ...
        # The hard-coded 11 assumes a 12-layer encoder -- TODO confirm if the
        # checkpoint ever changes.
        retrain_layers = ['_' + str(11 - retrain_layer_number)
                          for retrain_layer_number in range(train_layers)]

        for w in bert_model.weights:
            if not any(layer_code in w.name for layer_code in retrain_layers):
                # NOTE(review): this writes a private attribute of the
                # variable; whether Keras honours it may depend on the
                # TF/Keras version. Verify the trainable-parameter count
                # with classification_model.summary().
                w._trainable = False

    bert_out = bert_model(bert_inputs)

    # Keep only the vector at sequence position 0 of the first BERT output.
    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(bert_out[0])

    hidden1 = tf.keras.layers.Dense(hidden_size, activation=hidden_activation,
                                    name='hidden_layer_1')(classification_token)
    hidden2 = tf.keras.layers.Dense(hidden_size, activation=hidden_activation,
                                    name='hidden_layer_2')(hidden1)

    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden2)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])

    # The output layer already applies a sigmoid, hence from_logits=False.
    classification_model.compile(optimizer=optimizer,
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                                 metrics=['accuracy'])

    return classification_model

In [None]:
# Drop references to models left over from a previous run of this cell so the
# session can be cleared before fresh ones are built. Only a missing name
# (first run) is ignored; any other error is allowed to surface.
try:
    del classification_model
except NameError:
    pass

try:
    del bert_model
except NameError:
    pass

tf.keras.backend.clear_session()

# Reload the pretrained encoder so fine-tuning starts from the original
# checkpoint weights rather than from a previously fine-tuned copy.
bert_model = TFBertModel.from_pretrained('bert-base-cased')

# train_layers=-1: no BERT weights are flagged as non-trainable.
classification_model = create_classification_model(optimizer=tf.keras.optimizers.Adam(0.00005),
                                                   train_layers=-1)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Model inputs in the order the model was built with:
# input ids, token type ids, attention mask.
train_inputs = [x_train.input_ids, x_train.token_type_ids, x_train.attention_mask]

# Validation on the dev set is left disabled; to enable it, pass
#   validation_data=([x_test.input_ids, x_test.token_type_ids, x_test.attention_mask], y_test)
classification_model.fit(train_inputs,
                         y_train,
                         epochs=3,
                         batch_size=8)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f9c098f2310>

In [None]:
dev_inputs = [x_test.input_ids, x_test.token_type_ids, x_test.attention_mask]

# One sigmoid output per dev paragraph.
y_predict = classification_model.predict(dev_inputs, batch_size=8, verbose=1)

# Turn probabilities into hard labels: strictly above 0.5 -> 1, otherwise 0.
y_predict = (y_predict[:, 0] > 0.5).astype(int).tolist()

f1_score(y_test, y_predict)

0.5008944543828264

# Error Analysis

# Evaluation

In [None]:
# output
labels2file([[y] for y in y_predict], os.path.join('res/task1.txt'))

# Evaluate
!python3 evaluation.py . .
!cat scores.txt

task1_precision:0.3888888888888889
task1_recall:0.7035175879396985
task1_f1:0.5008944543828264
task2_unb:1.0
task2_sha:1.0
task2_pre:1.0
task2_aut:1.0
task2_met:1.0
task2_com:1.0
task2_the:1.0
task2_avg:1.0


# TEST submission

In [None]:
# Task 1: predict the official test set with the fine-tuned model.
x_test_s = tokenizer([str(x) for x in dpm.test_set_df['text'].values], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

# NOTE: from here on y_predict holds TEST-set predictions (it replaces the dev
# predictions), and res/task1.txt is overwritten accordingly. Do not re-run
# the dev evaluation cell after this one without recomputing the dev labels.
y_predict = classification_model.predict([x_test_s.input_ids, x_test_s.token_type_ids, x_test_s.attention_mask], 
                                         batch_size=8, verbose=1)
# Probability strictly above 0.5 -> label 1, otherwise 0.
y_predict = [1 if i[0]>0.5 else 0 for i in y_predict]

# Output: one prediction per line.
os.makedirs('res', exist_ok=True)
labels2file([[y] for y in y_predict], os.path.join('res', 'task1.txt'))



In [None]:
# Task 2
# Random baseline (7 random 0/1 values per test paragraph), currently disabled.
# NOTE(review): the submission cell still reads res/task2.txt, so with these
# lines commented out it depends on a file left behind by an earlier run.
# The draw is unseeded: re-enabling it produces a different file on every run.
#res = [[random.choice([0,1]) for k in range(7)] for k in range(0,dpm.test_set_df.shape[0])]
#labels2file(res, os.path.join('res/task2.txt'))

In [None]:
# load test data
# predict & output (task1 with model, task2 with random)

os.chdir('res')

!cat task1.txt | head -n 3
!cat task2.txt | head -n 3
!zip submission.zip task1.txt task2.txt

os.chdir('..')
#os.chdir('/content/drive/MyDrive/w266 project/dontpatronizeme/semeval-2022')

0
1
0
1,0,1,1,1,1,1
1,1,0,1,0,0,1
0,1,0,0,1,1,1
  adding: task1.txt (deflated 92%)
  adding: task2.txt (deflated 87%)
