# **Configuration & Installation**

In [1]:
# Config (controls parameters, based on whether running locally or on google colab)
# The `google.colab` package is only importable inside Colab, so a failed
# import is used as the signal that the notebook is running locally.
try:
  import google.colab
  IN_COLAB = True
  path_prefix = ''  # on Colab the data files are read from the working directory
except ImportError:
  # Only a missing package means "not on Colab"; any other error should surface.
  IN_COLAB = False
  path_prefix = 'data/'  # locally the data files are read from ./data/

TRAIN = True # Whether to train the model or not.
# Sourced from https://huggingface.co/models?sort=downloads&search=bert
PRETRAINED_MODELS = ['roberta-base','bert-base-cased', 'cardiffnlp/twitter-roberta-base-mar2022']
EXISTING_MODEL = '' # Path to an existing model checkpoint (used instead of PRETRAINED_MODELS when non-empty).
BATCH_SIZE = 8 # Set lower when using a GPU with less memory.
AUGMENT_DATA = False # Whether to extend the training split with back-translated samples.

In [2]:
# Python libaries
if IN_COLAB:
    !pip install transformers
    !pip install datasets
    !pip install nlpaug
    !pip install sacremoses
    !pip install kaggle

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 4.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.

Collecting nlpaug
  Downloading nlpaug-1.1.10-py3-none-any.whl (410 kB)
[?25l[K     |▉                               | 10 kB 32.2 MB/s eta 0:00:01[K     |█▋                              | 20 kB 23.8 MB/s eta 0:00:01[K     |██▍                             | 30 kB 11.1 MB/s eta 0:00:01[K     |███▏                            | 40 kB 4.3 MB/s eta 0:00:01[K     |████                            | 51 kB 4.2 MB/s eta 0:00:01[K     |████▉                           | 61 kB 5.0 MB/s eta 0:00:01[K     |█████▋                          | 71 kB 5.5 MB/s eta 0:00:01[K     |██████▍                         | 81 kB 5.5 MB/s eta 0:00:01[K     |███████▏                        | 92 kB 6.1 MB/s eta 0:00:01[K     |████████                        | 102 kB 5.1 MB/s eta 0:00:01[K     |████████▊                       | 112 kB 5.1 MB/s eta 0:00:01[K     |█████████▋                      | 122 kB 5.1 MB/s eta 0:00:01[K     |██████████▍                     | 133 kB 5.1 MB/s eta 0:00:01[K 

In [3]:
from datasets import load_dataset, concatenate_datasets, load_metric, Dataset
import numpy as np
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch
import csv
import datetime
from IPython.display import display, HTML
import nltk
import nlpaug.augmenter.word as naw
import os.path
import re

In [4]:
# Connect to google drive
# Only runs on Colab; a later cell copies kaggle.json from the mounted drive.
if IN_COLAB:
    # Imported here because `google.colab` is only importable inside Colab.
    from google.colab import drive
    drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
# Download kaggle data
if IN_COLAB:
    !mkdir ~/.kaggle
    !cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/kaggle.json # Change this to your kaggle.json path
    !chmod 600 ~/.kaggle/kaggle.json
    !kaggle competitions download -c nlp-getting-started
    !unzip -o nlp-getting-started.zip

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 136MB/s]
Archive:  nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# **Initial Data Analysis**

In [9]:
# Load the training CSV, both as a `datasets` object and as a pandas frame
train_csv_path = f'{path_prefix}train.csv'
dataset = load_dataset('csv', data_files=train_csv_path)
df = dataset['train'].to_pandas()

Using custom data configuration default-6ceadb4107aacce8
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-6ceadb4107aacce8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Class balance and range of tweet lengths
cls_0_df = df[df['target'] == 0]
cls_1_df = df[df['target'] == 1]

# All frames share the same columns, so a ratio of `.size` values is a ratio of rows.
total_size = df.size
cls_0_share = cls_0_df.size / total_size
cls_1_share = cls_1_df.size / total_size
print(f'rows in class 0 {cls_0_share}, rows in class 1 {cls_1_share}')

tweet_lengths = df['text'].str.len()
print(f"Tweet length min {tweet_lengths.min()}, max {tweet_lengths.max()} ")

rows in class 0 0.5703402075397347, rows in class 1 0.4296597924602653
Tweet length min 7, max 157 


In [11]:
def share_of_rows_containing(frame, substring):
    """Return the fraction of rows in `frame` whose 'text' contains `substring`.

    `substring` is matched literally (not as a regular expression); rows whose
    text is missing are counted as not containing it.
    """
    matches = frame['text'].str.contains(substring, regex=False, na=False)
    return len(frame[matches]) / len(frame)

# User tags
print(f"User tags are in {share_of_rows_containing(df, '@')} rows")

# URLs
print(f"URLs are in {share_of_rows_containing(df, 'http')} rows")

# Hashtags
print(f"# are in {share_of_rows_containing(df, '#')} rows")

User tags are in 0.26783134112701956 rows
URLs are in 0.5216077761723368 rows
# are in 0.2313148561670826 rows


In [12]:
# Location analysis: share of rows that have a non-null location
has_location = df['location'].notnull()
location_ratio = df[has_location].size / df.size
print(f'Rows with a location {location_ratio}')

Rows with a location 0.6672796532247471


In [13]:
# Keyword analysis - inspection of keyword types by class
has_keyword = df['keyword'].notnull()
is_cls_0 = df['target'] == 0
is_cls_1 = df['target'] == 1

# Share of rows with a keyword, overall and within each class
keyword_ratio = df[has_keyword].size / df.size
cls_0_df = df[has_keyword & is_cls_0]
keyword_cls_0_ratio = cls_0_df.size / df[is_cls_0].size
cls_1_df = df[has_keyword & is_cls_1]
keyword_cls_1_ratio = cls_1_df.size / df[is_cls_1].size

print(f'Rows with a keyword: {keyword_ratio}, in class 0: {keyword_cls_0_ratio}, in class 1: {keyword_cls_1_ratio}')

# Per-class keyword frequencies
keyword_counts_cls_0 = cls_0_df['keyword'].value_counts()
keyword_counts_cls_1 = cls_1_df['keyword'].value_counts()

for cls_label, cls_counts in ((0, keyword_counts_cls_0), (1, keyword_counts_cls_1)):
    print(f'Top 10 keywords in class {cls_label}')
    display(cls_counts.nlargest(10))

# Intersection of keywords between classes 0 and 1
common_keywords = set(keyword_counts_cls_0.index.tolist()) & set(keyword_counts_cls_1.index.tolist())

print(f'class 0 has {keyword_counts_cls_0.size} unique keywords')
print(f'class 1 has {keyword_counts_cls_1.size} unique keywords')
print(f'They have {len(common_keywords)} keywords in common')

Rows with a keyword: 0.9919873899908052, in class 0: 0.9956241363426992, in class 1: 0.9871598899419138
Top 10 keywords in class 0


body%20bags    40
harm           37
armageddon     37
wrecked        36
ruin           36
deluge         36
explode        35
twister        35
fear           35
siren          35
Name: keyword, dtype: int64

Top 10 keywords in class 1


derailment           39
wreckage             39
outbreak             39
debris               37
oil%20spill          37
typhoon              37
evacuated            32
suicide%20bombing    32
rescuers             32
suicide%20bomb       32
Name: keyword, dtype: int64

class 0 has 218 unique keywords
class 1 has 220 unique keywords
They have 217 keywords in common


# **Pre-processing & Augmentation**

In [14]:
def fix_bad_symbols(t):
    """Repair badly scraped character sequences in the string `t`.

    The literal substitutions are applied in the listed order (so an
    '&'-entity is decoded before the '>' and '<' entities). Afterwards every
    remaining U+0089 character is removed together with the two characters
    that follow it (newlines excluded, since '.' does not match them).
    """
    substitutions = (
        ('&amp;', '&'),
        ('&gt;', '>'),
        ('&lt;', '<'),
        ('\x89Ûª', "'"),
    )
    cleaned = t
    for broken, repaired in substitutions:
        cleaned = cleaned.replace(broken, repaired)

    return re.sub('\\x89.{2}', '', cleaned)

# Catalogue of optional text clean-up steps. Each entry pairs a human-readable
# description with a str -> str callable. The first four inspect the start of
# the string, so they are meant to be applied to one whitespace-separated
# token at a time. Entries 0/1 and 2/3 are alternative treatments of the same
# kind of token.
preprocessing_operations = [
    {
        'description': "anonymise users by replacing all '@some_name' with '@user'",
        'operation': lambda t: '@user' if t.startswith('@') and len(t) > 1 else t,
    },
    {
        'description': "omit users entirely",
        'operation': lambda t: '' if t.startswith('@') and len(t) > 1 else t,
    },
    {
        'description': "shorten any URL such as 'https://youtube.com/' to 'http'",
        'operation': lambda t: 'http' if t.startswith('http') else t,
    },
    {
        'description': "omit URLs entirely",
        # A string starting with 'http' is always longer than one character,
        # so no extra length check is needed.
        'operation': lambda t: '' if t.startswith('http') else t,
    },
    {
        'description': "Remove the hashtag symbol '#' (e.g. '#fire' becomes 'fire')",
        'operation': lambda t: t.replace('#',''),
    },
    {
        'description': "Lower case all strings",
        'operation': lambda t: t.lower(),
    },
    {
        'description': "Replace incorrectly scraped characters (e.g '&amp;') and remove meaningless ones (e.g '\x89ÛI')",
        'operation': fix_bad_symbols,
    },
]


def preprocess(row, operations=()):
    """Return a copy of `row` with its 'text' field pre-processed token by token.

    The text is split on single spaces, every callable in `operations` is
    applied to each token in the given order, and the tokens are re-joined
    with single spaces. With the default empty `operations` the text is
    returned unchanged.

    Args:
        row: mapping with at least a 'text' key; all other keys are copied as-is.
        operations: iterable of str -> str callables, e.g. the 'operation'
            entries of `preprocessing_operations`.

    Returns:
        A new dict with the same keys as `row` and the processed 'text'.
    """
    text = row['text']
    if text is None:
        # Missing text previously crashed on `.split`; treat it as empty instead.
        text = ''

    tokens = text.split(" ")
    for operation in operations:
        tokens = [operation(token) for token in tokens]
    return {**row, 'text': " ".join(tokens)}

def cleanup(d_set):
  """Pre-process the text of `d_set`, drop the 'keyword' and 'location'
  columns and rename 'target' to 'label'."""
  preprocessed = d_set.map(preprocess)
  without_metadata = preprocessed.remove_columns(['keyword','location'])
  return without_metadata.rename_column('target','label')

In [15]:
# Hold out 10% of the training data for validation (fixed seed, so the split is repeatable)
train_validation = dataset['train'].train_test_split(test_size=0.1, seed=777)
train_dataset, validation_dataset = train_validation['train'], train_validation['test']

In [16]:
%%time
# Augment data
if AUGMENT_DATA:
  augmentation_rate = 0.2 # how many samples to modify in total
  file_path = f'{path_prefix}train-augmented.csv'
  # Check if we already created an augmented dataset as a cache
  if not os.path.isfile(file_path):

    # https://huggingface.co/facebook/wmt19-en-de
    back_translation_aug = naw.BackTranslationAug(
        from_model_name='facebook/wmt19-en-de',
        to_model_name='facebook/wmt19-de-en',
        device='cuda',
        batch_size=BATCH_SIZE
    )

    to_augment = train_dataset.to_dict()
    to_augment['text'] =  back_translation_aug.augment(train_dataset['text'])
    to_augment['id'] = [100000 + id for id in to_augment['id']]
    extra_dataset = Dataset.from_dict(to_augment)
    extra_dataset.to_csv(file_path)
    
  extra_dataset = load_dataset('csv', data_files=file_path)
  extra_dataset = extra_dataset.filter(lambda r: r['text'] != None) # There's 2 cases where the text is None for some reason
  extra_dataset = extra_dataset['train'].train_test_split(test_size=augmentation_rate, seed=777)['test']
  augmented_dataset = concatenate_datasets([train_dataset, extra_dataset])


else:
  augmented_dataset = concatenate_datasets([train_dataset])

CPU times: user 1.45 ms, sys: 942 µs, total: 2.4 ms
Wall time: 2.07 ms


# **Training**


In [17]:
# Basic classification accuracy metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the accuracy metric.

    The predicted class of each sample is the index of its largest logit
    along the last axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [18]:
# Define model
# `models` and `tokenizers` are parallel lists: entry i of each comes from the same checkpoint.
if not EXISTING_MODEL:
    # Fresh two-class classification heads on top of each pretrained checkpoint.
    models = [
        AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
        for checkpoint in PRETRAINED_MODELS
    ]
    tokenizers = [AutoTokenizer.from_pretrained(checkpoint) for checkpoint in PRETRAINED_MODELS]
else:
    # Resume from a single saved checkpoint.
    models = [AutoModelForSequenceClassification.from_pretrained(EXISTING_MODEL)]
    tokenizers = [AutoTokenizer.from_pretrained(EXISTING_MODEL)]


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-mar2022 were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-mar2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_p

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/346 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
%%time

trainers = []
names = []

for i,model in enumerate(models):
    tokenizer = tokenizers[i]
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    def tokenize(d_set):
        return d_set.map(tokenize_function, batched=True)
    tokenized_train = tokenize(cleanup(augmented_dataset))
    tokenized_validation = tokenize(cleanup(validation_dataset))


    # Hyperparameter Settings
    date_string = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
    model_name = PRETRAINED_MODELS[i].replace('/','-')
    names.append(model_name)
    training_args = TrainingArguments(output_dir=f"models/{model_name}-{date_string}",
                                      evaluation_strategy="epoch",
                                      logging_strategy="epoch",
                                      per_device_train_batch_size=BATCH_SIZE,
                                      per_device_eval_batch_size=BATCH_SIZE,
                                      seed=777)
    
    # Train model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_validation,
        compute_metrics=compute_metrics,
    )
    trainers.append(trainer)
    if TRAIN:
      print(model_name)
      trainer.train()
      print('Done')
    


  0%|          | 0/6851 [00:00<?, ?ex/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/762 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6851
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2571


roberta-base


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6283,0.501481,0.792651
2,0.6599,0.681053,0.569554


Saving model checkpoint to models/roberta-base-2022-05-16T20-40-54/checkpoint-500
Configuration saved in models/roberta-base-2022-05-16T20-40-54/checkpoint-500/config.json
Model weights saved in models/roberta-base-2022-05-16T20-40-54/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
Saving model checkpoint to models/roberta-base-2022-05-16T20-40-54/checkpoint-1000
Configuration saved in models/roberta-base-2022-05-16T20-40-54/checkpoint-1000/config.json
Model weights saved in models/roberta-base-2022-05-16T20-40-54/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to models/roberta-base-2022-05-16T20-40-54/checkpoint-1500
Configuration saved in models/roberta

# **Evaluation & Inspection**

In [None]:
# Used only for debugging and inspection of tokenisation result.
# Adds a 'tokenized' column holding the token strings behind each row's input ids.
tokenized_dataset = tokenize(cleanup(dataset))
tokenized_df = tokenized_dataset['train'].to_pandas()
tokenized_df['tokenized'] = tokenized_df['input_ids'].map(tokenizer.convert_ids_to_tokens)

In [None]:
# Extract incorrect classifications from validation data

# Select which trained model to inspect; later cells reuse these three names.
trainer = trainers[0]
tokenizer = tokenizers[0]
model_name = names[0]

# Use the validation split that was tokenized for THIS trainer's model. The
# `tokenized_validation` left over from the training loop belongs to the last
# model in the list, whose tokenizer may differ from the selected one.
tokenized_validation = trainer.eval_dataset

predictions = trainer.predict(tokenized_validation)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Kept in its own frame so the raw-data `df` from the analysis cells is not overwritten.
validation_df = tokenized_validation.to_pandas()
validation_df['predicted'] = predicted_labels
false_positives = validation_df[(validation_df['predicted'] == 1) & (validation_df['label'] == 0)]
false_negatives = validation_df[(validation_df['predicted'] == 0) & (validation_df['label'] == 1)]

In [None]:
# Evaluate Testing data
# The test CSV has no 'target' column, so only the pre-processing and the
# column drop from `cleanup` are applied (no rename to 'label').
test_csv_path = f'{path_prefix}test.csv'
raw_test_dataset = load_dataset('csv', data_files=test_csv_path)
test_dataset = raw_test_dataset.map(preprocess).remove_columns(['keyword','location'])
tokenized_test_dataset = tokenize(test_dataset)
predictions = trainer.predict(tokenized_test_dataset['train'])

In [None]:
# Save predictions
predicted_labels = np.argmax(predictions.predictions, axis=1)
file_name = 'test_predictions.csv'
# Submission message used by the (commented-out) Kaggle submit command.
message = f'{datetime.datetime.now().isoformat()} - {model_name} submission'

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' end up with an extra blank line per row.
with open(file_name, mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    writer.writerow(['id','target'])
    # One row per test sample: its id and the predicted class.
    writer.writerows(zip(tokenized_test_dataset['train']['id'], predicted_labels))

In [None]:
# Submit to Kaggle
#!kaggle competitions submit -c nlp-getting-started -f $file_name -m "$message"