<a href="https://colab.research.google.com/github/Orenjonas/natural_language_processing_with_disaster_tweets/blob/main/NLP_disaster_tweets_using_torch_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction
Using a RoBERTa model pretrained on Twitter data to classify tweets as relating to real disasters or not.

Model description can be found [here](https://huggingface.co/cardiffnlp/twitter-roberta-base)

TODO:
- More detailed text cleaning.
- Inspect misclassified examples, following this [example](https://colab.research.google.com/github/markwest1972/LSTM-Example-Google-Colaboratory/blob/master/LSTM_IMDB_Sentiment_Example.ipynb#scrollTo=rpCS2-jFH1KY)

# Import data
## Mount google drive to notebook

In [None]:
# Mount Google Drive at /gdrive; the cells below read their input files from
# '/gdrive/My Drive/input'.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import os

# Print the full path of every file below the Drive input folder, as a quick
# check that the mount worked and the expected data is present.
for folder, _, names in os.walk('/gdrive/My Drive/input'):
    for name in names:
        print(os.path.join(folder, name))

/gdrive/My Drive/input/glove.twitter.27B.100d.txt
/gdrive/My Drive/input/glove.twitter.27B.25d.txt
/gdrive/My Drive/input/glove.twitter.27B.50d.txt
/gdrive/My Drive/input/glove.twitter.27B.zip
/gdrive/My Drive/input/nlp-getting-started/test.csv
/gdrive/My Drive/input/nlp-getting-started/sample_submission.csv
/gdrive/My Drive/input/nlp-getting-started/train.csv


In [None]:
# Install the Hugging Face `datasets` package into the runtime and import its loader.
!pip install datasets
from datasets import load_dataset

# Alternative way of reading the competition CSVs through `load_dataset`;
# currently disabled, so `load_dataset` is imported but not used in this cell.
# raw_dataset = load_dataset('csv', data_files='/gdrive/My Drive/input/nlp-getting-started/train.csv')
# raw_competition_test_dataset = load_dataset('csv', data_files='/gdrive/My Drive/input/nlp-getting-started/test.csv')


In [None]:
import pandas as pd
import numpy as np

# Load the labelled training tweets as a pandas DataFrame.
# The following cells (`drop_duplicates(subset=...)`, `clean_tweets`, and
# `Dataset.from_pandas`) all operate on a DataFrame named `raw_dataset`, so it
# has to be defined here for the notebook to run from a fresh kernel.
raw_dataset = pd.read_csv('/gdrive/My Drive/input/nlp-getting-started/train.csv')

In [None]:
# Drop tweets whose text is duplicated, then rebuild a contiguous 0..n-1 index.
# Without the reset, the index keeps gaps where rows were removed: positional
# loops that use `.loc[i]` then hit missing labels, and `Dataset.from_pandas`
# stores the leftover index as an extra column.
raw_dataset = raw_dataset.drop_duplicates(subset=['text']).reset_index(drop=True)

In [None]:
# Install the enchant spell-checking library and its Python binding (pyenchant).
# `clean_tweets` imports `enchant` to look words up in the en_US dictionary.
!sudo apt-get install enchant
!pip install pyenchant

# Clean data

In [None]:

def clean_tweets(df, dictionary=None):
    """
    Normalise the ``text`` column of ``df`` in place and return ``df``.

    Each tweet goes through the following steps, in this order:

    1. whitespace runs are widened so that whole-word patterns can match
       neighbouring tokens,
    2. URLs -> ``http``, mentions -> ``@user``, hashtags -> ``<hashtag> words``,
    3. emoticons -> ``<sadface>``/``<smile>``/``<lolface>``/``<neutralface>``,
       ``<3`` -> ``<heart>``,
    4. all-caps words get an ``<allcaps>`` marker, then the text is lower-cased,
    5. English contractions are expanded,
    6. numbers get a ``<number>`` marker,
    7. punctuation is separated from words and repeated ``!?.`` is collapsed
       into ``<repeat>``,
    8. elongated words ("fuuunny") are shortened to a dictionary word and
       tagged with ``<elong>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. The column is overwritten; any index
        (including one with gaps after dropping rows) is supported.
    dictionary : object, optional
        Spell checker exposing ``check(word) -> bool``. Defaults to
        ``enchant.Dict("en_US")``, created once per call.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text`` replaced by the cleaned text.
        Non-string values (e.g. NaN) are left untouched.
    """
    from itertools import combinations
    import re

    if dictionary is None:
        # Imported lazily so that callers injecting their own dictionary do
        # not need pyenchant. Built once here instead of once per tweet.
        import enchant
        dictionary = enchant.Dict("en_US")

    def sub(pattern, output, string, whole_word=False):
        """
        ``re.sub`` wrapper that pads every replacement with one space on each side.

        ``output`` is either a literal replacement string or a callable taking
        the match object. With ``whole_word=True`` the pattern only matches
        when delimited by whitespace or the string boundaries.
        """
        if whole_word:
            # The non-capturing group is required: several patterns contain a
            # top-level '|', and without the group each alternative would only
            # be anchored on one side.
            pattern = r'(\s|^)(?:' + pattern + r')(\s|$)'

        if isinstance(output, str):
            token = ' ' + output + ' '
        else:
            token = lambda match: ' ' + output(match) + ' '

        return re.sub(pattern, token, string)

    def hashtag(token):
        """Replace hashtag `#` with `<hashtag>` and split following joined words."""
        tag = token.group('tag')
        if tag != tag.upper():
            # CamelCase tags are split on capital letters; all-caps tags are kept whole.
            tag = ' '.join(re.findall('[a-zA-Z][^A-Z]*', tag))

        return '<hashtag> ' + tag

    def punc_repeat(token):
        """Keep one punctuation character and mark that it was repeated."""
        return token.group(0)[0] + " <repeat>"

    def punc_separate(token):
        """Return the punctuation run unchanged; `sub` adds the surrounding spaces."""
        return token.group()

    def number(token):
        """Keep the number and append a `<number>` marker."""
        return token.group() + ' <number>'

    def allcaps(token):
        """Keep the word and append an `<allcaps>` marker."""
        return token.group() + ' <allcaps>'

    # Groups: 1 = preceding characters, 2 = a letter, 3 = its immediate
    # repetitions, 4 = trailing characters (lazy, so normally empty).
    repeated_letter_re = re.compile(r"(\S*?)(\w)(\2{1,})(\S*?)")

    def shorten_elongated(word):
        """
        Try to turn an elongated word into a dictionary word.

        Removes combinations of the repeated-letter runs (largest combinations
        first) and returns ``"<word> <elong>"`` for the first candidate the
        dictionary accepts, or ``word`` unchanged when nothing matches.
        """
        repeated_letters = [match[2] for match in repeated_letter_re.findall(word)]

        for size in range(len(repeated_letters), 0, -1):
            for combination in combinations(repeated_letters, r=size):
                candidate = word
                for letters in combination:
                    candidate = candidate.replace(letters, "", 1)
                    if dictionary.check(candidate):
                        # Stop immediately: removing further letters would
                        # damage the word that was just recognised.
                        return candidate + " <elong>"
        return word

    def clean_repeated_letters(tweet: str):
        """
        Splits a tweet into words and shortens every word that is not in the
        English dictionary but becomes one after removing repeated letters.
        """
        cleaned_tweet = []
        for word in tweet.split():
            if dictionary.check(word):
                cleaned_tweet.append(word)
            else:
                cleaned_tweet.append(shorten_elongated(word))
        return " ".join(cleaned_tweet)

    eyes        = r"[8:=;]"
    nose        = r"['`\-\^]?"
    sad_front   = r"[(\[/\\]+"
    sad_back    = r"[)\]/\\]+"
    smile_front = r"[)\]]+"
    smile_back  = r"[(\[]+"
    lol_front   = r"[DbpP]+"
    lol_back    = r"[d]+"
    neutral     = r"[|]+"
    sadface     = eyes + nose + sad_front   + '|' + sad_back   + nose + eyes
    smile       = eyes + nose + smile_front + '|' + smile_back + nose + eyes
    lolface     = eyes + nose + lol_front   + '|' + lol_back   + nose + eyes
    neutralface = eyes + nose + neutral     + '|' + neutral    + nose + eyes
    # '<' and '>' are omitted to avoid breaking the inserted <token> markers.
    # NOTE(review): '@' is part of this class, so the '@user' placeholder ends
    # up as '@ user' after punctuation separation - confirm this is intended.
    punctuation = r"""[ '!"#$%&'()+,/:;=?@_`{|}~\*\-\.\^\\\[\]]+"""

    # Contraction -> expansion lookup (applied to lower-cased text).
    contractions_dict = {
        "ain't": "are not", "'s": " is", "aren't": "are not",
        "can't": "cannot", "can't've": "cannot have",
        "'cause": "because", "could've": "could have", "couldn't": "could not",
        "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not",
        "don't": "do not", "hadn't": "had not", "hadn't've": "had not have",
        "hasn't": "has not", "haven't": "have not", "he'd": "he would",
        "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have",
        "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
        "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
        "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not",
        "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
        "it'll've": "it will have", "let's": "let us", "ma'am": "madam",
        "mayn't": "may not", "might've": "might have", "mightn't": "might not",
        "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
        "mustn't've": "must not have", "needn't": "need not",
        "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
        "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
        "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
        "she'll": "she will", "she'll've": "she will have", "should've": "should have",
        "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
        "that'd": "that would", "that'd've": "that would have", "there'd": "there would",
        "there'd've": "there would have", "they'd": "they would",
        "they'd've": "they would have", "they'll": "they will",
        "they'll've": "they will have", "they're": "they are", "they've": "they have",
        "to've": "to have", "wasn't": "was not", "we'd": "we would",
        "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
        "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
        "what'll've": "what will have", "what're": "what are", "what've": "what have",
        "when've": "when have", "where'd": "where did", "where've": "where have",
        "who'll": "who will", "who'll've": "who will have", "who've": "who have",
        "why've": "why have", "will've": "will have", "won't": "will not",
        "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
        "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
        "y'all'd've": "you all would have", "y'all're": "you all are",
        "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
        "you'll": "you will", "you'll've": "you will have", "you're": "you are",
        "you've": "you have",
    }

    # Longest keys first, so "can't've" is tried before its prefix "can't";
    # regex alternation takes the first alternative that matches.
    contraction_patterns = []
    for key in sorted(contractions_dict, key=len, reverse=True):
        key_pattern = re.escape(key)
        if key == "'s":
            # Only expand "'s" when it directly follows a letter.
            key_pattern = r"(?<=[a-zA-Z])" + key_pattern
        contraction_patterns.append(key_pattern)
    contractions_re = re.compile('(%s)' % '|'.join(contraction_patterns))

    def expand_contractions(text, contractions_dict=contractions_dict):
        """Replace every known contraction in ``text`` with its expansion."""
        def replace(match):
            return contractions_dict[match.group(1)]
        return contractions_re.sub(replace, text)

    def clean_text(text):
        """Run the full cleaning pipeline on a single tweet."""
        if not isinstance(text, str):
            # Missing values (NaN/None) cannot be cleaned; pass them through.
            return text

        text = sub(r'[\s]+',                             '  ',            text)  # widen gaps so adjacent whole-word matches do not compete for one space
        text = sub(r'(?:(?:https?|ftp)://|www\.)[^\s]+', 'http',          text, True)
        text = sub(r'@\w+',                              '@user',         text, True)
        text = sub(r'#(?P<tag>\w+)',                     hashtag,         text, True)
        text = sub(sadface,                              '<sadface>',     text, True)
        text = sub(smile,                                '<smile>',       text, True)
        text = sub(lolface,                              '<lolface>',     text, True)
        text = sub(neutralface,                          '<neutralface>', text, True)
        text = sub(r'(?:<3+)+',                          '<heart>',       text, True)
        text = sub(r'\b[A-Z]+\b',                        allcaps,         text, True)
        text = text.lower()
        text = expand_contractions(text)
        text = sub(r'[-+]?[.\d]*[\d]+[:,.\d]*',          number,          text, True)
        text = sub(punctuation,                          punc_separate,   text)
        text = sub(r'([!?.])\1+',                        punc_repeat,     text)

        return clean_repeated_letters(text)

    # Column-wise map instead of `for i in range(len(df)): df.loc[i, ...]`:
    # positional integers are not valid labels once rows have been dropped.
    df['text'] = df['text'].map(clean_text)

    return df

In [None]:
# Clean every tweet. `clean_tweets` overwrites the 'text' column of the frame it
# is given and returns that same frame, so `raw_dataset` is modified as well.
cleaned_data = clean_tweets(df = raw_dataset)

In [None]:
%%capture
!pip install datasets
from datasets import Dataset

cleaned_data = Dataset.from_pandas(cleaned_data)

In [None]:
# Spot-check the cleaned text of a single tweet (row 2).

cleaned_data['text'][2]

"all residents asked to ' shelter in place ' are being notified by officers . no other evacuation or shelter in place orders are expected"

# Tokenize data

In [None]:
%%capture

# Install transformers (output hidden by %%capture) and load the tokenizer
# that belongs to the pretrained checkpoint named in MODEL.
!pip install transformers

from transformers import pipeline, AutoTokenizer

# Checkpoint identifier; reused below when the classification model is created.
MODEL = "cardiffnlp/twitter-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Upper bound on tokens per tweet. This tokenizer reports no predefined maximum
# length, so `truncation=True` on its own is ignored (see the warning this cell
# used to emit). 512 is assumed to match the checkpoint's position limit -
# TODO confirm against the model config.
MAX_LEN = 512


def tokenize_function(examples, max_length=MAX_LEN):
    """
    Tokenize a batch of examples from the dataset.

    Parameters
    ----------
    examples : mapping
        Batch as passed by `Dataset.map(..., batched=True)`; its "text" entry
        is handed to the tokenizer.
    max_length : int
        Sequences longer than this many tokens are truncated.

    Returns
    -------
    The tokenizer output for the batch, padded to the longest sequence in
    the batch.
    """
    return tokenizer(examples["text"], padding=True, truncation=True, max_length=max_length)


tokenized_data = cleaned_data.map(tokenize_function, batched=True)

# TODO: adapt code to tokenize separately processed train/test sets:
# tokenized_train_data = processed_train_data.map(tokenize_function, batched=True)
# tokenized_test_data = processed_test_data.map(tokenize_function, batched=True)



  0%|          | 0/8 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Split the labelled data into a DatasetDict with a 'train' part (80%) and a
# held-out 'test' part (20%) that is used for validation.
# The seed is fixed so the split - and every result below - is reproducible.

tokenized_datasets = tokenized_data.train_test_split(test_size=0.2, seed=42)

In [None]:
# Take a shuffled 1000-example subset of each split to keep experiments fast.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ('train', 'test')
)

# Full-size alternatives:
# full_train_dataset =  tokenized_datasets["train"]
# full_eval_dataset =   tokenized_datasets["test"]

# Model

In [None]:
import tensorflow as tf

# Drop the raw "text" column and have the datasets return TensorFlow tensors.
train_tf_dataset = small_train_dataset.remove_columns(["text"]).with_format("tensorflow")
eval_tf_dataset = small_eval_dataset.remove_columns(["text"]).with_format("tensorflow")


In [None]:
# NOTE(review): this cell repeats the two assignments of the previous cell
# verbatim; it can be removed once nothing relies on re-running it.
train_tf_dataset = small_train_dataset.remove_columns(["text"]).with_format("tensorflow")
eval_tf_dataset  = small_eval_dataset.remove_columns(["text"]).with_format("tensorflow")

In [None]:

def sliceDataset_to_batchDataset(dataset, batch_size=8):
    """
    Build a shuffled, batched `tf.data.Dataset` of (features, target) pairs.

    Parameters
    ----------
    dataset
        Dataset formatted for TensorFlow. It must provide every column named
        in `tokenizer.model_input_names` plus a "target" column.
        Assumes each input column supports `.to_tensor()` (i.e. is returned
        as a ragged tensor) - TODO confirm.
    batch_size : int
        Number of examples per batch (default 8).

    Returns
    -------
    tf.data.Dataset
        Shuffled over its full length, then batched.
    """
    features = {name: dataset[name].to_tensor() for name in tokenizer.model_input_names}
    sliced = tf.data.Dataset.from_tensor_slices((features, dataset["target"]))
    # Shuffle buffer equal to the dataset size gives a full shuffle.
    return sliced.shuffle(len(sliced)).batch(batch_size)


train_tf_dataset = sliceDataset_to_batchDataset(train_tf_dataset)
eval_tf_dataset = sliceDataset_to_batchDataset(eval_tf_dataset)

  return np.array(array, copy=False, **self.np_array_kwargs)


1000
1000


In [None]:
from transformers import TFAutoModelForSequenceClassification

# Instantiate a TensorFlow sequence-classification model from the pretrained
# checkpoint, with two output labels. This emits a warning that the
# 'classifier' layers are newly initialized rather than loaded from the checkpoint.
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Compile for fine-tuning: Adam optimizer, sparse categorical cross-entropy on
# integer targets, and accuracy as the reported metric.
# from_logits=True presumably because the model returns raw logits - TODO confirm.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  592130    
Total params: 124,647,170
Trainable params: 124,647,170
Non-trainable params: 0
_________________________________________________________________


In [None]:

# Fine-tune for 3 epochs, evaluating on the held-out subset after each epoch;
# `history` keeps the value returned by `fit` for later inspection.
history = model.fit(train_tf_dataset, validation_data=eval_tf_dataset, epochs=3)

Epoch 1/3
Epoch 2/3


## Model is overfitting
- Try more detailed cleaning.
- Try another pretrained model, e.g. roberta-base.

## Something to try:

In [None]:
# Add dense layer after transformer model (RoBERTa)
# Experimental sketch: feed the concatenated last hidden states of a
# transformer into a small dense classification head.

# Needed here; the class is not imported anywhere earlier in the notebook.
from transformers import TFBertForSequenceClassification

# Import the needed model(Bert, Roberta or DistilBert) with output_hidden_states=True
transformer_model = TFBertForSequenceClassification.from_pretrained('bert-large-cased', output_hidden_states=True)

# Fixed-length inputs: inputs must be padded/truncated to 128 tokens.
input_ids = tf.keras.Input(shape=(128, ),dtype='int32')
attention_mask = tf.keras.Input(shape=(128, ), dtype='int32')

transformer = transformer_model([input_ids, attention_mask])    
hidden_states = transformer[1] # get output_hidden_states

hidden_states_size = 4 # count of the last states 
hiddes_states_ind = list(range(-hidden_states_size, 0, 1))  # [-4, -3, -2, -1]

selected_hiddes_states = tf.keras.layers.concatenate(tuple([hidden_states[i] for i in hiddes_states_ind]))

# Now we can use selected_hiddes_states as we want
# NOTE(review): the hidden states are presumably (batch, sequence, hidden), in
# which case the dense layers below produce one output per token rather than
# per tweet - confirm, and pool over the sequence axis if so.
output = tf.keras.layers.Dense(128, activation='relu')(selected_hiddes_states)
output = tf.keras.layers.Dense(1, activation='sigmoid')(output)
# NOTE: this rebinds `model`, replacing the model trained above.
model = tf.keras.models.Model(inputs = [input_ids, attention_mask], outputs = output)
# `learning_rate=` matches the optimizer call used earlier; `lr=` is the deprecated spelling.
model.compile(tf.keras.optimizers.Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])