# Final Project: Disaster Tweet Classification

In [1]:
__author__ = "Kevin Guo, Pranav Sriram, Raymond Yao"
__version__ = "CS224u, Stanford, Spring 2021"

## Data Pre-Processing

In [2]:
import numpy as np
import pandas as pd
import re
from transformers import BertModel, BertTokenizer
import utils
import torch
import torch.nn as nn
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
import datasets
from datasets import Dataset
from datasets import load_dataset
from datasets import load_metric
from transformers import TrainingArguments
from transformers import Trainer
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm


In [3]:
# Set random seeds
utils.fix_random_seeds()

In [4]:
df = pd.read_csv('tweets_mod_copy.csv')
df.head(10)

Unnamed: 0,text,labels
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,Telangana: Section 144 has been imposed in Bha...,1
2,Arsonist sets cars ablaze at dealership https:...,1
3,Arsonist sets cars ablaze at dealership https:...,1
4,"""Lord Jesus, your love brings freedom and pard...",0
5,"If this child was Chinese, this tweet would ha...",0
6,Several houses have been set ablaze in Ngemsib...,1
7,Asansol: A BJP office in Salanpur village was ...,1
8,"National Security Minister, Kan Dapaah's side ...",0
9,This creature who’s soul is no longer clarent ...,0


In [5]:
# Remove URLs
df['text'] = df['text'].apply(lambda x: re.sub(r'https?\S+', '', x))
df.head(10)

Unnamed: 0,text,labels
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,Telangana: Section 144 has been imposed in Bha...,1
2,Arsonist sets cars ablaze at dealership,1
3,Arsonist sets cars ablaze at dealership,1
4,"""Lord Jesus, your love brings freedom and pard...",0
5,"If this child was Chinese, this tweet would ha...",0
6,Several houses have been set ablaze in Ngemsib...,1
7,Asansol: A BJP office in Salanpur village was ...,1
8,"National Security Minister, Kan Dapaah's side ...",0
9,This creature who’s soul is no longer clarent ...,0


In [6]:
# Split into train, dev, and test sets using a 80-10-10 ratio
# train = df 
# dev = df
# test = df
train, dev, test = np.split(df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

In [7]:
# train_dataset = load_dataset('csv', data_files = 'tweets.csv')
# dev_dataset = load_dataset('csv', data_files = 'tweets.csv')
# test_dataset = load_dataset('csv', data_files = 'tweets.csv')

train_dataset = Dataset.from_pandas(train)
dev_dataset = Dataset.from_pandas(dev)
test_dataset = Dataset.from_pandas(test)
# dataset = Dataset.from_pandas(df)

In [8]:
from transformers import AutoModelForSequenceClassification
# weights_name = 'bert-base-cased'
# tokenizer = BertTokenizer.from_pretrained(weights_name)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
# x_train = train_dataset['text']
# y_train = train_dataset['target']

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [9]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dev_dataset = dev_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["__index_level_0__"])

tokenized_train_dataset.set_format("torch")

tokenized_dev_dataset.set_format("torch")

tokenized_test_dataset.set_format("torch")

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [10]:
training_args = TrainingArguments("test_trainer")

metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_dev_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()

Step,Training Loss
500,0.3652
1000,0.3164
1500,0.2491
2000,0.2259
2500,0.163
3000,0.1133


TrainOutput(global_step=3411, training_loss=0.22191675007780148, metrics={'train_runtime': 5007.9302, 'train_samples_per_second': 0.681, 'total_flos': 9079642126172160, 'epoch': 3.0})

In [13]:
trainer.evaluate()

{'eval_loss': 0.4116906523704529,
 'eval_f1': 0.7634660421545667,
 'eval_runtime': 71.9667,
 'eval_samples_per_second': 15.799,
 'epoch': 3.0}

In [14]:
trainer.save_model()

In [26]:
finetuned_model = BertModel.from_pretrained('test_trainer')

In [126]:
def bert_phi(text):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    X = torch.tensor([input_ids])
    with torch.no_grad():
        reps = finetuned_model(X)
        return reps.last_hidden_state.squeeze(0).numpy()

In [127]:
def bert_classifier_phi(text):
    reps = bert_phi(text)
    #return reps.mean(axis=0)  # Another good, easy option.
    return reps[0]

In [128]:
train_table = pd.read_table('2013_Queensland_Floods_train.tsv')
train_table['text'] = train_table['text'].apply(lambda x: re.sub(r'https?\S+', '', x))

dev_table = pd.read_table('2013_Queensland_Floods_dev.tsv')
dev_table['text'] = dev_table['text'].apply(lambda x: re.sub(r'https?\S+', '', x))

test_table = pd.read_table('2013_Queensland_Floods_test.tsv')
test_table['text'] = test_table['text'].apply(lambda x: re.sub(r'https?\S+', '', x))


train = train_table 
dev = dev_table 
test = test_table

In [129]:
print(train)
print(dev)
print(test)

                                                   text  label
0     I just though about the night I went clubbing ...      0
1     Looks like its going to be another long night ...      0
2     @LaniiBanani hahahaha I just told him id have ...      0
3     Off to meeting.... with so called... Baaps of ...      0
4              Doubt I'll be getting much sleep tonight      0
...                                                 ...    ...
6014  RT @GrillTeam: The Queensland government has s...      1
6015  Can we have 5 NEMA staff from Nigeria  come to...      1
6016  RT @7NewsBrisbane: Foam from rough waves at Al...      1
6017  RT @abcsouthqld: Master Electricians Australia...      1
6018  RT @HomeLoanKing: Leader of Aussie opposition,...      1

[6019 rows x 2 columns]
                                                   text  label
0     Fuck It.. Chelsea should have been all over Br...      0
1     Hey Dana Does @Alistairovereem gets the title ...      0
2     @mimstacey @janecaro gam

In [130]:
X_str_train = train.text.values
print(len(X_str_train))
y_train = train.label.values

X_str_dev = dev.text.values
print(len(X_str_dev))
y_dev = dev.label.values

6019
1003


In [131]:
%time X_train = [bert_classifier_phi(text) for text in X_str_train]

CPU times: user 15min 36s, sys: 887 ms, total: 15min 37s
Wall time: 7min 48s


In [132]:
%time X_dev = [bert_classifier_phi(text) for text in X_str_dev]

CPU times: user 2min 36s, sys: 135 ms, total: 2min 36s
Wall time: 1min 18s


In [133]:
model = TorchShallowNeuralClassifier(
    early_stopping=True,
    hidden_dim=300,
    )

In [134]:
%time _ = model.fit(X_train, y_train)

Stopping after epoch 47. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 0.8856246322393417

CPU times: user 4.38 s, sys: 119 ms, total: 4.5 s
Wall time: 3.81 s


In [135]:
preds = model.predict(X_dev)

In [136]:
from sklearn.metrics import classification_report
print(classification_report(y_dev, preds, digits=3))

              precision    recall  f1-score   support

           0      0.924     0.926     0.925       462
           1      0.937     0.935     0.936       541

    accuracy                          0.931      1003
   macro avg      0.931     0.931     0.931      1003
weighted avg      0.931     0.931     0.931      1003



In [137]:
second_data = pd.read_csv('tweets_mod_copy.csv')
second_data['text'] = second_data['text'].apply(lambda x: re.sub(r'https?\S+', '', x))
second_train, second_dev, second_test = np.split(second_data.sample(frac=1, random_state=42), [int(.8*len(second_data)), int(.9*len(second_data))])

In [138]:
print(second_train)

                                                    text  labels
3495   How many illegal buildings should be demolishe...       0
5461                     Who’s fatality is this tho ????       0
9794   #OnThisDay 2018 Chinese state media confirmed ...       1
11105  With any luck you will miss the windstorm on e...       0
1803   Inferno on Black Friday 1939: 71 deaths, 3,700...       1
...                                                  ...     ...
2196   go ahead and make a playlist with your name. g...       0
8561   Ruckelshaus, Sweeney and DDT – rescued from th...       0
11236  😂We learned a long time ago why all major bank...       0
4285   5,000 feral camels culled in drought-hit Austr...       1
8569   Another rescued mumma koala with her little ne...       1

[9096 rows x 2 columns]


In [139]:
second_X_str_train = second_train.text.values
print(len(second_X_str_train))
second_y_train = second_train.labels.values

second_X_str_dev = second_dev.text.values
print(len(second_X_str_dev))
second_y_dev = second_dev.labels.values

9096
1137


In [140]:
%time second_X_train = [bert_classifier_phi(text) for text in second_X_str_train]

CPU times: user 23min 56s, sys: 1.44 s, total: 23min 57s
Wall time: 11min 58s


In [141]:
%time second_X_dev = [bert_classifier_phi(text) for text in second_X_str_dev]

CPU times: user 2min 58s, sys: 259 ms, total: 2min 58s
Wall time: 1min 29s


In [142]:
%time _ = model.fit(second_X_train, second_y_train)

Stopping after epoch 17. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 0.3091777227818966

CPU times: user 4.01 s, sys: 122 ms, total: 4.13 s
Wall time: 3.57 s


In [143]:
second_preds = model.predict(second_X_dev)

In [144]:
print(classification_report(second_y_dev, second_preds, digits=3))

              precision    recall  f1-score   support

           0      0.941     0.949     0.945       920
           1      0.775     0.747     0.761       217

    accuracy                          0.910      1137
   macro avg      0.858     0.848     0.853      1137
weighted avg      0.909     0.910     0.910      1137



In [145]:
all_data = pd.read_csv('real_and_fake_news.csv')
all_data['text'] = all_data['text'].apply(lambda x: re.sub(r'https?\S+', '', x))
relevant_data, unneeded = np.split(all_data.sample(frac=1, random_state=42), [10000])
train, dev, test = np.split(relevant_data, [int(.8*len(relevant_data)), int(.9*len(relevant_data))])

In [146]:
print(train)
print(dev)
print(test)

                                                    text  label
30464   Obama Announces ‘Unfinished Business’ For 201...      0
41926  IT BEGINS….OBAMA APPOINTED JUDGE Rules Trump U...      0
8877   White House says Obama will not discuss FBI pr...      1
3798   Treasury unit to share records with Senate for...      1
39671  HOLLYWOOD LIBS HAVE EPIC MELT DOWNS…Threaten P...      0
...                                                  ...    ...
22636   Republicans Dine With Trump, Then Try To Rail...      0
2622   Trump Jr., Manafort agree to negotiate over in...      1
6334   After 2016 campaign, more Americans consider R...      1
1801   US Senate may vote this week to add penalties ...      1
20427  FPL shuts one reactor in Florida, reduces powe...      1

[8000 rows x 2 columns]
                                                    text  label
34238  WILL HILLARY ATTEND? ‘CLOWN LIVES MATTER’ Rall...      0
13316  Bosnian Croat war criminal Praljak killed hims...      1
14057  U.S. all

In [147]:
X_str_train = train.text.values
print(len(X_str_train))
y_train = train.label.values

X_str_dev = dev.text.values
print(len(X_str_dev))
y_dev = dev.label.values

8000
1000


In [148]:
%time X_train = [bert_classifier_phi(text) for text in X_str_train]

CPU times: user 19min 48s, sys: 1.42 s, total: 19min 50s
Wall time: 9min 55s


In [149]:
%time X_dev = [bert_classifier_phi(text) for text in X_str_dev]

CPU times: user 2min 30s, sys: 175 ms, total: 2min 30s
Wall time: 1min 15s


In [150]:
model = TorchShallowNeuralClassifier(
    early_stopping=True,
    hidden_dim=300,
    hidden_activation=nn.LeakyReLU())

In [151]:
%time _ = model.fit(X_train, y_train)

Stopping after epoch 40. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 0.25402649119496346

CPU times: user 4.95 s, sys: 108 ms, total: 5.06 s
Wall time: 4.36 s


In [152]:
preds = model.predict(X_dev)

In [153]:
from sklearn.metrics import classification_report
print(classification_report(y_dev, preds, digits=3))

              precision    recall  f1-score   support

           0      0.988     0.988     0.988       514
           1      0.988     0.988     0.988       486

    accuracy                          0.988      1000
   macro avg      0.988     0.988     0.988      1000
weighted avg      0.988     0.988     0.988      1000



In [154]:
second_data = pd.read_csv('tweets_mod_copy.csv')
second_data['text'] = second_data['text'].apply(lambda x: re.sub(r'https?\S+', '', x))
second_train, second_dev, second_test = np.split(second_data.sample(frac=1, random_state=42), [int(.8*len(second_data)), int(.9*len(second_data))])

In [155]:
print(second_train)

                                                    text  labels
3495   How many illegal buildings should be demolishe...       0
5461                     Who’s fatality is this tho ????       0
9794   #OnThisDay 2018 Chinese state media confirmed ...       1
11105  With any luck you will miss the windstorm on e...       0
1803   Inferno on Black Friday 1939: 71 deaths, 3,700...       1
...                                                  ...     ...
2196   go ahead and make a playlist with your name. g...       0
8561   Ruckelshaus, Sweeney and DDT – rescued from th...       0
11236  😂We learned a long time ago why all major bank...       0
4285   5,000 feral camels culled in drought-hit Austr...       1
8569   Another rescued mumma koala with her little ne...       1

[9096 rows x 2 columns]


In [156]:
second_X_str_train = second_train.text.values
print(len(second_X_str_train))
second_y_train = second_train.labels.values

second_X_str_dev = second_dev.text.values
print(len(second_X_str_dev))
second_y_dev = second_dev.labels.values

9096
1137


In [157]:
%time second_X_train = [bert_classifier_phi(text) for text in second_X_str_train]

CPU times: user 23min 44s, sys: 1.69 s, total: 23min 46s
Wall time: 11min 53s


In [158]:
%time second_X_dev = [bert_classifier_phi(text) for text in second_X_str_dev]

CPU times: user 2min 58s, sys: 196 ms, total: 2min 58s
Wall time: 1min 29s


In [159]:
print(classification_report(second_y_dev, second_preds, digits=3))

              precision    recall  f1-score   support

           0      0.941     0.949     0.945       920
           1      0.775     0.747     0.761       217

    accuracy                          0.910      1137
   macro avg      0.858     0.848     0.853      1137
weighted avg      0.909     0.910     0.910      1137

