# Final Project: Disaster Tweet Classification

In [1]:
# Notebook metadata: project authors and the course/term this was written for.
__author__ = "Kevin Guo, Pranav Sriram, Raymond Yao"
__version__ = "CS224u, Stanford, Spring 2021"

## Data Pre-Processing

In [2]:
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from datasets import load_dataset
#from datasets import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import BertModel, BertTokenizer
from transformers import TrainingArguments
from transformers import Trainer
from transformers import get_scheduler

import utils
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier

In [3]:
# Fix random seeds up front so repeated runs of the notebook are comparable.
# NOTE(review): which RNGs get seeded (Python / NumPy / PyTorch) and with what
# value is decided inside utils.fix_random_seeds — confirm there.
utils.fix_random_seeds()

In [4]:
# Pretrained checkpoint name; passed to both the tokenizer and the encoder loaders below.
weights_name = 'bert-base-cased'

In [5]:
# Tokenizer for the `weights_name` checkpoint; bert_phi uses it to turn text into token ids.
bert_tokenizer = BertTokenizer.from_pretrained(weights_name)

In [6]:
# Pretrained BERT encoder for the same checkpoint. In the cells visible here it is
# only called under torch.no_grad() (see bert_phi), i.e. used as a feature extractor.
bert_model = BertModel.from_pretrained(weights_name)

In [7]:
def bert_phi(text):
    """Encode one string with BERT and return its per-token representations.

    Parameters
    ----------
    text : str
        Raw text to encode.

    Returns
    -------
    np.ndarray
        Final-layer hidden states, one row per token (special tokens
        included) with the batch dimension removed.

    Notes
    -----
    Inputs longer than the tokenizer's maximum length are truncated, so an
    unusually long string cannot overflow the model's position embeddings.
    Short inputs are encoded exactly as before.
    """
    input_ids = bert_tokenizer.encode(
        text, add_special_tokens=True, truncation=True)
    # The model expects a batch dimension, so wrap the single example.
    X = torch.tensor([input_ids])
    # Pure feature extraction: no gradients are needed.
    with torch.no_grad():
        reps = bert_model(X)
    # Drop the batch dimension again before converting to NumPy.
    return reps.last_hidden_state.squeeze(0).numpy()

In [8]:
def bert_classifier_phi(text):
    """Return a single fixed-size feature vector for `text`.

    The vector is the first row of the per-token matrix produced by
    `bert_phi`, i.e. the representation of the first token in the encoded
    sequence. Averaging all rows (`token_reps.mean(axis=0)`) is a simple
    alternative pooling strategy.
    """
    token_reps = bert_phi(text)
    first_token_rep = token_reps[0]
    return first_token_rep

In [21]:
# Matches "http"/"https" followed by any run of non-whitespace characters.
URL_PATTERN = re.compile(r'https?\S+')


def load_tweet_table(path):
    """Read a tab-separated tweet file and strip URLs from its `text` column.

    Parameters
    ----------
    path : str
        Location of the TSV file; it must contain a `text` column.

    Returns
    -------
    pd.DataFrame
        The table as read, with every URL-like span removed from `text`.
    """
    table = pd.read_table(path)
    table['text'] = table['text'].apply(lambda tweet: URL_PATTERN.sub('', tweet))
    return table


train_table = load_tweet_table('2013_Queensland_Floods_train.tsv')
dev_table = load_tweet_table('2013_Queensland_Floods_dev.tsv')
test_table = load_tweet_table('2013_Queensland_Floods_test.tsv')

# Short aliases used by the following cells; they refer to the same frames.
train = train_table
dev = dev_table
test = test_table

In [22]:
# Dump each split in turn: train, then dev, then test.
for split_frame in (train, dev, test):
    print(split_frame)

                                                   text  label
0     I just though about the night I went clubbing ...      0
1     Looks like its going to be another long night ...      0
2     @LaniiBanani hahahaha I just told him id have ...      0
3     Off to meeting.... with so called... Baaps of ...      0
4              Doubt I'll be getting much sleep tonight      0
...                                                 ...    ...
6014  RT @GrillTeam: The Queensland government has s...      1
6015  Can we have 5 NEMA staff from Nigeria  come to...      1
6016  RT @7NewsBrisbane: Foam from rough waves at Al...      1
6017  RT @abcsouthqld: Master Electricians Australia...      1
6018  RT @HomeLoanKing: Leader of Aussie opposition,...      1

[6019 rows x 2 columns]
                                                   text  label
0     Fuck It.. Chelsea should have been all over Br...      0
1     Hey Dana Does @Alistairovereem gets the title ...      0
2     @mimstacey @janecaro gam

In [23]:
# Raw tweet strings for each split, echoing the split sizes.
X_str_train = train['text'].values
X_str_dev = dev['text'].values
print(len(X_str_train))
print(len(X_str_dev))

# Gold labels aligned row-for-row with the strings above.
y_train = train['label'].values
y_dev = dev['label'].values

6019
1003


In [24]:
%time X_train = [bert_classifier_phi(text) for text in X_str_train]

CPU times: user 16min 22s, sys: 7.09 s, total: 16min 29s
Wall time: 16min 23s


In [25]:
%time X_dev = [bert_classifier_phi(text) for text in X_str_dev]

CPU times: user 2min 44s, sys: 1.05 s, total: 2min 45s
Wall time: 2min 44s


In [26]:
# Shallow classifier trained on top of the frozen BERT features:
# 300 hidden units, with early stopping enabled.
model = TorchShallowNeuralClassifier(hidden_dim=300, early_stopping=True)

In [27]:
# Train on the first dataset's features; fit's return value is discarded via `_`.
%time _ = model.fit(X_train, y_train)

Stopping after epoch 43. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 0.5618827491998672

CPU times: user 8.75 s, sys: 147 ms, total: 8.9 s
Wall time: 8.41 s


In [28]:
# Predicted labels for the dev split of the first dataset.
preds = model.predict(X_dev)

In [29]:
# Per-class precision / recall / F1 on the dev split, reported to three decimals.
# (`classification_report` is imported with the other dependencies in the top import cell.)
print(classification_report(y_dev, preds, digits=3))

              precision    recall  f1-score   support

           0      0.893     0.972     0.931       462
           1      0.974     0.900     0.936       541

    accuracy                          0.933      1003
   macro avg      0.933     0.936     0.933      1003
weighted avg      0.937     0.933     0.933      1003



In [30]:
second_data = pd.read_csv('tweets_mod_copy.csv')
second_data['text'] = second_data['text'].apply(lambda x: re.sub(r'https?\S+', '', x))

# Shuffle once with a fixed seed, then cut 80% / 10% / 10% by position.
# Positional slicing is used instead of np.split so the split does not depend
# on NumPy's handling of DataFrames (which goes through DataFrame.swapaxes).
second_shuffled = second_data.sample(frac=1, random_state=42)
train_end = int(.8 * len(second_data))
dev_end = int(.9 * len(second_data))
second_train = second_shuffled.iloc[:train_end]
second_dev = second_shuffled.iloc[train_end:dev_end]
second_test = second_shuffled.iloc[dev_end:]

In [31]:
# Inspect the shuffled training split of the second dataset.
print(second_train)

                                                    text  labels
3495   How many illegal buildings should be demolishe...       0
5461                     Who’s fatality is this tho ????       0
9794   #OnThisDay 2018 Chinese state media confirmed ...       1
11105  With any luck you will miss the windstorm on e...       0
1803   Inferno on Black Friday 1939: 71 deaths, 3,700...       1
...                                                  ...     ...
2196   go ahead and make a playlist with your name. g...       0
8561   Ruckelshaus, Sweeney and DDT – rescued from th...       0
11236  😂We learned a long time ago why all major bank...       0
4285   5,000 feral camels culled in drought-hit Austr...       1
8569   Another rescued mumma koala with her little ne...       1

[9096 rows x 2 columns]


In [32]:
# Raw tweet strings for the second dataset's train/dev splits, echoing their sizes.
second_X_str_train = second_train['text'].values
second_X_str_dev = second_dev['text'].values
print(len(second_X_str_train))
print(len(second_X_str_dev))

# Gold labels; note this dataset's column is `labels`, not `label`.
second_y_train = second_train['labels'].values
second_y_dev = second_dev['labels'].values

9096
1137


In [33]:
%time second_X_train = [bert_classifier_phi(text) for text in second_X_str_train]

CPU times: user 24min 35s, sys: 7.87 s, total: 24min 43s
Wall time: 25min 18s


In [34]:
%time second_X_dev = [bert_classifier_phi(text) for text in second_X_str_dev]

CPU times: user 3min 1s, sys: 883 ms, total: 3min 1s
Wall time: 3min


In [35]:
# Fit on the second dataset's features.
# NOTE(review): this calls fit on the same `model` object that was already
# trained on the first dataset, so after this cell `model` no longer represents
# the first experiment. Whether fit restarts from fresh weights or continues
# from the earlier ones depends on TorchShallowNeuralClassifier — confirm, and
# consider a separately named classifier for this experiment.
%time _ = model.fit(second_X_train, second_y_train)

Stopping after epoch 33. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 1.9881386458873749

CPU times: user 9.88 s, sys: 126 ms, total: 10 s
Wall time: 9.45 s


In [36]:
# Predicted labels for the second dataset's dev split, using `model` as refit above.
second_preds = model.predict(second_X_dev)

In [37]:
# Per-class precision / recall / F1 on the second dataset's dev split, to three decimals.
print(classification_report(second_y_dev, second_preds, digits=3))

              precision    recall  f1-score   support

           0      0.920     0.947     0.933       920
           1      0.742     0.650     0.693       217

    accuracy                          0.890      1137
   macro avg      0.831     0.798     0.813      1137
weighted avg      0.886     0.890     0.887      1137

