# Final Project: Disaster Tweet Classification

In [1]:
__author__ = "Kevin Guo, Pranav Sriram, Raymond Yao"
__version__ = "CS224u, Stanford, Spring 2021"

## Data Pre-Processing

In [2]:
import numpy as np
import pandas as pd
import re
from transformers import BertModel, BertTokenizer
from transformers import AutoTokenizer
import utils
import torch
import torch.nn as nn
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from datasets import Dataset
from datasets import load_dataset
#from datasets import train_test_split
from transformers import TrainingArguments
from transformers import Trainer
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

In [3]:
# Set random seeds
utils.fix_random_seeds()

In [4]:
weights_name = 'bert-base-cased'

In [5]:
bert_tokenizer = BertTokenizer.from_pretrained(weights_name)

In [6]:
bert_model = BertModel.from_pretrained(weights_name)

In [7]:
def bert_phi(text):
    input_ids = bert_tokenizer.encode(text, add_special_tokens=True)
    X = torch.tensor([input_ids])
    with torch.no_grad():
        reps = bert_model(X)
        return reps.last_hidden_state.squeeze(0).numpy()

In [8]:
def bert_classifier_phi(text):
    reps = bert_phi(text)
    #return reps.mean(axis=0)  # Another good, easy option.
    return reps[0]

In [9]:
all_data = pd.read_csv('real_and_fake_news.csv')
all_data['text'] = all_data['text'].apply(lambda x: re.sub(r'https?\S+', '', x))
relevant_data, unneeded = np.split(all_data.sample(frac=1, random_state=42), [10000])
train, dev, test = np.split(relevant_data, [int(.8*len(relevant_data)), int(.9*len(relevant_data))])

In [10]:
print(train)
print(dev)
print(test)

                                                    text  label
30464   Obama Announces ‘Unfinished Business’ For 201...      0
41926  IT BEGINS….OBAMA APPOINTED JUDGE Rules Trump U...      0
8877   White House says Obama will not discuss FBI pr...      1
3798   Treasury unit to share records with Senate for...      1
39671  HOLLYWOOD LIBS HAVE EPIC MELT DOWNS…Threaten P...      0
...                                                  ...    ...
22636   Republicans Dine With Trump, Then Try To Rail...      0
2622   Trump Jr., Manafort agree to negotiate over in...      1
6334   After 2016 campaign, more Americans consider R...      1
1801   US Senate may vote this week to add penalties ...      1
20427  FPL shuts one reactor in Florida, reduces powe...      1

[8000 rows x 2 columns]
                                                    text  label
34238  WILL HILLARY ATTEND? ‘CLOWN LIVES MATTER’ Rall...      0
13316  Bosnian Croat war criminal Praljak killed hims...      1
14057  U.S. all

In [11]:
X_str_train = train.text.values
print(len(X_str_train))
y_train = train.label.values

X_str_dev = dev.text.values
print(len(X_str_dev))
y_dev = dev.label.values

8000
1000


In [12]:
%time X_train = [bert_classifier_phi(text) for text in X_str_train]

CPU times: user 20min 19s, sys: 7.72 s, total: 20min 27s
Wall time: 20min 20s


In [13]:
%time X_dev = [bert_classifier_phi(text) for text in X_str_dev]

CPU times: user 2min 30s, sys: 753 ms, total: 2min 31s
Wall time: 2min 30s


In [14]:
model = TorchShallowNeuralClassifier(
    early_stopping=True,
    hidden_dim=300)

In [15]:
%time _ = model.fit(X_train, y_train)

Stopping after epoch 23. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 0.08604448172263801

CPU times: user 6.03 s, sys: 138 ms, total: 6.17 s
Wall time: 5.83 s


In [16]:
preds = model.predict(X_dev)

In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_dev, preds, digits=3))

              precision    recall  f1-score   support

           0      0.994     1.000     0.997       514
           1      1.000     0.994     0.997       486

    accuracy                          0.997      1000
   macro avg      0.997     0.997     0.997      1000
weighted avg      0.997     0.997     0.997      1000



In [37]:
model.to_pickle('intermediate_model')
#model.save_pretrained('int_model')

In [27]:
new_model = TorchShallowNeuralClassifier(
    early_stopping=True,
    hidden_dim=300)
new_model = new_model.from_pickle('intermediate_model')

In [28]:
print(new_model)

TorchShallowNeuralClassifier(
	batch_size=1028,
	max_iter=1000,
	eta=0.001,
	optimizer_class=<class 'torch.optim.adam.Adam'>,
	l2_strength=0,
	gradient_accumulation_steps=1,
	max_grad_norm=None,
	validation_fraction=0.1,
	early_stopping=True,
	n_iter_no_change=10,
	warm_start=False,
	tol=1e-05,
	hidden_dim=300,
	hidden_activation=Tanh())


In [29]:
print(new_model.params)

['batch_size', 'max_iter', 'eta', 'optimizer_class', 'l2_strength', 'gradient_accumulation_steps', 'max_grad_norm', 'validation_fraction', 'early_stopping', 'n_iter_no_change', 'warm_start', 'tol', 'hidden_dim', 'hidden_activation']


In [23]:
second_data = pd.read_csv('tweets_mod_copy.csv')
second_data['text'] = second_data['text'].apply(lambda x: re.sub(r'https?\S+', '', x))
second_train, second_dev, second_test = np.split(second_data.sample(frac=1, random_state=42), [int(.8*len(second_data)), int(.9*len(second_data))])

In [24]:
print(second_train)

                                                    text  labels
3495   How many illegal buildings should be demolishe...       0
5461                     Who’s fatality is this tho ????       0
9794   #OnThisDay 2018 Chinese state media confirmed ...       1
11105  With any luck you will miss the windstorm on e...       0
1803   Inferno on Black Friday 1939: 71 deaths, 3,700...       1
...                                                  ...     ...
2196   go ahead and make a playlist with your name. g...       0
8561   Ruckelshaus, Sweeney and DDT – rescued from th...       0
11236  😂We learned a long time ago why all major bank...       0
4285   5,000 feral camels culled in drought-hit Austr...       1
8569   Another rescued mumma koala with her little ne...       1

[9096 rows x 2 columns]


In [25]:
second_X_str_train = second_train.text.values
print(len(second_X_str_train))
second_y_train = second_train.labels.values

second_X_str_dev = second_dev.text.values
print(len(second_X_str_dev))
second_y_dev = second_dev.labels.values

9096
1137


In [30]:
%time second_X_train = [bert_classifier_phi(text) for text in second_X_str_train]

CPU times: user 24min 40s, sys: 8.04 s, total: 24min 48s
Wall time: 24min 37s


In [31]:
%time second_X_dev = [bert_classifier_phi(text) for text in second_X_str_dev]

CPU times: user 3min 3s, sys: 1.16 s, total: 3min 4s
Wall time: 3min 3s


In [32]:
intermediate_model = new_model.from_pickle('intermediate_model')

In [33]:
%time _ = intermediate_model.fit(second_X_train, second_y_train)

Stopping after epoch 19. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 2.1667959690093994

CPU times: user 5.64 s, sys: 105 ms, total: 5.74 s
Wall time: 5.42 s


In [34]:
second_preds = intermediate_model.predict(second_X_dev)

In [35]:
print(classification_report(second_y_dev, second_preds, digits=3))

              precision    recall  f1-score   support

           0      0.909     0.948     0.928       920
           1      0.730     0.599     0.658       217

    accuracy                          0.881      1137
   macro avg      0.820     0.773     0.793      1137
weighted avg      0.875     0.881     0.877      1137

