# Final Project: Disaster Tweet Classification

In [1]:
# Notebook metadata: the project's authors, and the course and term recorded as its version.
__author__ = "Kevin Guo, Pranav Sriram, Raymond Yao"
__version__ = "CS224u, Stanford, Spring 2021"

## Data Pre-Processing

In [2]:
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from datasets import load_dataset
#from datasets import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import BertModel, BertTokenizer
from transformers import get_scheduler
from transformers import Trainer
from transformers import TrainingArguments

import utils
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier

In [3]:
# Fix the random seeds through the project's `utils` helper so that later
# stochastic steps are repeatable from a fresh kernel.
# NOTE(review): which libraries get seeded is decided inside
# `utils.fix_random_seeds` (not visible here) — confirm it covers numpy and torch.
utils.fix_random_seeds()

In [4]:
# Name of the pretrained checkpoint; the tokenizer and the model are both loaded from it.
weights_name = 'bert-base-cased'

In [5]:
# Tokenizer for the `weights_name` checkpoint; `bert_phi` uses it to turn text into token ids.
bert_tokenizer = BertTokenizer.from_pretrained(weights_name)

In [6]:
# Pretrained BERT encoder for the same checkpoint; `bert_phi` calls it under
# `torch.no_grad()` to obtain token-level representations.
bert_model = BertModel.from_pretrained(weights_name)

In [7]:
def bert_phi(text, max_length=512):
    """Encode `text` with BERT and return its final-layer token representations.

    Parameters
    ----------
    text : str
        Raw input string.
    max_length : int, optional
        Upper bound on the number of token ids (special tokens included)
        handed to the model. Longer inputs are truncated by the tokenizer.
        The default of 512 matches the position-embedding limit of the
        `bert-base-*` checkpoints; without truncation the forward pass
        fails on any text that tokenizes to more ids than that.

    Returns
    -------
    np.ndarray
        The model's `last_hidden_state` for this single example, with the
        leading batch axis (size 1) squeezed away, so there is one row per
        token id that was fed to the model.
    """
    input_ids = bert_tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length)
    # Wrap in a list to build a batch containing just this one example.
    X = torch.tensor([input_ids])
    # Feature extraction only: no gradients are needed.
    with torch.no_grad():
        reps = bert_model(X)
    return reps.last_hidden_state.squeeze(0).numpy()

In [8]:
def bert_classifier_phi(text):
    """Featurize `text` as the first row of the matrix returned by `bert_phi`.

    Averaging over the rows (`.mean(axis=0)`) is another good, easy option
    for collapsing the token-level matrix into a single vector.
    """
    token_reps = bert_phi(text)
    first_token_rep = token_reps[0]
    return first_token_rep

In [9]:
# Load the labelled news dataset; later cells read its `text` and `label` columns.
all_data = pd.read_csv('real_and_fake_news.csv')

# Strip URLs from the text. The vectorised `.str.replace` leaves a missing cell
# as NaN, whereas calling `re.sub` on a non-string cell raises TypeError.
all_data['text'] = all_data['text'].str.replace(r'https?\S+', '', regex=True)

# Shuffle with a fixed seed, then keep the first 10,000 rows for the experiment.
# Plain `.iloc` slices give the same partitions as `np.split` without routing a
# DataFrame through the deprecated `DataFrame.swapaxes`.
shuffled_data = all_data.sample(frac=1, random_state=42)
relevant_data, unneeded = shuffled_data.iloc[:10000], shuffled_data.iloc[10000:]

# 80% train / 10% dev / 10% test, taken in order from the shuffled subset.
n_relevant = len(relevant_data)
train_end, dev_end = int(.8 * n_relevant), int(.9 * n_relevant)
train = relevant_data.iloc[:train_end]
dev = relevant_data.iloc[train_end:dev_end]
test = relevant_data.iloc[dev_end:]

In [10]:
# Show each split in turn: train, then dev, then test.
for split_frame in (train, dev, test):
    print(split_frame)

                                                    text  label
30464   Obama Announces ‘Unfinished Business’ For 201...      0
41926  IT BEGINS….OBAMA APPOINTED JUDGE Rules Trump U...      0
8877   White House says Obama will not discuss FBI pr...      1
3798   Treasury unit to share records with Senate for...      1
39671  HOLLYWOOD LIBS HAVE EPIC MELT DOWNS…Threaten P...      0
...                                                  ...    ...
22636   Republicans Dine With Trump, Then Try To Rail...      0
2622   Trump Jr., Manafort agree to negotiate over in...      1
6334   After 2016 campaign, more Americans consider R...      1
1801   US Senate may vote this week to add penalties ...      1
20427  FPL shuts one reactor in Florida, reduces powe...      1

[8000 rows x 2 columns]
                                                    text  label
34238  WILL HILLARY ATTEND? ‘CLOWN LIVES MATTER’ Rall...      0
13316  Bosnian Croat war criminal Praljak killed hims...      1
14057  U.S. all

In [11]:
# Training split: raw texts and their labels as arrays; report the example count.
X_str_train, y_train = train['text'].values, train['label'].values
print(len(X_str_train))

# Dev split: same extraction, same report.
X_str_dev, y_dev = dev['text'].values, dev['label'].values
print(len(X_str_dev))

8000
1000


In [None]:
# Featurize every training text with `bert_classifier_phi`, one call per example;
# `%time` reports how long the whole pass takes.
%time X_train = [bert_classifier_phi(text) for text in X_str_train]

In [None]:
# Featurize every dev text with `bert_classifier_phi`, one call per example;
# `%time` reports how long the whole pass takes.
%time X_dev = [bert_classifier_phi(text) for text in X_str_dev]

In [None]:
# Classifier that is fit on the BERT feature vectors in the following cells.
# NOTE(review): presumably `hidden_dim=300` sets the hidden-layer width and
# `early_stopping=True` stops training based on held-out performance — confirm
# against the `TorchShallowNeuralClassifier` documentation.
model = TorchShallowNeuralClassifier(
    early_stopping=True,
    hidden_dim=300)

In [None]:
# Fit the classifier on the training features and labels. `%time` reports the
# duration; the return value is bound to `_` because it is not used afterwards.
%time _ = model.fit(X_train, y_train)

In [None]:
# Predict labels for the dev-set feature vectors.
preds = model.predict(X_dev)

In [None]:
# Compare dev-set predictions with the gold labels and print the report with
# three decimal places. `classification_report` comes from the notebook's
# import cell rather than being imported here at the point of use.
print(classification_report(y_dev, preds, digits=3))