# Final Project: Disaster Tweet Classification

In [1]:
__author__ = "Kevin Guo, Pranav Sriram, Raymond Yao"
__version__ = "CS224u, Stanford, Spring 2021"

## Data Pre-Processing

In [115]:
import numpy as np
import pandas as pd
import re
from transformers import BertModel, BertTokenizer
from transformers import AutoTokenizer
import utils
import torch
import torch.nn as nn
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from datasets import Dataset
from datasets import load_dataset
#from datasets import train_test_split
from transformers import TrainingArguments
from transformers import Trainer
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from torch_rnn_classifier import TorchRNNClassifier

In [3]:
# Set random seeds
utils.fix_random_seeds()

In [4]:
weights_name = 'bert-base-cased'

In [5]:
bert_tokenizer = BertTokenizer.from_pretrained(weights_name)

In [6]:
bert_model = BertModel.from_pretrained(weights_name)

In [7]:
def bert_phi(text):
    input_ids = bert_tokenizer.encode(text, add_special_tokens=True)
    X = torch.tensor([input_ids])
    with torch.no_grad():
        reps = bert_model(X)
        return reps.last_hidden_state.squeeze(0).numpy()

In [8]:
def bert_classifier_phi(text):
    reps = bert_phi(text)
    #return reps.mean(axis=0)  # Another good, easy option.
    return reps[0]

In [9]:
all_data = pd.read_csv('tweets_mod_copy.csv')
all_data['text'] = all_data['text'].apply(lambda x: re.sub(r'https?\S+', '', x))
train, dev, test = np.split(all_data.sample(frac=1, random_state=42), [int(.8*len(all_data)), int(.9*len(all_data))])

In [10]:
print(train)

                                                    text  labels
3495   How many illegal buildings should be demolishe...       0
5461                     Who’s fatality is this tho ????       0
9794   #OnThisDay 2018 Chinese state media confirmed ...       1
11105  With any luck you will miss the windstorm on e...       0
1803   Inferno on Black Friday 1939: 71 deaths, 3,700...       1
...                                                  ...     ...
2196   go ahead and make a playlist with your name. g...       0
8561   Ruckelshaus, Sweeney and DDT – rescued from th...       0
11236  😂We learned a long time ago why all major bank...       0
4285   5,000 feral camels culled in drought-hit Austr...       1
8569   Another rescued mumma koala with her little ne...       1

[9096 rows x 2 columns]


In [11]:
X_str_train = train.text.values
print(len(X_str_train))
y_train = train.labels.values

X_str_dev = dev.text.values
print(len(X_str_dev))
y_dev = dev.labels.values

9096
1137


In [12]:
%time X_train = [bert_classifier_phi(text) for text in X_str_train]

CPU times: user 12min 50s, sys: 14 s, total: 13min 4s
Wall time: 12min 57s


In [13]:
%time X_dev = [bert_classifier_phi(text) for text in X_str_dev]

CPU times: user 1min 35s, sys: 1.81 s, total: 1min 37s
Wall time: 1min 36s


In [179]:
model = TorchShallowNeuralClassifier(
    early_stopping=True,
    hidden_dim=1000,
    hidden_activation=nn.LeakyReLU())

In [180]:
%time _ = model.fit(X_train, y_train)

Stopping after epoch 28. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 1.8006368577480316

CPU times: user 10.7 s, sys: 376 ms, total: 11.1 s
Wall time: 10.1 s


In [181]:
preds = model.predict(X_dev)

In [182]:
from sklearn.metrics import classification_report
print(classification_report(y_dev, preds, digits=3))

              precision    recall  f1-score   support

           0      0.917     0.958     0.937       920
           1      0.778     0.631     0.697       217

    accuracy                          0.895      1137
   macro avg      0.848     0.794     0.817      1137
weighted avg      0.890     0.895     0.891      1137



In [150]:
logistic_model = LogisticRegression()

In [132]:
logistic_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [133]:
preds = logistic_model.predict(X_dev)
from sklearn.metrics import classification_report
print(classification_report(y_dev, preds, digits=3))

              precision    recall  f1-score   support

           0      0.910     0.963     0.936       920
           1      0.791     0.594     0.679       217

    accuracy                          0.893      1137
   macro avg      0.851     0.779     0.807      1137
weighted avg      0.887     0.893     0.887      1137

