# Final Project: Disaster Tweet Classification

In [314]:
__author__ = "Kevin Guo, Pranav Sriram, Raymond Yao"
__version__ = "CS224u, Stanford, Spring 2021"

## Data Pre-Processing

In [315]:
import numpy as np
import pandas as pd
import re
from transformers import BertModel, BertTokenizer
import utils
import torch
import torch.nn as nn
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from datasets import Dataset
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import Trainer
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

In [316]:
# Set random seeds
utils.fix_random_seeds()

In [317]:
df = pd.read_csv('tweets.csv')
df.head(10)

Unnamed: 0,id,keyword,location,text,labels
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
5,5,ablaze,OC,"If this child was Chinese, this tweet would ha...",0
6,6,ablaze,"London, England",Several houses have been set ablaze in Ngemsib...,1
7,7,ablaze,Bharat,Asansol: A BJP office in Salanpur village was ...,1
8,8,ablaze,"Accra, Ghana","National Security Minister, Kan Dapaah's side ...",0
9,9,ablaze,Searching,This creature who’s soul is no longer clarent ...,0


In [318]:
# Remove URLs
df['text'] = df['text'].apply(lambda x: re.sub(r'https?\S+', '', x))
del df['id']
del df['keyword']
del df['location']
df.head(10)

Unnamed: 0,text,labels
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,Telangana: Section 144 has been imposed in Bha...,1
2,Arsonist sets cars ablaze at dealership,1
3,Arsonist sets cars ablaze at dealership,1
4,"""Lord Jesus, your love brings freedom and pard...",0
5,"If this child was Chinese, this tweet would ha...",0
6,Several houses have been set ablaze in Ngemsib...,1
7,Asansol: A BJP office in Salanpur village was ...,1
8,"National Security Minister, Kan Dapaah's side ...",0
9,This creature who’s soul is no longer clarent ...,0


In [319]:
# Split into train, dev, and test sets using a 80-10-10 ratio
train = df 
dev = df
test = df
#train, dev, test = np.split(df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

In [320]:
train_dataset = load_dataset('csv', data_files = 'tweets.csv')
dev_dataset = load_dataset('csv', data_files = 'tweets.csv')
test_dataset = load_dataset('csv', data_files = 'tweets.csv')

# train_dataset = Dataset.from_pandas(train)
# dev_dataset = Dataset.from_pandas(dev)
# test_dataset = Dataset.from_pandas(test)
# dataset = Dataset.from_pandas(df)

Using custom data configuration default-a25b990e618a3e4f
Reusing dataset csv (/Users/raymondyao/.cache/huggingface/datasets/csv/default-a25b990e618a3e4f/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)
Using custom data configuration default-a25b990e618a3e4f
Reusing dataset csv (/Users/raymondyao/.cache/huggingface/datasets/csv/default-a25b990e618a3e4f/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)
Using custom data configuration default-a25b990e618a3e4f
Reusing dataset csv (/Users/raymondyao/.cache/huggingface/datasets/csv/default-a25b990e618a3e4f/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


In [321]:
weights_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(weights_name)
# x_train = train_dataset['text']
# y_train = train_dataset['target']

In [326]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dev_dataset = dev_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

tokenized_train_dataset.set_format("torch")
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["keyword"])
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["location"])
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["id"])
#tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])

tokenized_dev_dataset.set_format("torch")
tokenized_dev_dataset = tokenized_dev_dataset.remove_columns(["keyword"])
tokenized_dev_dataset = tokenized_dev_dataset.remove_columns(["location"])
tokenized_dev_dataset = tokenized_dev_dataset.remove_columns(["id"])
#tokenized_dev_dataset = tokenized_dev_dataset.remove_columns(["text"])

tokenized_test_dataset.set_format("torch")
#tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"])


#tokenized_train_dataset = tokenized_train_dataset.remove_columns(['__index_level_0__'])
print(tokenized_train_dataset)
print(tokenized_dev_dataset)
tokenized_test_dataset

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text', 'token_type_ids'],
        num_rows: 11370
    })
})
DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text', 'token_type_ids'],
        num_rows: 11370
    })
})


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'id', 'input_ids', 'keyword', 'labels', 'location', 'text', 'token_type_ids'],
        num_rows: 11370
    })
})

In [327]:
bert_model = BertModel.from_pretrained(weights_name)

In [328]:
training_args = TrainingArguments("test_trainer")
trainer = Trainer(
    model=bert_model, args=training_args, train_dataset=tokenized_train_dataset, eval_dataset=tokenized_dev_dataset
)

In [329]:
trainer.train()

KeyError: 0

In [None]:
train_dataloader = DataLoader(tokenized_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_dev_dataset, batch_size=8)

In [None]:
optimizer = AdamW(bert_model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
bert_model.to(device)

In [None]:
progress_bar = tqdm(range(num_training_steps))
torch.cuda.empty_cache()
print(12)


bert_model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        #batch = {k: v.to(device) for k, v in batch.items()}
        outputs = bert_model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)