# Final Project: Disaster Tweet Classification

In [122]:
__author__ = "Kevin Guo, Pranav Sriram, Raymond Yao"
__version__ = "CS224u, Stanford, Spring 2021"

## Data Pre-Processing

In [123]:
import numpy as np
import pandas as pd
import re
from transformers import BertModel, BertTokenizer
import utils
import torch
import torch.nn as nn
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier

In [124]:
# Set random seeds
utils.fix_random_seeds()

In [125]:
df = pd.read_csv('tweets.csv')
df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…",1
1,1,ablaze,,"Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…",1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8",0
5,5,ablaze,OC,"If this child was Chinese, this tweet would have gone viral. Social media would be ablaze. SNL would have made a racist j…",0
6,6,ablaze,"London, England","Several houses have been set ablaze in Ngemsibaa village, Oku sub division in the North West Region of Cameroon by… https://t.co/99uHGAzxy2",1
7,7,ablaze,Bharat,Asansol: A BJP office in Salanpur village was set ablaze last night. BJP has alleged that TMC is behind the incident. Police has b…,1
8,8,ablaze,"Accra, Ghana","National Security Minister, Kan Dapaah's side chic has set the internet ablaze with her latest powerful video.… https://t.co/rhzOMQVSlj",0
9,9,ablaze,Searching,This creature who’s soul is no longer clarent but blue ablaze This thing Carrying memories Memories of… https://t.co/tBKSNDrDoX,0


In [126]:
# Remove URLs
df['text'] = df['text'].apply(lambda x: re.sub(r'https?\S+', '', x))
df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…",1
1,1,ablaze,,"Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…",1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l…",0
5,5,ablaze,OC,"If this child was Chinese, this tweet would have gone viral. Social media would be ablaze. SNL would have made a racist j…",0
6,6,ablaze,"London, England","Several houses have been set ablaze in Ngemsibaa village, Oku sub division in the North West Region of Cameroon by…",1
7,7,ablaze,Bharat,Asansol: A BJP office in Salanpur village was set ablaze last night. BJP has alleged that TMC is behind the incident. Police has b…,1
8,8,ablaze,"Accra, Ghana","National Security Minister, Kan Dapaah's side chic has set the internet ablaze with her latest powerful video.…",0
9,9,ablaze,Searching,This creature who’s soul is no longer clarent but blue ablaze This thing Carrying memories Memories of…,0


In [127]:
# Split into train, dev, and test sets using a 80-10-10 ratio
train, dev, test = np.split(df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

In [129]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
x_train = train['text'].values
y_train = train['target'].values



Who’s fatality is this tho ????
['Who', '’', 's', 'fatal', '##ity', 'is', 'this', 'th', '##o', '?', '?', '?', '?']


In [88]:
class HfBertClassifierModel(nn.Module):
    def __init__(self, n_classes, weights_name='bert-base-cased'):
        super().__init__()
        self.n_classes = n_classes
        self.weights_name = weights_name
        self.bert = BertModel.from_pretrained(self.weights_name)
        self.bert.train()
        self.hidden_dim = self.bert.embeddings.word_embeddings.embedding_dim
        # The only new parameters -- the classifier:
        self.classifier_layer = nn.Linear(
            self.hidden_dim, self.n_classes)

    def forward(self, indices, mask):
        reps = self.bert(
            indices, attention_mask=mask)
        return self.classifier_layer(reps.pooler_output)

In [93]:
class HfBertClassifier(TorchShallowNeuralClassifier):
    def __init__(self, weights_name, *args, **kwargs):
        self.weights_name = weights_name
        self.tokenizer = BertTokenizer.from_pretrained(self.weights_name)
        super().__init__(*args, **kwargs)
        self.params += ['weights_name']

    def build_graph(self):
        return HfBertClassifierModel(self.n_classes_, self.weights_name)

    def build_dataset(self, X, y=None):
        data = self.tokenizer.batch_encode_plus(
            X,
            max_length=None,
            add_special_tokens=True,
            padding='longest',
            return_attention_mask=True)
        indices = torch.tensor(data['input_ids'])
        mask = torch.tensor(data['attention_mask'])
        if y is None:
            dataset = torch.utils.data.TensorDataset(indices, mask)
        else:
            self.classes_ = sorted(set(y))
            self.n_classes_ = len(self.classes_)
            class2index = dict(zip(self.classes_, range(self.n_classes_)))
            y = [class2index[label] for label in y]
            y = torch.tensor(y)
            dataset = torch.utils.data.TensorDataset(indices, mask, y)
        return dataset

In [95]:
def build_dataset(dataframes, phi, vectorizer=None, vectorize=True):
    """
    Core general function for building experimental datasets.

    Parameters
    ----------
    dataframes : pd.DataFrame or list of pd.DataFrame
        The dataset or datasets to process, as read in by
        `sentiment_reader`.

    phi : feature function
       Any function that takes a string as input and returns a
       bool/int/float-valued dict as output.

    vectorizer : sklearn.feature_extraction.DictVectorizer
       If this is None, then a new `DictVectorizer` is created and
       used to turn the list of dicts created by `phi` into a
       feature matrix. This happens when we are training.
       If this is not None, then it's assumed to be a `DictVectorizer`
       and used to transform the list of dicts. This happens in
       assessment, when we take in new instances and need to
       featurize them as we did in training.

    vectorize : bool
       Whether to use a DictVectorizer. Set this to False for
       deep learning models that process their own input.

    Returns
    -------
    dict
        A dict with keys 'X' (the feature matrix), 'y' (the list of
        labels), 'vectorizer' (the `DictVectorizer`), and
        'raw_examples' (the `nltk.Tree` objects, for error analysis).

    """
    if isinstance(dataframes, (list, tuple)):
        df = pd.concat(dataframes)
    else:
        df = dataframes

    raw_examples = list(df.sentence.values)

    feat_dicts = list(df.sentence.apply(phi).values)

    if 'label' in df.columns:
        labels = list(df.label.values)
    else:
        labels = None

    feat_matrix = None
    if vectorize:
        # In training, we want a new vectorizer:
        if vectorizer is None:
            vectorizer = DictVectorizer(sparse=False)
            feat_matrix = vectorizer.fit_transform(feat_dicts)
        # In assessment, we featurize using the existing vectorizer:
        else:
            feat_matrix = vectorizer.transform(feat_dicts)
    else:
        feat_matrix = feat_dicts

    return {'X': feat_matrix,
            'y': labels,
            'vectorizer': vectorizer,
            'raw_examples': raw_examples}

In [96]:
# Train dataset:
train = build_dataset(
    train_dataframes,
    phi,
    vectorizer=None,
    vectorize=vectorize)

# Manage the assessment set-up:
X_train = train['X']
y_train = train['y']
raw_train = train['raw_examples']
assess_datasets = []
if assess_dataframes is None:
    X_train, X_assess, y_train, y_assess, raw_train, raw_assess = train_test_split(
        X_train, y_train, raw_train,
        train_size=train_size,
        test_size=None,
        random_state=random_state)
    assess_datasets.append({
        'X': X_assess,
        'y': y_assess,
        'vectorizer': train['vectorizer'],
        'raw_examples': raw_assess})
else:
    if not isinstance(assess_dataframes, (tuple, list)):
        assess_dataframes = [assess_dataframes]
    for assess_df in assess_dataframes:
        # Assessment dataset using the training vectorizer:
        assess = build_dataset(
            assess_df,
            phi,
            vectorizer=train['vectorizer'],
            vectorize=vectorize)
        assess_datasets.append(assess)

# Train:
mod = train_func(X_train, y_train)

# Predictions if we have labels:
predictions = []
scores = []
for dataset_num, assess in enumerate(assess_datasets, start=1):
    preds = mod.predict(assess['X'])
    if assess['y'] is None:
        predictions.append(None)
        scores.append(None)
    else:
        if verbose:
            if len(assess_datasets) > 1:
                print("Assessment dataset {}".format(dataset_num))
            print(classification_report(assess['y'], preds, digits=3))
        predictions.append(preds)
        scores.append(score_func(assess['y'], preds))
true_scores = [s for s in scores if s is not None]
if len(true_scores) > 1 and verbose:
    mean_score = np.mean(true_scores)
    print("Mean of macro-F1 scores: {0:.03f}".format(mean_score))


# Return the overall scores and other experimental info:
return {
    'model': mod,
    'phi': phi,
    'train_dataset': train,
    'assess_datasets': assess_datasets,
    'predictions': predictions,
    'metric': score_func.__name__,
    'scores': scores}

NameError: name 'train_dataframes' is not defined