## Disaster Tweets Classification using TinyBERT

### Download Data

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd



In [3]:
# Read only the `text` and `target` columns of the hosted disaster-tweets CSV.
DATA_URL = (
    "https://raw.githubusercontent.com/laxmimerit/"
    "All-CSV-ML-Data-Files-Download/master/twitter_disaster_tweets.csv"
)
df = pd.read_csv(DATA_URL, usecols=['text', 'target'])
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Shuffle the rows with a fixed seed so that Restart & Run All yields the same order every time.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Per-column count of missing values; kept as the last expression so the notebook displays it.
df.isnull().sum()

In [5]:
# Number of rows per value of the `target` column.
class_counts = df['target'].value_counts()
class_counts

target
0    4342
1    3271
Name: count, dtype: int64

### Load Data with Hugging Face Datasets Library


In [7]:
# Rename the `target` column to `label`.
df = df.rename(columns=dict(target='label'))

In [8]:
from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# (and every metric computed on it) reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Show one training example.
dataset['train'][0]

{'text': 'Severe storm weakening as it moves SE towards Lubbock area.  Outflow boundary may create dust and 50 mph gusts http://t.co/pw3tZU0tay',
 'label': 1}

In [9]:
# Class id -> class name, and the inverse mapping derived from it.
id2label = {0: 'general', 1: 'disaster'}
label2id = {name: idx for idx, name in id2label.items()}


## Data Tokenization


In [10]:
from transformers import AutoTokenizer
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint name shared by the tokenizer and the model loaded later.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt,
    use_fast=True,
)


In [11]:
# Tokenize a single training example (result is not displayed; the cell ends with the map below).
tokenizer(dataset['train'][0]['text'])

def tokenize(batch):
    """Run the tokenizer over ``batch['text']`` with padding, truncation and max_length=100."""
    return tokenizer(
        batch['text'],
        padding=True,
        truncation=True,
        max_length=100,
    )

# batched=True with batch_size=None hands each split to `tokenize` as a single batch.
dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1523
    })
})

## Building Model Evaluation Functions
https://huggingface.co/docs/transformers/v4.42.0/en/tasks/sequence_classification#evaluate

In [13]:
# !pip install evaluate

import evaluate
import numpy as np

# Accuracy metric object used by `compute_metrics`.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for a ``(predictions, labels)`` pair.

    The predicted class is the argmax of ``predictions`` along axis 1;
    the result is whatever ``accuracy.compute`` returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Model Building


In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a classification head sized to the label maps.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Train and evaluate on small fixed-size subsets: the first 200 training rows
# and the first 100 test rows.
N_TRAIN_ROWS = 200
N_TEST_ROWS = 100
train_data = dataset['train'].select(range(N_TRAIN_ROWS))
test_data = dataset['test'].select(range(N_TEST_ROWS))


In [16]:
# One-epoch training configuration with learning_rate=1e-5.
# NOTE(review): the next cell rebuilds `args` and `trainer` with learning_rate=2e-5,
# so this configuration is overwritten before `trainer.train()` runs.
training_kwargs = dict(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=1,
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
)
args = TrainingArguments(**training_kwargs)

# Bundle the model, data subsets, metric function and tokenizer into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

In [17]:
# One-epoch training configuration with learning_rate=2e-5; evaluation runs once per epoch.
training_kwargs = dict(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=1,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
)
args = TrainingArguments(**training_kwargs)

# Bundle the model, data subsets, metric function and tokenizer into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

In [18]:
# Run training and display the returned training summary.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.693886,0.45


TrainOutput(global_step=7, training_loss=0.6930675506591797, metrics={'train_runtime': 18.6755, 'train_samples_per_second': 10.709, 'train_steps_per_second': 0.375, 'total_flos': 470498313600.0, 'train_loss': 0.6930675506591797, 'epoch': 1.0})

In [19]:
# Evaluate on the Trainer's eval dataset and display the metrics.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.6938857436180115,
 'eval_accuracy': 0.45,
 'eval_runtime': 1.8949,
 'eval_samples_per_second': 52.772,
 'eval_steps_per_second': 2.111,
 'epoch': 1.0}

## Model Save and Load for Inference

In [23]:
# Save to the same directory that the inference pipeline and the S3 upload
# below read from ('tinybert-disaster-tweet').
trainer.save_model('tinybert-disaster-tweet')

In [24]:
# Sample sentences (movie reviews).
# NOTE(review): `data` is reassigned in the next cell before it is used.
data = [
    "this movie was horrible, the plot was really boring. acting was okay",
    "the movie is really sucked. there is not plot and acting was bad",
    "what a beautiful movie. great plot. acting was good. will see it again",
]

In [None]:
from transformers import pipeline
import torch

# Short sample sentences to run through the saved classifier.
data = [
    'There is a fire in the building',
    'I am happy today',
    'I am sad today',
    'I am not feeling well',
    'There is a flood in the city, go to higher ground',
]

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-classification pipeline from the local model directory.
classifier = pipeline(
    task='text-classification',
    model='tinybert-disaster-tweet',
    device=device,
)

classifier(data)

## Push Model to AWS S3

In [27]:
# upload the saved model folder to the S3 bucket under ml-models/tinybert-disaster-tweet
import os
import boto3

# S3 client and destination bucket used by `upload_directory`.
bucket_name = "mlops-sheenu"
s3 = boto3.client(service_name="s3")

def upload_directory(directory_path, s3_prefix):
    """Upload every file under ``directory_path`` to S3 beneath ``s3_prefix``.

    Walks the directory tree and uploads each file to the module-level
    ``bucket_name`` through the module-level ``s3`` client. The object key is
    ``s3_prefix`` joined with the file's path relative to ``directory_path``,
    using ``/`` as the separator.

    Args:
        directory_path: Local directory whose files are uploaded.
        s3_prefix: Key prefix under which the files are stored.

    Raises:
        NotADirectoryError: If ``directory_path`` is not an existing directory.
            Without this check ``os.walk`` yields nothing and the upload would
            silently do no work.
    """
    if not os.path.isdir(directory_path):
        raise NotADirectoryError(
            f"upload_directory: {directory_path!r} is not an existing directory"
        )

    for root, _dirs, files in os.walk(directory_path):
        for file in files:
            # Keep the local path exactly as the OS reports it; rewriting
            # backslashes here would break POSIX filenames that contain one.
            file_path = os.path.join(root, file)
            relpath = os.path.relpath(file_path, directory_path)
            # Only the platform separator is normalised to '/' for the S3 key.
            s3_key = os.path.join(s3_prefix, relpath).replace(os.sep, "/")

            s3.upload_file(file_path, bucket_name, s3_key)


# Upload the local model directory to the bucket under the matching prefix.
upload_directory(
    directory_path='tinybert-disaster-tweet',
    s3_prefix='ml-models/tinybert-disaster-tweet',
)