References: 
- https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
- https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
- https://towardsdatascience.com/pytorch-tabular-binary-classification-a0368da5bb89



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from wordcloud import STOPWORDS
from collections import defaultdict
import random
import re
import string

In [None]:
# Kaggle input folder for the competition data.
DATA_DIR = '/kaggle/input/nlp-getting-started'

# `df` is the training split, `test_df` the test split.
df = pd.read_csv(DATA_DIR + '/train.csv')
test_df = pd.read_csv(DATA_DIR + '/test.csv')
df.sample(5)

Target labels:
- 0: Not a disaster
- 1: Disaster

In [None]:
# Display name for each target value: index 0 -> 'Not a disaster', index 1 -> 'Disaster'.
class_names =['Not a disaster', 'Disaster']

In [None]:
# Show five random rows of the test split.
test_df.sample(5)

# EDA

In [None]:
# Number of training rows per target value.
df['target'].value_counts()

In [None]:
# Row count per target value, in the order produced by the groupby.
balance_counts = df.groupby('target')['target'].agg('count').values

# One bar trace per class, annotated with its count.
fig = go.Figure()
for class_position, class_label in enumerate(['0', '1']):
    class_count = balance_counts[class_position]
    fig.add_trace(go.Bar(
        x=[class_label],
        y=[class_count],
        text=[class_count],
        textposition='auto',
    ))
fig.update_layout(
    title='<span style="font-size:32px;">Dataset distribution by target</span>'
)
fig.show()

The classes are slightly imbalanced.

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

In [None]:
# word_count
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))

# unique_word_count
df['unique_word_count'] = df['text'].apply(lambda x: len(set(str(x).split())))

# stop_word_count
df['stop_word_count'] = df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

# url_count
df['url_count'] = df['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))

# mean_word_length
df['mean_word_length'] = df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# char_count
df['char_count'] = df['text'].apply(lambda x: len(str(x)))

# punctuation_count
df['punctuation_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# hashtag_count
df['hashtag_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c == '#']))

# mention_count
df['mention_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c == '@']))

In [None]:
## Truncate some extreme values for better visuals ##
# `clip` writes the capped column back to `df` explicitly. The chained form
# `df[col].loc[mask] = value` may assign into a temporary copy and leave `df`
# unchanged (and is a no-op under pandas copy-on-write).
length_caps = {'word_count': 60, 'char_count': 350, 'punctuation_count': 10}
for column_name, upper_bound in length_caps.items():
    df[column_name] = df[column_name].clip(upper=upper_bound)

# One box plot per feature, split by target class.
f, axes = plt.subplots(3, 1, figsize=(20,30))
length_panels = [
    ('word_count', "Number of words in each class"),
    ('char_count', "Number of characters in each class"),
    ('punctuation_count', "Number of punctuations in each class"),
]
for panel_ax, (column_name, panel_title) in zip(axes, length_panels):
    sns.boxplot(x='target', y=column_name, data=df, ax=panel_ax)
    panel_ax.set_xlabel('Target', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=15)

In [None]:
# Cap extreme counts for better visuals. `clip` assigns the capped column back
# to `df`; the chained `df[col].loc[mask] = value` form may write to a temporary
# copy and leave `df` unchanged.
for column_name in ('hashtag_count', 'mention_count'):
    df[column_name] = df[column_name].clip(upper=60)

# One box plot per feature, split by target class.
f, axes = plt.subplots(3, 1, figsize=(20,30))
symbol_panels = [
    ('hashtag_count', "Number of Hashtags in each class"),
    ('mention_count', "Number of Mentions in each class"),
    ('url_count', "Number of URLs in each class"),
]
for panel_ax, (column_name, panel_title) in zip(axes, symbol_panels):
    sns.boxplot(x='target', y=column_name, data=df, ax=panel_ax)
    panel_ax.set_xlabel('Target', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=15)
plt.show()

In [None]:
# Characters stripped by `remove_punc`.
exclude = string.punctuation


def remove_url(text):
    """Return `text` with `http://` / `https://` links and `www.` links removed."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


def remove_punc(text):
    """Return `text` with every character listed in `exclude` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(exclude))
    return text.translate(deletion_table)

In [None]:
# Sample strings for trying the cleaning helpers by hand.
text1 = 'Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1abb'
text2 = '!hello *world@ 1'

# Strip URLs from both splits. Punctuation removal (`remove_punc`) is not applied.
for frame in (df, test_df):
    frame['text'] = frame['text'].apply(remove_url)

# Modelling

In [None]:
from transformers import BertModel, BertTokenizer
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Encode a sample sentence, padded/truncated to max_length=15, returned as PyTorch tensors.
example_text = 'I will watch #Memento tonight!'
bert_input = tokenizer(example_text, padding='max_length', max_length = 15,
                      truncation = True, return_tensors = 'pt')

In [None]:
# Show the full encoding returned by the tokenizer.
print(bert_input)

In [None]:
# Inspect each field of the encoding in turn.
print(tokenizer.convert_ids_to_tokens(bert_input['input_ids'][0]))  # token strings behind the ids
print(bert_input.input_ids) # id representation of each token
print(bert_input.token_type_ids) # a binary mask that identifies in which sequence a token belongs
# attention_mask: 1 for real tokens (including [CLS] and [SEP]), 0 for [PAD] positions.
print(bert_input.attention_mask)

In [None]:
# Decode the ids back to text. Note: this overwrites `example_text` with the decoded string.
example_text = tokenizer.decode(bert_input.input_ids[0])
print(example_text)

In [None]:
# Display the token-id tensor of the example encoding.
bert_input['input_ids']

In [None]:
# The tokenizer was called with return_tensors='pt', so these fields are
# already tensors; re-wrapping them in `torch.tensor(...)` only makes a copy
# and triggers a copy-construct UserWarning.
input_ids = bert_input.input_ids
attention_mask = bert_input.attention_mask
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Shape inspection only, so skip building the autograd graph.
with torch.no_grad():
    last_hidden_state, pooled_output = bert_model(input_ids=input_ids, attention_mask=attention_mask, return_dict =False)

print(last_hidden_state.shape)
print(bert_model.config.hidden_size)

In [None]:
# Import kept so `tf` stays defined for any cell that may rely on it.
import tensorflow as tf

# `bert_model` is a PyTorch module (loaded via `BertModel.from_pretrained`),
# not a Keras model, so `tf.keras.utils.plot_model` cannot draw it.
# Print the module tree instead.
print(bert_model)

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        data: Indexable collection of texts (e.g. a pandas Series or a list).
        targets: Indexable collection of labels, aligned with `data`.
        tokenizer: Callable invoked as
            `tokenizer(text, padding='max_length', max_length=..., truncation=True,
            return_tensors='pt')`.
        max_length: Length every encoding is padded/truncated to (default 40).
    """

    def __init__(self, data, targets, tokenizer, max_length=40):
        self.data = data
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from `targets`."""
        return len(self.targets)

    @staticmethod
    def _item_at(sequence, idx):
        """Fetch the element at position `idx`.

        pandas objects are read with `.iloc` so a non-default index (for
        example after a train/validation split) cannot raise a KeyError or
        return the wrong row.
        """
        return sequence.iloc[idx] if hasattr(sequence, 'iloc') else sequence[idx]

    def __getitem__(self, idx):
        """Return `(encoding, target)` for the sample at position `idx`.

        `encoding` is whatever the tokenizer returns; `target` is an int64 tensor.
        """
        text = self._item_at(self.data, idx)
        target = self._item_at(self.targets, idx)
        bert_input = self.tokenizer(text, padding='max_length', max_length = self.max_length,
                      truncation = True, return_tensors = 'pt')

        return bert_input, torch.tensor(target, dtype=torch.int64)

In [None]:
# Stratified 80/20 split. A fixed `random_state` makes the split (and every
# metric computed on it) reproducible across kernel restarts.
train_df, val_df = train_test_split(
    df, test_size=0.20, stratify=df['target'].values, random_state=42
)

# Renumber rows from 0. Rebinding avoids in-place mutation of the split results.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = TextDataset(train_df['text'], train_df['target'], tokenizer)
val_dataset = TextDataset(val_df['text'], val_df['target'], tokenizer)

In [None]:
batch_size = 32
# Shuffle the training data each epoch so batches are not served in the same
# fixed order. Validation order does not affect its metrics, so it stays
# sequential.
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = batch_size)

In [None]:
# Number of training samples.
len(train_dataset)

In [None]:
# Print the encoding of the first training batch only, then stop.
# Note: this leaves `data` and `labels` bound at notebook level.
for data, labels in train_loader:
    print(data)
    break

In [None]:
class BertClassifier(nn.Module):
    """'bert-base-uncased' encoder followed by dropout, a one-unit linear layer and a sigmoid.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
    """

    def __init__(self, dropout):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.bert.config.hidden_size,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        """Run the encoder and head; returns the sigmoid of the linear layer's output."""
        # With return_dict=False the encoder returns a tuple; index 1 is the pooled output.
        encoder_outputs = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        pooled = encoder_outputs[1]
        return self.sigmoid(self.linear(self.dropout(pooled)))

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Build the classifier and place it on the selected device.
model = BertClassifier(dropout=0.3).to(device)

In [None]:
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Train `model` for one pass over `dataloader`.

    Args:
        model: Module called as `model(input_ids, attention_mask)`; its output is
            squeezed on dim 1 and rounded, so it is expected to be per-sample
            probabilities.
        dataloader: Iterable of `(encoding, labels)` where `encoding` has
            'input_ids' and 'attention_mask' entries.
        criterion: Loss called as `criterion(probabilities, labels.float())`.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        `(accuracy, loss)` as Python floats: the fraction of correct
        predictions and the average per-sample loss over the epoch.
    """
    total_train_epoch_loss = 0.0
    total_train_epoch_acc = 0
    total_train_samples = 0
    # A criterion with mean reduction returns a per-batch average, so it must be
    # re-weighted by batch size before dividing by the total sample count.
    loss_is_batch_mean = getattr(criterion, 'reduction', 'mean') == 'mean'

    model.train()
    for data, labels in dataloader:
        # Encodings made with return_tensors='pt' carry a singleton dim after
        # batching; drop it from BOTH tensors so they have the same shape.
        input_ids = data['input_ids'].squeeze(1).to(device)
        attention_mask = data['attention_mask'].squeeze(1).to(device)
        labels = labels.to(device)
        samples_in_batch = len(labels)

        output = model(input_ids, attention_mask)
        probabilities = output.squeeze(1)
        batch_loss = criterion(probabilities, labels.float())

        optimizer.zero_grad()
        batch_loss.backward()
        # Clip BEFORE the step; clipping afterwards has no effect on the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # `.item()` detaches the value so the autograd graph of every batch is
        # not kept alive for the whole epoch.
        batch_loss_value = batch_loss.item()
        if loss_is_batch_mean:
            batch_loss_value *= samples_in_batch
        total_train_epoch_loss += batch_loss_value

        predicted_labels = torch.round(probabilities.detach())
        total_train_epoch_acc += (predicted_labels == labels).sum().item()
        total_train_samples += samples_in_batch

    train_epoch_acc = total_train_epoch_acc / total_train_samples
    train_epoch_loss = total_train_epoch_loss / total_train_samples
    return train_epoch_acc, train_epoch_loss

In [None]:
def eval_model(model, dataloader, criterion, optimizer=None, device=None):
    """Evaluate `model` on `dataloader` without updating its weights.

    Args:
        model: Module called as `model(input_ids, attention_mask)`; its output is
            squeezed on dim 1 and rounded, so it is expected to be per-sample
            probabilities.
        dataloader: Iterable of `(encoding, labels)` where `encoding` has
            'input_ids' and 'attention_mask' entries.
        criterion: Loss called as `criterion(probabilities, labels.float())`.
        optimizer: Unused; accepted only so existing positional calls keep working.
        device: Device the batch tensors are moved to.

    Returns:
        `(accuracy, loss)` as Python floats: the fraction of correct
        predictions and the average per-sample loss.
    """
    model.eval()
    total_val_epoch_loss = 0.0
    total_val_epoch_acc = 0
    total_val_samples = 0
    # A criterion with mean reduction returns a per-batch average, so it must be
    # re-weighted by batch size before dividing by the total sample count.
    loss_is_batch_mean = getattr(criterion, 'reduction', 'mean') == 'mean'

    with torch.no_grad():
        for data, labels in dataloader:
            # Drop the singleton dim from BOTH tensors so they have the same shape.
            input_ids = data['input_ids'].squeeze(1).to(device)
            attention_mask = data['attention_mask'].squeeze(1).to(device)
            labels = labels.to(device)
            samples_in_batch = len(labels)

            output = model(input_ids, attention_mask)
            probabilities = output.squeeze(1)
            batch_loss_value = criterion(probabilities, labels.float()).item()
            if loss_is_batch_mean:
                batch_loss_value *= samples_in_batch
            total_val_epoch_loss += batch_loss_value

            predicted_labels = torch.round(probabilities)
            total_val_epoch_acc += (predicted_labels == labels).sum().item()
            total_val_samples += samples_in_batch

    val_epoch_acc = total_val_epoch_acc / total_val_samples
    val_epoch_loss = total_val_epoch_loss / total_val_samples
    return val_epoch_acc, val_epoch_loss

# Training

In [None]:
# Binary cross-entropy loss with the Adam optimizer.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 2e-5)
criterion.to(device)
num_epochs =10
# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')

    train_epoch_acc, train_epoch_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    print(f'Train Accuracy:{train_epoch_acc} | Train Loss:{train_epoch_loss}')

    val_epoch_acc, val_epoch_loss = eval_model(model, val_loader, criterion, optimizer, device)
    print(f'Val Accuracy:{val_epoch_acc} | Val Loss:{val_epoch_loss}')
    print()

    # Record this epoch's numbers for the history plot.
    epoch_metrics = {
        'train_epoch_acc': train_epoch_acc,
        'train_epoch_loss': train_epoch_loss,
        'val_epoch_acc': val_epoch_acc,
        'val_epoch_loss': val_epoch_loss,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)

In [None]:
# Accuracy per epoch for the training and validation splits.
history_fig, history_ax = plt.subplots()
accuracy_curves = (
    ('train_epoch_acc', 'train accuracy'),
    ('val_epoch_acc', 'validation accuracy'),
)
for history_key, curve_label in accuracy_curves:
    history_ax.plot(history[history_key], label=curve_label)
history_ax.set_title('Training history')
history_ax.set_ylabel('Accuracy')
history_ax.set_xlabel('Epoch')
history_ax.legend()
history_ax.set_ylim([0, 1]);

# Evaluation

In [None]:
def predictions_on_val(model, dataloader, device):
    """Collect true and predicted labels for every sample in `dataloader`.

    Args:
        model: Module called as `model(input_ids, attention_mask)`; its output is
            rounded and squeezed on dim 1.
        dataloader: Iterable of `(encoding, labels)` where `encoding` has
            'input_ids' and 'attention_mask' entries.
        device: Device the input tensors are moved to.

    Returns:
        `(true_labels, pred_labels)`: two 1-D NumPy arrays in iteration order.
    """
    model.eval()
    true_labels = []
    pred_labels = []
    with torch.no_grad():
        for data, labels in dataloader:
            # Drop the singleton dim from BOTH tensors so they have the same shape.
            input_ids = data['input_ids'].squeeze(1).to(device)
            attention_mask = data['attention_mask'].squeeze(1).to(device)
            output = model(input_ids, attention_mask)
            predicted_labels = torch.round(output).squeeze(1)  # Get the predicted labels
            pred_labels.append(predicted_labels.cpu().numpy())
            # reshape(-1) keeps a batch of one as a 1-D array; a bare squeeze()
            # would make it 0-d, which np.concatenate rejects.
            true_labels.append(labels.reshape(-1).cpu().numpy())

    true_labels = np.concatenate(true_labels)
    pred_labels = np.concatenate(pred_labels)
    return true_labels, pred_labels

In [None]:
# Gather ground-truth and predicted labels over the validation loader.
true_labels, pred_labels = predictions_on_val(model, val_loader, device)

In [None]:
# Per-class metrics report for the validation predictions, labelled with `class_names`.
print(classification_report(true_labels, pred_labels, target_names = class_names))

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw `confusion_matrix` as an annotated heatmap.

    Args:
        confusion_matrix: Integer count matrix passed straight to `sns.heatmap`
            (cells are formatted with "d").
    """
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Keep tick labels horizontal on both axes.
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0)
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=0)
    # This notebook classifies disaster vs. non-disaster tweets, so the axes
    # are labelled as classes rather than "sentiment".
    plt.ylabel('True class')
    plt.xlabel('Predicted class');
# Confusion matrix of the validation predictions, with rows and columns named by `class_names`.
cm = confusion_matrix(true_labels, pred_labels)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
# Pick one random test tweet. `randrange(n)` yields 0..n-1, whereas
# `randint(0, n)` can also return n, which is one past the last row.
# `.iloc` selects by position, so the lookup does not depend on the index labels.
test_tweet = test_df['text'].iloc[random.randrange(len(test_df))]
bert_input = tokenizer(test_tweet, padding='max_length', max_length = 40,
                      truncation = True, return_tensors = 'pt')

In [None]:
# Inference only: disable dropout and skip gradient tracking.
# The earlier `labels = labels.to(device)` line was dropped: prediction needs
# no labels, and it relied on a `labels` variable left over from another cell.
model.eval()
with torch.no_grad():
    input_ids = bert_input['input_ids'].to(device)
    attention_mask = bert_input['attention_mask'].to(device)
    output = model(input_ids, attention_mask)
predicted_labels = torch.round(output).squeeze(1).item()
print('Tweet:', test_tweet)
print()
sentiment = 'Not a disaster' if predicted_labels==0 else 'Disaster'
# The model predicts a disaster class, not a sentiment.
print('Predicted class:', sentiment)

# Submission

In [None]:
# Predict a 0/1 target for every test tweet.
# - Batched forward passes under `no_grad` replace one graph-building pass per tweet.
# - The stale `labels = labels.to(device)` line was dropped (the test set has no labels).
# - Predictions are stored as ints so the submission holds 0/1 rather than 0.0/1.0.
targets = []
test_texts = test_df['text'].tolist()
inference_batch_size = 32

model.eval()
with torch.no_grad():
    for batch_start in range(0, len(test_texts), inference_batch_size):
        batch_texts = test_texts[batch_start:batch_start + inference_batch_size]
        bert_input = tokenizer(batch_texts, padding='max_length', max_length = 40,
                          truncation = True, return_tensors = 'pt')
        input_ids = bert_input['input_ids'].to(device)
        attention_mask = bert_input['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        predicted_labels = torch.round(output).squeeze(1)
        targets.extend(predicted_labels.long().tolist())

In [None]:
# Submission frame: the test ids next to the predicted targets.
sub = pd.DataFrame({'id': test_df.id, 'target': targets}, columns=['id', 'target'])

In [None]:
# Write the submission file without the index column.
sub.to_csv('submission.csv', index = False)