# Load modules

In [1]:
import re
import emoji
import time
import string

import torch
from tqdm.notebook import tqdm
import nltk

from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader

from transformers import BertForSequenceClassification

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load dataset

In [19]:
# Load reddit dataset
reddit_df = pd.read_csv('Reddit_Data.csv',
                   names=['review', 'sentiment']).drop(0).reset_index(drop=True)

twitter_df = pd.read_csv('Twitter_Data.csv',
                         names=['review', 'sentiment']).drop(0).reset_index(drop=True)

df = pd.concat([reddit_df, twitter_df]).reset_index(drop=True)

# Remove neutral sentiments
df = df[df['sentiment'] != '0']
df['sentiment'].unique()

array(['1', '-1', nan], dtype=object)

In [20]:
df.isnull().sum()

review       2
sentiment    7
dtype: int64

In [21]:
# Drop null
df = df.dropna()
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [22]:
# Map 1 to positive and -1 to negative
df['sentiment'] = df['sentiment'].map({'-1': 'NEGATIVE', '1': 'POSITIVE'})
df.head()

Unnamed: 0,review,sentiment
0,family mormon have never tried explain them t...,POSITIVE
1,buddhism has very much lot compatible with chr...,POSITIVE
2,seriously don say thing first all they won get...,NEGATIVE
4,for your own benefit you may want read living ...,POSITIVE
5,you should all sit down together and watch the...,NEGATIVE


# Process data

In [23]:
def remove_not_ASCII(text):
    text = str(text)
    text = ''.join([word for word in text if word in string.printable])
#     print(text)
    return text

def replace_emoticons(text):
    text = text.replace("<3", "heart ")
    text = re.sub('>:-?\'?"?\(+', 'angry ', text)
    text = re.sub('\)+:-?\'?"?:<', 'angry ', text)
    text = re.sub(':-?\'?"?(o+|O+|0+)', 'surprised ', text)
    text = re.sub(':-?\'?"?(\)+|>+|D+)', 'smile ', text)
    text = re.sub('(\(+|<+)-?\'?"?:', 'smile ', text)
    text = re.sub(':-?\'?"?\(+', 'sad ', text)
    text = re.sub('(\)+|>+|D+)-?\'?"?:', 'sad ', text)
    
    return text

def text_preprocessing(text):
    # Clean dataset
    text = replace_emoticons(text)                           # convert emoticon to text
    text = emoji.demojize(text, delimiters=("", " "))        # convert emoji to text
    text = remove_not_ASCII(text)                            # remove non-ASCII characters

    text = re.sub('<br />', '', text)                        # remove <br />
    text = re.sub('^https?:\/\/S+', '', text)                # remove URLs
    
    text = re.sub('u/\S+', 'user', text)                     # replace user mentions
    text = re.sub('@\S+', 'user', text)
    text = re.sub('r/\S+', 'subreddit', text)                # replace subreddit mentions
    return text


In [24]:
df['review'] = df['review'].apply(text_preprocessing)
df.head()

Unnamed: 0,review,sentiment
0,family mormon have never tried explain them t...,POSITIVE
1,buddhism has very much lot compatible with chr...,POSITIVE
2,seriously don say thing first all they won get...,NEGATIVE
4,for your own benefit you may want read living ...,POSITIVE
5,you should all sit down together and watch the...,NEGATIVE


In [25]:
df['sentiment'].value_counts()

sentiment
POSITIVE    88079
NEGATIVE    43786
Name: count, dtype: int64

In [26]:
possible_labels = df.sentiment.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'POSITIVE': 0, 'NEGATIVE': 1}

In [27]:
df['label'] = df.sentiment.replace(label_dict)

  df['label'] = df.sentiment.replace(label_dict)


In [28]:
df.head()

Unnamed: 0,review,sentiment,label
0,family mormon have never tried explain them t...,POSITIVE,0
1,buddhism has very much lot compatible with chr...,POSITIVE,0
2,seriously don say thing first all they won get...,NEGATIVE,1
4,for your own benefit you may want read living ...,POSITIVE,0
5,you should all sit down together and watch the...,NEGATIVE,1


In [29]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.2, 
                                                  random_state=11, 
                                                  stratify=df.label.values)


In [30]:
df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
# df.loc[X_test, 'data_type'] = 'test'

In [31]:
df.groupby(['sentiment', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,review
sentiment,label,data_type,Unnamed: 3_level_1
NEGATIVE,1,train,35029
NEGATIVE,1,val,8757
POSITIVE,0,train,70463
POSITIVE,0,val,17616


# Encode data

In [32]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [42]:
import os

train_data_path = '/home/FYP/c200129/playground/test2/encoded_data_train.pickle'
val_data_path = '/home/FYP/c200129/playground/test2/encoded_data_val.pickle'

if os.path.isfile(train_data_path):
    print(f"Encoded train data found at {train_data_path}")
    with open(train_data_path, 'rb') as handle:
        encoded_data_train = pickle.load(handle)
else:
    print("No encoded train data found, encoding train data...")
    encoded_data_train = tokenizer.batch_encode_plus(
        df[df.data_type=='train'].review.values, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        pad_to_max_length=True, 
        max_length=256, 
        return_tensors='pt'
    )

if os.path.isfile(val_data_path):
    print(f"Encoded val data found at {val_data_path}")
    with open(val_data_path, 'rb') as handle:
        encoded_data_val = pickle.load(handle)
else:
    print("No encoded validation data found, encoding validation data...")
    encoded_data_val = tokenizer.batch_encode_plus(
        df[df.data_type=='val'].review.values, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        pad_to_max_length=True, 
        max_length=256, 
        return_tensors='pt'
    )

# encoded_data_test = tokenizer.batch_encode_plus(
#     df[df.data_type == 'test'].review.values, 
#     add_special_tokens=True, 
#     return_attention_mask=True, 
#     pad_to_max_length=True, 
#     max_length=256, 
#     return_tensors='pt'
# )

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

# input_ids_test = encoded_data_test['input_ids']
# attention_masks_test = encoded_data_test['attention_mask']
# labels_test = torch.tensor(df[df.data_type=='test'].label.values)

Encoded train data found at /home/FYP/c200129/playground/test2/encoded_data_train.pickle
Encoded val data found at /home/FYP/c200129/playground/test2/encoded_data_val.pickle


In [39]:
# Save encoded data
import pickle

with open('encoded_data_train.pickle', 'wb') as handle:
    pickle.dump(encoded_data_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('encoded_data_val.pickle', 'wb') as handle:
    pickle.dump(encoded_data_val, handle, protocol=pickle.HIGHEST_PROTOCOL)    

In [43]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
# dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [44]:
len(dataset_train), len(dataset_val)

(105492, 26373)

In [45]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

# dataloader_test = DataLoader(dataset_test, 
#                                    sampler=SequentialSampler(dataset_test), 
#                                    batch_size=batch_size)

In [47]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)

In [48]:
# Using Early stopper to stop when the F1 Score prediction drops 
class EarlyStopper:
    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_f1_score_ = 0
        self.min_accuracy = 0

    def early_stop_f1(self, f1_score):
        if f1_score > self.min_f1_score_:
            self.min_f1_score_ = f1_score
            self.counter = 0
        elif f1_score < (self.min_f1_score_ + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

    def early_stop_accuracy(self, accuracy):
        if accuracy > self.min_accuracy:
            self.min_accuracy = accuracy
            self.counter = 0
        elif accuracy < (self.min_accuracy + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

# Train Model

In [49]:
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

early_stopper = EarlyStopper(patience=3, min_delta=0)

In [50]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    total_preds = 0
    total_correct = 0

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(y_true)
        
        total_preds += len(y_preds)
        num_correct = len(y_preds[y_preds==label])
        total_correct += num_correct
        acc = num_correct/len(y_true)
        
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {acc}')
        print(f'Accuracy (in percentage): {acc*100:.3f}\n')
    
    print(f'Total accuracy: {(total_correct/total_preds)*100:.3f}')
    print('='*50, '\n')

In [51]:
import random

seed_val = 11
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [52]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [53]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [55]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    
    for batch in progress_bar:
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    
    # Check for early stop
    if early_stopper and early_stopper.early_stop_f1(val_f1):
        print(f"Early stopping at epoch ({epoch+1}) due to no improvement in f1 score.")
        break

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/3297 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.21606257921062164
Validation loss: 0.09482081996022977
F1 Score (Weighted): 0.9654441730396272


Epoch 2:   0%|          | 0/3297 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [25]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Test model and get accuracy

In [45]:
for i in range(1, epochs+1):
    model.load_state_dict(torch.load(f'finetuned_BERT_epoch_{i}.model', map_location=torch.device('cuda')))
    print(f'Model {i} loaded, getting predictions...')
    _, predictions, true_vals = evaluate(dataloader_validation)
    accuracy_per_class(predictions, true_vals)

Model 1 loaded, getting predictions...
[0 0 0 ... 0 0 0]
Class: 1
Accuracy: 0.9642370572207084
Accuracy (in percentage): 96.424

[1 1 1 ... 1 1 1]
Class: -1
Accuracy: 0.9572961863439141
Accuracy (in percentage): 95.730

[2 2 2 ... 2 2 2]
Class: 0
Accuracy: 0.9715750915750916
Accuracy (in percentage): 97.158

Total accuracy: 96.522

Model 2 loaded, getting predictions...


KeyboardInterrupt: 

# Test with custom data

In [84]:
model.load_state_dict(torch.load(f'finetuned_BERT_epoch_{epochs}.model', map_location=torch.device('cuda')))

text = input("Enter some text: ")
text = text_preprocessing(text)

encoded_text = tokenizer.encode_plus(
    text, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)
# print(encoded_text)

input_ids_text = encoded_text['input_ids']
attention_masks_text = encoded_text['attention_mask']

dataset = TensorDataset(input_ids_text, attention_masks_text)
dataloader = DataLoader(dataset, 
                        sampler=RandomSampler(dataset), 
                        batch_size=1)

for batch in dataloader:
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1]
             }

    with torch.no_grad():        
        outputs = model(**inputs)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()

#     print(logits)
    pred = np.argmax(logits, axis=1).flatten()[0]
    print(f'Prediction: {list(label_dict.keys())[pred]}')


Enter some text: what is AI
Prediction: NEUTRAL


In [65]:
label_dict

{'POSITIVE': 0, 'NEGATIVE': 1, 'NEUTRAL': 2}