In [1]:
import re
import emoji
import time
import string

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader

from transformers import BertForSequenceClassification

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [46]:
# Load reddit dataset
reddit_df = pd.read_csv('Reddit_Data.csv',
                   names=['review', 'sentiment']).drop(0).reset_index(drop=True)

twitter_df = pd.read_csv('Twitter_Data.csv',
                         names=['review', 'sentiment']).drop(0).reset_index(drop=True)

df = pd.concat([reddit_df, twitter_df]).reset_index(drop=True)

# Change dtype of sentiment to int
# data['sentiment'] = data['sentiment'].astype(int)
df['sentiment'] = df['sentiment'].map({'-1': 'NEGATIVE', '0': 'NEUTRAL', '1': 'POSITIVE'})
df.head()

Unnamed: 0,review,sentiment
0,family mormon have never tried explain them t...,POSITIVE
1,buddhism has very much lot compatible with chr...,POSITIVE
2,seriously don say thing first all they won get...,NEGATIVE
3,what you have learned yours and only yours wha...,NEUTRAL
4,for your own benefit you may want read living ...,POSITIVE


In [47]:
df.isnull().sum()

review       104
sentiment      7
dtype: int64

In [48]:
# Drop null
df = df.dropna()
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [49]:
def remove_not_ASCII(text):
    text = str(text)
    text = ''.join([word for word in text if word in string.printable])
#     print(text)
    return text

def replace_emoticons(text):
    text = text.replace("<3", "heart ")
    text = re.sub('>:-?\'?"?\(+', 'angry ', text)
    text = re.sub('\)+:-?\'?"?:<', 'angry ', text)
    text = re.sub(':-?\'?"?(o+|O+|0+)', 'surprised ', text)
    text = re.sub(':-?\'?"?(\)+|>+|D+)', 'smile ', text)
    text = re.sub('(\(+|<+)-?\'?"?:', 'smile ', text)
    text = re.sub(':-?\'?"?\(+', 'sad ', text)
    text = re.sub('(\)+|>+|D+)-?\'?"?:', 'sad ', text)
    
    return text

def text_preprocessing(text):
    # Clean dataset
    text = remove_not_ASCII(text)                            # remove non-ASCII characters
    text = replace_emoticons(text)                           # convert emoticon to text
    text = emoji.demojize(text, delimiters=("", " "))        # convert emoji to text

    text = re.sub('<br />', '', text)                        # remove <br />
    text = re.sub('^https?:\/\/S+', '', text)                # remove URLs
    
    text = re.sub('u/\S+', 'user', text)                     # replace user mentions
    text = re.sub('@\S+', 'user', text)
    text = re.sub('r/\S+', 'subreddit', text)                # replace subreddit mentions
    return text

df['review'] = df['review'].apply(text_preprocessing)
df.head()

Unnamed: 0,review,sentiment
0,family mormon have never tried explain them t...,POSITIVE
1,buddhism has very much lot compatible with chr...,POSITIVE
2,seriously don say thing first all they won get...,NEGATIVE
3,what you have learned yours and only yours wha...,NEUTRAL
4,for your own benefit you may want read living ...,POSITIVE


In [50]:
df['sentiment'].value_counts()

sentiment
POSITIVE    88079
NEUTRAL     68253
NEGATIVE    43786
Name: count, dtype: int64

In [51]:
possible_labels = df.sentiment.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'POSITIVE': 0, 'NEGATIVE': 1, 'NEUTRAL': 2}

In [52]:
df['label'] = df.sentiment.replace(label_dict)

  df['label'] = df.sentiment.replace(label_dict)


In [53]:
df.head()

Unnamed: 0,review,sentiment,label
0,family mormon have never tried explain them t...,POSITIVE,0
1,buddhism has very much lot compatible with chr...,POSITIVE,0
2,seriously don say thing first all they won get...,NEGATIVE,1
3,what you have learned yours and only yours wha...,NEUTRAL,2
4,for your own benefit you may want read living ...,POSITIVE,0


In [54]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.2, 
                                                  random_state=11, 
                                                  stratify=df.label.values)

X_val, X_test, y_val, y_test = train_test_split(X_temp, 
                                                  y_temp, 
                                                  test_size=0.5, 
                                                  random_state=11, 
                                                  stratify=y_temp)

In [55]:
df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df.loc[X_test, 'data_type'] = 'test'

In [56]:
df.groupby(['sentiment', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,review
sentiment,label,data_type,Unnamed: 3_level_1
NEGATIVE,1,test,4378
NEGATIVE,1,train,35029
NEGATIVE,1,val,4379
NEUTRAL,2,test,6826
NEUTRAL,2,train,54602
NEUTRAL,2,val,6825
POSITIVE,0,test,8808
POSITIVE,0,train,70463
POSITIVE,0,val,8808


In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [14]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].review.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].review.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_test = tokenizer.batch_encode_plus(
    df[df.data_type == 'test'].review.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(df[df.data_type=='test'].label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [15]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [16]:
len(dataset_train), len(dataset_val), len(dataset_test)

(160094, 20012, 20012)

In [17]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

In [19]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)



In [20]:
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [41]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    total_preds = 0
    total_correct = 0

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(y_true)
        
        total_preds += len(y_preds)
        num_correct = len(y_preds[y_preds==label])
        total_correct += num_correct
        acc = num_correct/len(y_true)
        
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {acc}')
        print(f'Accuracy (in percentage): {acc*100:.3f}\n')
    
    print(f'Total accuracy: {(total_correct/total_preds)*100:.3f}')
    print('='*50, '\n')

In [42]:
import random

seed_val = 11
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [43]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [44]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [90]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    
    for batch in progress_bar:
        model.zero_grad()
        for b in batch:
            print(b)

        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/5003 [00:00<?, ?it/s]

tensor([[ 101, 2023, 4861,  ...,    0,    0,    0],
        [ 101, 2024, 5186,  ...,    0,    0,    0],
        [ 101, 2045, 7579,  ...,    0,    0,    0],
        ...,
        [ 101, 1996, 2111,  ...,    0,    0,    0],
        [ 101, 2298, 1996,  ...,    0,    0,    0],
        [ 101, 3519, 2106,  ...,    0,    0,    0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([0, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 1, 2, 2, 2, 2, 0, 2, 2, 1, 2,
        0, 1, 2, 2, 2, 0, 0, 0])
tensor([[  101, 16913,  2072,  ...,     0,     0,     0],
        [  101,  2909,  5060,  ...,     0,     0,     0],
        [  101,  2010,  3265,  ...,     0,     0,     0],
        ...,
        [  101, 15125, 22297,  ...,     0,     0,     0],
        [  101,  1996,  2878,  ...,     0,     0,     0],
        [  101, 25312, 29560,  ...,     

KeyboardInterrupt: 

In [25]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [45]:
for i in range(1, epochs+1):
    model.load_state_dict(torch.load(f'finetuned_BERT_epoch_{i}.model', map_location=torch.device('cuda')))
    print(f'Model {i} loaded, getting predictions...')
    _, predictions, true_vals = evaluate(dataloader_validation)
    accuracy_per_class(predictions, true_vals)

Model 1 loaded, getting predictions...
[0 0 0 ... 0 0 0]
Class: 1
Accuracy: 0.9642370572207084
Accuracy (in percentage): 96.424

[1 1 1 ... 1 1 1]
Class: -1
Accuracy: 0.9572961863439141
Accuracy (in percentage): 95.730

[2 2 2 ... 2 2 2]
Class: 0
Accuracy: 0.9715750915750916
Accuracy (in percentage): 97.158

Total accuracy: 96.522

Model 2 loaded, getting predictions...


KeyboardInterrupt: 

In [84]:
model.load_state_dict(torch.load(f'finetuned_BERT_epoch_{epochs}.model', map_location=torch.device('cuda')))

text = input("Enter some text: ")
text = text_preprocessing(text)

encoded_text = tokenizer.encode_plus(
    text, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)
# print(encoded_text)

input_ids_text = encoded_text['input_ids']
attention_masks_text = encoded_text['attention_mask']

dataset = TensorDataset(input_ids_text, attention_masks_text)
dataloader = DataLoader(dataset, 
                        sampler=RandomSampler(dataset), 
                        batch_size=1)

for batch in dataloader:
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1]
             }

    with torch.no_grad():        
        outputs = model(**inputs)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()

#     print(logits)
    pred = np.argmax(logits, axis=1).flatten()[0]
    print(f'Prediction: {list(label_dict.keys())[pred]}')


Enter some text: what is AI
Prediction: NEUTRAL


In [65]:
label_dict

{'POSITIVE': 0, 'NEGATIVE': 1, 'NEUTRAL': 2}