# Sentiment Analysis withBERT

In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
df = pd.read_csv('Data\cleaned_reviews.csv')
df.head()

Unnamed: 0,hotel_name,reviews,label
0,فندق 72,“ممتاز”. النظافة والطاقم متعاون.,0
1,فندق 72,استثنائي. سهولة إنهاء المعاملة في الاستقبال. ل...,1
2,فندق 72,استثنائي. انصح بأختيار الاسويت و بالاخص غرفه ر...,1
3,فندق 72,“استغرب تقييم الفندق كخمس نجوم”. لا شي. يستحق ...,0
4,فندق 72,جيد. المكان جميل وهاديء. كل شي جيد ونظيف بس كا...,1


In [3]:
index = [] 
for i,j in enumerate(df['reviews']):
    index.append(i)

In [4]:
df['index'] = index

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_val, y_train, y_val = train_test_split(df[:1000].index.values, 
                                                 df[:1000].label.values,
                                                 test_size = .25,
                                                 random_state = 14,
                                                 stratify = df[:1000].label.values)

In [7]:
df['data_type'] = ['not_set']*df.shape[0]

In [8]:
df.loc[X_train , 'data_type'] = 'train'
df.loc[X_val , 'data_type'] = 'val'

In [9]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case = True
)

In [11]:
encoded_train_data = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'][:1000].reviews.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_val_data = tokenizer.batch_encode_plus(
    df[df.data_type == 'val'][:1000].reviews.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_train_data['input_ids']
attention_mask_train = encoded_train_data['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'][:1000].label.values)

input_ids_val = encoded_val_data['input_ids']
attention_mask_val = encoded_val_data['attention_mask']
labels_val = torch.tensor(df[df.data_type == 'val'][:1000].label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
train_dataset = TensorDataset(input_ids_train, attention_mask_train, labels_train)
val_dataset = TensorDataset(input_ids_val, attention_mask_val, labels_val)

In [13]:
len(train_dataset), len(val_dataset)

(750, 250)

In [14]:
from transformers import BertForSequenceClassification

In [15]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [18]:
data_loader_train = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = 32
    )

data_loader_val = DataLoader(
    val_dataset,
    sampler = RandomSampler(val_dataset),
    batch_size = 32
    )

In [19]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [20]:
optimizer = AdamW(
    model.parameters(),
    eps=1e-8,
    lr=1e-5
)

In [21]:
epochs = 2
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(data_loader_train)*10 # 10: number of epochs....
)

In [22]:
import numpy as np

In [23]:
from sklearn.metrics import f1_score

In [24]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [25]:
def accuracy_per_class(preds, labels):
    labels_inv = {v : k for k, v in labels.items()}
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'class: {label_inv[label]}')
        print(f'accuracy: {len(y_pred[y_preds == label])}/{len(y_ture)}\n')

In [26]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [27]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


In [28]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    progress_par = tqdm(data_loader_train,
                       desc = 'Epoch {:1d}'.format(epoch),
                        leave = False,
                        disable = False
                       )
    for batch in progress_par:
        model.zero_grad()
        batch = tuple(b for b in batch)
        inputs = {
            'input_ids' :batch[0],
            'attention_mask' :batch[1],
            'labels' :batch[2]
        }
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_par.set_postfix({'training_loss' : '{:.3f}'.format(loss.item()/len(batch))})
        
    torch.save(model.state_dict(), f'Models/BERT_ft_epoch{epoch}.model')
    tqdm.write(f'epoch: {epoch}')
    loss_train_avg = loss_train_total/len(data_loader_train)
    val_loss, preds, true_vals = evaluate(data_loader_val)
    val_f1 = f1_score_func(preds, true_vals)
    tqdm.write(f'accuracy: {val_f1} - train_loss: {loss_train_avg} - val_loss: {val_loss}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=24.0), HTML(value='')))

epoch: 1
accuracy: 0.6626595365418894 - train_loss: 0.6668354471524557 - val_loss: 0.6336354985833168


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=24.0), HTML(value='')))

epoch: 2
accuracy: 0.7812026483271614 - train_loss: 0.5870761747161547 - val_loss: 0.5054949298501015



In [29]:
# laod a model.. 
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                       num_labels=len(labels),
#                                                       output_attentions=False,
#                                                       output_hidden_states=False)

In [30]:
# model.load_state_dict(torch.load('Models/finetuned_bert_epoch_1_gpu_trained.model', map_location = torch.device('cpu')))