# Reviews Classification with BERT
Bidirectional Encoder Representations from Transformers. BERT is a **text representation technique** which is a fusion of variety of state-of-the-art deep learning algorithms, such as bidirectional encoder LSTM and Transformers.

In [14]:
df = pd.read_csv('Data\cleaned_reviews.csv')
df.head()

Unnamed: 0,hotel_name,reviews,label
0,فندق 72,“ممتاز”. النظافة والطاقم متعاون.,0
1,فندق 72,استثنائي. سهولة إنهاء المعاملة في الاستقبال. ل...,1
2,فندق 72,استثنائي. انصح بأختيار الاسويت و بالاخص غرفه ر...,1
3,فندق 72,“استغرب تقييم الفندق كخمس نجوم”. لا شي. يستحق ...,0
4,فندق 72,جيد. المكان جميل وهاديء. كل شي جيد ونظيف بس كا...,1


In [80]:
index = [] 
for i,j in enumerate(df['reviews']):
    index.append(i)
df['index'] = index

### Using Pytoch

In [85]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [86]:
X_train, X_val, y_train, y_val = train_test_split(df[:1000].index.values, 
                                                 df[:1000].label.values,
                                                 test_size = .25,
                                                 random_state = 14,
                                                 stratify = df[:1000].label.values)

In [7]:
df['data_type'] = ['not_set']*df.shape[0]

In [8]:
df.loc[X_train , 'data_type'] = 'train'
df.loc[X_val , 'data_type'] = 'val'

In [9]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case = True
)

In [11]:
encoded_train_data = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'][:1000].reviews.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_val_data = tokenizer.batch_encode_plus(
    df[df.data_type == 'val'][:1000].reviews.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_train_data['input_ids']
attention_mask_train = encoded_train_data['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'][:1000].label.values)

input_ids_val = encoded_val_data['input_ids']
attention_mask_val = encoded_val_data['attention_mask']
labels_val = torch.tensor(df[df.data_type == 'val'][:1000].label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
train_dataset = TensorDataset(input_ids_train, attention_mask_train, labels_train)
val_dataset = TensorDataset(input_ids_val, attention_mask_val, labels_val)

In [13]:
len(train_dataset), len(val_dataset)

(750, 250)

In [14]:
from transformers import BertForSequenceClassification

In [15]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [18]:
data_loader_train = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = 32
    )

data_loader_val = DataLoader(
    val_dataset,
    sampler = RandomSampler(val_dataset),
    batch_size = 32
    )

In [19]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [20]:
optimizer = AdamW(
    model.parameters(),
    eps=1e-8,
    lr=1e-5
)

In [21]:
epochs = 2
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(data_loader_train)*10 # 10: number of epochs....
)

In [22]:
import numpy as np

In [23]:
from sklearn.metrics import f1_score

In [24]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [25]:
def accuracy_per_class(preds, labels):
    labels_inv = {v : k for k, v in labels.items()}
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'class: {label_inv[label]}')
        print(f'accuracy: {len(y_pred[y_preds == label])}/{len(y_ture)}\n')

In [26]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [27]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


In [28]:
# model training loop..  it takes a very long time .
# in this cell we re-train the BERT model on our dataset.

for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    progress_par = tqdm(data_loader_train,
                       desc = 'Epoch {:1d}'.format(epoch),
                        leave = False,
                        disable = False
                       )
    for batch in progress_par:
        model.zero_grad()
        batch = tuple(b for b in batch)
        inputs = {
            'input_ids' :batch[0],
            'attention_mask' :batch[1],
            'labels' :batch[2]
        }
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_par.set_postfix({'training_loss' : '{:.3f}'.format(loss.item()/len(batch))})
        
    torch.save(model.state_dict(), f'Models/BERT_ft_epoch{epoch}.model')
    tqdm.write(f'epoch: {epoch}')
    loss_train_avg = loss_train_total/len(data_loader_train)
    val_loss, preds, true_vals = evaluate(data_loader_val)
    val_f1 = f1_score_func(preds, true_vals)
    tqdm.write(f'accuracy: {val_f1} - train_loss: {loss_train_avg} - val_loss: {val_loss}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=24.0), HTML(value='')))

epoch: 1
accuracy: 0.6626595365418894 - train_loss: 0.6668354471524557 - val_loss: 0.6336354985833168


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=24.0), HTML(value='')))

epoch: 2
accuracy: 0.7812026483271614 - train_loss: 0.5870761747161547 - val_loss: 0.5054949298501015



### Using tensorflow

we will import **tensorflow_hub**, which basically is a place where you can find all the prebuilt and pretrained models developed in TensorFlo.

In [9]:
# !pip install bert-for-tf2
# !pip install sentencepiece
# !pip install tensorflow-hub

In [81]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import bert
import random
import math

from tensorflow.keras import layers

In [83]:
needed = ['reviews', 'label']
df = df[needed]
df.head(10)

Unnamed: 0,reviews,label
0,“ممتاز”. النظافة والطاقم متعاون.,0
1,استثنائي. سهولة إنهاء المعاملة في الاستقبال. ل...,1
2,استثنائي. انصح بأختيار الاسويت و بالاخص غرفه ر...,1
3,“استغرب تقييم الفندق كخمس نجوم”. لا شي. يستحق ...,0
4,جيد. المكان جميل وهاديء. كل شي جيد ونظيف بس كا...,1
5,ممتاز. موقع الفندق ونظافته والاطلاله على البحر...,1
6,“جيدجداً”. الافطار جيد والسرير ممتاز ومريح واط...,1
7,“فندق ممتاز”. الاثاث، النظافه.,1
8,“الراحة و الهدوء”. مكان مناسب ومريح انصح به خ...,1
9,استثنائي. المكان روعه تحديدا الغرف المطله على ...,1


In [84]:
df.isnull().values.any()
df.shape

(105698, 2)

**BERT MODEL**

In [31]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False) # will not be training the BERT embedding
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [32]:
def tokenize_reviews(text_reviews):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_reviews))

In [34]:
tokenized_reviews = [tokenize_reviews(review) for review in df.reviews]

In [38]:
y = np.array(df['label'])
# y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))

In [39]:
reviews_with_len = [[review, y[i], len(review)]
                 for i, review in enumerate(tokenized_reviews)]

In [42]:
random.shuffle(reviews_with_len)

In [75]:
# sort the data by the length of the reviews and remove the length attribute from all the reviews.
reviews_with_len.sort(key=lambda x: x[2])
sorted_reviews_labels = [(review_lab[0], review_lab[1]) for review_lab in reviews_with_len] 

In [45]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_reviews_labels, output_types=(tf.int32, tf.int32))

In [46]:
BATCH_SIZE = 32 # after processing 32 reviews, the weights of the neural network will be updated...
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [63]:
# split the dataset into subsets for training and testing...
TOTAL_BATCHES = math.ceil(len(sorted_reviews_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [50]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [59]:
text_model = TEXT_MODEL(vocabulary_size = len(tokenizer.vocab),
                        embedding_dimensions = 200,
                        cnn_filters = 100,
                        dnn_units = 256,
                        model_output_classes = 2,
                        dropout_rate = 0.2)

In [68]:
def model_compile(model_name : str, out_put_classes : int):
    if out_put_classes == 2:
        return model_name.compile(loss="binary_crossentropy",
                           optimizer="adam",
                           metrics=["accuracy"])
    else:
        return model_name.compile(loss="sparse_categorical_crossentropy",
                           optimizer="adam",
                           metrics=["sparse_categorical_accuracy"])

In [None]:
model_compile(text_model, 2)

In [61]:
text_model.fit(train_data, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x25c829de108>

In [65]:
results = text_model.evaluate(test_data)
print(results)

[0.10359612852334976, 0.965624988079071]
