<a href="https://colab.research.google.com/github/SentiBert/Bert-Model/blob/master/DMML2_SentiBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Checking the Colab runtime configuration

In [None]:
import psutil
def get_size(bytes, suffix="B"):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        bytes: Size in bytes (int or float).
        suffix: Text appended after the unit prefix (default ``"B"``).

    Returns:
        A string such as ``"12.72GB"``. Values of 1024 P and above are still
        reported in the largest unit (``"P"``) rather than falling off the end
        of the loop and returning ``None``.
    """
    factor = 1024
    units = ["", "K", "M", "G", "T", "P"]
    size = bytes
    # Walk up the units until the value fits; the last unit is the catch-all.
    for unit in units[:-1]:
        if size < factor:
            return f"{size:.2f}{unit}{suffix}"
        size /= factor
    return f"{size:.2f}{units[-1]}{suffix}"
# Print the RAM figures reported by psutil for this runtime.
banner = "=" * 40
print(banner, "Memory Information", banner)
svmem = psutil.virtual_memory()
print(f"Total: {get_size(svmem.total)}")
print(f"Available: {get_size(svmem.available)}")
print(f"Used: {get_size(svmem.used)}")
print(f"Percentage: {svmem.percent}%")

Total: 12.72GB
Available: 11.88GB
Used: 579.71MB
Percentage: 6.6%


In [None]:
! nvidia-smi

Mon Jul 13 09:45:21 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    35W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## EDA

In [None]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Read the reviews CSV from the mounted Drive and index the rows by ID_TA.
df = pd.read_csv('/content/gdrive/My Drive/data/TAReviewsWithoutNull.csv').set_index('ID_TA')

In [None]:
# Keep only the first two columns (displayed below as Reviews and Rating).
df = df.iloc[:, :2]

In [None]:
df

Unnamed: 0_level_0,Reviews,Rating
ID_TA,Unnamed: 1_level_1,Unnamed: 2_level_1
d11752080,Just like home,5.0
d693419,Great food and staff,4.5
d696959,Satisfaction,4.5
d1239229,True five star dinner,5.0
d6864170,Best meal.... EVER,4.5
...,...,...
d8287558,Good Service,2.5
d5768767,Super local eatery,4.5
d4961508,Cordon Bleu,2.0
d4354266,"Don't waste your time, go somewhere else!",2.0


In [None]:
df.Reviews.iloc[0]

'Just like home'

In [None]:
df.Rating.value_counts()

 4.0    34691
 4.5    27601
 3.5    17266
 5.0     7305
 3.0     6533
 2.5     2016
 2.0      847
 1.0      278
 1.5      268
-1.0       12
Name: Rating, dtype: int64

In [None]:
possible_labels = df.Rating.unique()
print(possible_labels)

[ 5.   4.5  4.   3.5  3.   2.5  2.   1.5  1.  -1. ]


In [None]:
# Assign each distinct rating a class id, in the order the ratings appear in possible_labels.
emotion_dict = {label: index for index, label in enumerate(possible_labels)}

In [None]:
emotion_dict

{-1.0: 9,
 1.0: 8,
 1.5: 7,
 2.0: 6,
 2.5: 5,
 3.0: 4,
 3.5: 3,
 4.0: 2,
 4.5: 1,
 5.0: 0}

In [None]:
# Map every rating to its integer class id. `replace` on the float Rating
# column left the ids as floats (0.0, 1.0, ...), which later become a float
# label tensor that the classification loss rejects. Cast to int64 explicitly.
df['labels'] = df.Rating.map(emotion_dict).astype('int64')
df.head()

Unnamed: 0_level_0,Reviews,Rating,labels
ID_TA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d11752080,Just like home,5.0,0.0
d693419,Great food and staff,4.5,1.0
d696959,Satisfaction,4.5,1.0
d1239229,True five star dinner,5.0,0.0
d6864170,Best meal.... EVER,4.5,1.0


# Train/Test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# The split is expressed as index labels and applied afterwards through
# df.loc[...]. If ID_TA values repeat, a label-based assignment tags every row
# that shares an ID, so rows can silently move between train and test.
# Switch to a unique positional index in that case (ID_TA is kept as a column).
if not df.index.is_unique:
    df = df.reset_index()

# Stratified 85/15 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df.index.values,
    df.labels.values,
    test_size=0.15,
    random_state=17,
    stratify=df.labels.values)

In [None]:
# Placeholder value for every row until the split is applied.
df['data_type'] = 'not_set'

In [None]:
# Tag each row with the split it landed in; X_train / X_test hold index labels,
# so this is a label-based lookup on df's index.
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_test, 'data_type'] = 'test'

In [None]:
df.groupby(['Rating','labels','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Reviews
Rating,labels,data_type,Unnamed: 3_level_1
-1.0,9.0,test,2
-1.0,9.0,train,10
1.0,8.0,test,42
1.0,8.0,train,236
1.5,7.0,test,40
1.5,7.0,train,228
2.0,6.0,test,127
2.0,6.0,train,720
2.5,5.0,test,302
2.5,5.0,train,1714


# Loading tokenizer and encoding our data

In [None]:
# NOTE(review): the version is not pinned; the recorded run below resolved to transformers 3.0.2.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 17.2MB/s eta 0:00:01[K     |▉                               | 20kB 1.7MB/s eta 0:00:01[K     |█▎                              | 30kB 2.1MB/s eta 0:00:01[K     |█▊                              | 40kB 2.4MB/s eta 0:00:01[K     |██▏                             | 51kB 2.0MB/s eta 0:00:01[K     |██▋                             | 61kB 2.2MB/s eta 0:00:01[K     |███                             | 71kB 2.4MB/s eta 0:00:01[K     |███▍                            | 81kB 2.7MB/s eta 0:00:01[K     |███▉                            | 92kB 2.8MB/s eta 0:00:01[K     |████▎                           | 102kB 2.7MB/s eta 0:00:01[K     |████▊                           | 112kB 2.7MB/s eta 0:00:01[K     |█████▏                          | 122kB 2.7M

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
# Load the pretrained, lower-casing BERT tokenizer used to tokenize the reviews.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Every review is padded or truncated to this many tokens.
MAX_SEQ_LENGTH = 256


def encode_reviews(texts):
    """Tokenize review strings into fixed-length BERT inputs.

    Args:
        texts: Sequence of review strings.

    Returns:
        The tokenizer's batch encoding, with 'input_ids' and 'attention_mask'
        returned as PyTorch tensors padded/truncated to MAX_SEQ_LENGTH tokens.
    """
    return tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',   # replaces the deprecated pad_to_max_length=True
        truncation=True,        # request truncation explicitly (silences the warning)
        max_length=MAX_SEQ_LENGTH,
        return_tensors='pt'
    )


train_rows = df[df.data_type == 'train']
test_rows = df[df.data_type == 'test']

# Convert text to an encoded (numeric) form.
encoded_data_train = encode_reviews(train_rows.Reviews.values)
encoded_data_test = encode_reviews(test_rows.Reviews.values)

# Split the encodings into the pieces BERT takes as input. Labels are forced
# to int64 (Long): a float label tensor makes the classification loss raise
# a RuntimeError.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.labels.values, dtype=torch.long)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test_rows.labels.values, dtype=torch.long)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Bundle ids, masks and labels so each dataset item is an
# (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [None]:
len(dataset_train)

82261

In [None]:
len(dataset_test)

14556

# Setting up the BERT Pretrained Model

In [None]:
from transformers import BertForSequenceClassification

In [None]:
# Pretrained BERT with a sequence-classification head, one output per entry in emotion_dict.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(emotion_dict),
    output_attentions=False, output_hidden_states=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Creating Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
batch_size = 16

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Evaluation gains nothing from shuffling. Iterating in dataset order keeps
# predictions aligned with the rows of dataset_test and makes runs repeatable.
dataloader_test = DataLoader(
    dataset_test,
    sampler=SequentialSampler(dataset_test),
    batch_size=batch_size
)

# Setting up optimizer and scheduler

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [None]:
epochs = 10

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps, with no warmup.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_training_steps)

# Performance Metrics

In [None]:
import numpy as np

In [None]:
from sklearn.metrics import f1_score

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    Args:
        preds: Array of per-class scores; the predicted class is the arg-max along axis 1.
        labels: Array of true class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

In [None]:
def accuracy_per_class(preds, labels):
    """Print, for each class present in `labels`, the correct/total prediction counts.

    Args:
        preds: Array of per-class scores; the predicted class is the arg-max along axis 1.
        labels: Array of true class ids (values of `emotion_dict`).
    """
    # Invert rating -> id so the report shows the original rating.
    id_to_rating = dict((class_id, rating) for rating, class_id in emotion_dict.items())

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        hits = int(np.sum(predicted[in_class] == class_id))
        total = int(np.sum(in_class))
        print(f'Class: {id_to_rating[class_id]}')
        print(f'Accuracy: {hits}/{total}\n')

# Creating our training loop

In [None]:
import random
# Seed every random number generator used in this notebook with the same value.
seed_val = 17
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(device)

cuda


In [None]:
def evaluate(dataloader_test):
    """Run the global `model` over a dataloader without gradient tracking.

    Args:
        dataloader_test: Iterable of (input_ids, attention_mask, labels) tensor batches.

    Returns:
        A tuple ``(average loss per batch, logits, labels)`` where logits and
        labels are NumPy arrays concatenated over all batches.
    """
    model.eval()

    running_loss = 0
    all_logits = []
    all_labels = []

    for raw_batch in dataloader_test:
        moved = tuple(tensor.to(device) for tensor in raw_batch)

        with torch.no_grad():
            outputs = model(input_ids=moved[0],
                            attention_mask=moved[1],
                            labels=moved[2])

        # With labels supplied, the first two outputs are read as (loss, logits).
        loss, logits = outputs[0], outputs[1]
        running_loss += loss.item()

        all_logits.append(logits.detach().cpu().numpy())
        all_labels.append(moved[2].cpu().numpy())

    mean_loss = running_loss / len(dataloader_test)

    return (mean_loss,
            np.concatenate(all_logits, axis=0),
            np.concatenate(all_labels, axis=0))

In [None]:
import os

# Create the checkpoint directory up front so a finished epoch is not lost
# to a failed save.
checkpoint_dir = '/content/gdrive/My Drive/Models'
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]}

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the total gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # `batch` is the 3-tuple of tensors, so len(batch) is 3, not the number
        # of samples. Report the batch loss itself instead of loss / 3.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # Save a checkpoint after every epoch.
    torch.save(model.state_dict(), f'{checkpoint_dir}/SentiBERT_ft_epoch{epoch}.model')

    # The f-prefix was missing here, so the literal text "{epoch}" was printed.
    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    # Evaluate on the held-out split after each epoch.
    test_loss, predictions, true_test = evaluate(dataloader_test)
    test_f1 = f1_score_func(predictions, true_test)
    tqdm.write(f'Validation loss: {test_loss}')
    tqdm.write(f'F1 score (weighted): {test_f1}')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=5142.0, style=ProgressStyle(description_wid…




RuntimeError: ignored