In [1]:
import torch
import pandas as pd
from tqdm.notebook import trange, tqdm

In [2]:
# tqdm is a fast, extensible progress bar for Python and the CLI
for i in trange(10):
    print(i)

  0%|          | 0/10 [00:00<?, ?it/s]

0
1
2
3
4
5
6
7
8
9


In [3]:
torch.cuda.is_available()

False

In [4]:
# Load the annotated tweets; column names are supplied explicitly and the
# tweet id becomes the index.
csv_path = 'D:/filefile/PG study/Deep-Learning-using-BERT-main/Data/smile-annotations-final.csv'
column_names = ['id', 'text', 'category']
df = pd.read_csv(csv_path, names=column_names).set_index('id')
# Show the first tweet as a quick sanity check.
df['text'].iloc[0]

'@aandraous @britishmuseum @AndrewsAntonio Merci pour le partage! @openwinemap'

In [5]:
df.category.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: category, dtype: int64

In [6]:
# Keep only tweets annotated with exactly one emotion: drop multi-label rows
# (categories joined by '|') and the 'nocode' class.
# regex=False matches the pipe character literally; the earlier pattern '\|'
# was an invalid escape sequence in a normal string, which Python warns about.
is_multi_label = df.category.str.contains('|', regex=False)
is_nocode = df.category == 'nocode'
# .copy() gives an independent frame so that columns added in later cells
# are not written into a filtered view of the original data.
df = df[~is_multi_label & ~is_nocode].copy()
df.category.value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64

In [7]:
# Give every distinct category an integer id, numbered in the order that
# `unique()` returns them.
possible_labels = df.category.unique()
label_dict = {name: idx for idx, name in enumerate(possible_labels)}
label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

In [8]:
# Encode each category as its integer id. Series.map performs the dictionary
# lookup directly; Series.replace only produced integers through implicit
# dtype downcasting, which newer pandas releases deprecate.
df['label'] = df.category.map(label_dict)
df.head()

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0


In [9]:
df['text'].iloc[0]

'Dorian Gray with Rainbow Scarf #LoveWins (from @britishmuseum http://t.co/Q4XSwL0esu) http://t.co/h0evbTBWRq'

In [10]:
df.index.values

array([614484565059596288, 614746522043973632, 614877582664835073, ...,
       613678555935973376, 615246897670922240, 613016084371914753],
      dtype=int64)

In [11]:
df.label.values

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [12]:
#Training/Validation Split
from sklearn.model_selection import train_test_split
# Split tweet ids (not the texts) 85/15, stratified on the label so both
# splits keep the same class proportions.
sample_ids = df.index.values
sample_labels = df.label.values
X_train, X_val, y_train, y_val = train_test_split(
    sample_ids, sample_labels,
    test_size=0.15, random_state=17,
    stratify=sample_labels,
)

In [13]:
X_train

array([614767094345936896, 610755488372948992, 610609791073931266, ...,
       613744184495894529, 610873494910443520, 610741907426267136],
      dtype=int64)

In [14]:
y_train

array([0, 0, 0, ..., 0, 0, 2], dtype=int64)

In [15]:
# Placeholder split marker for every row; overwritten once the ids are assigned.
df['data_type'] = 'no_set'

In [16]:
X_train

array([614767094345936896, 610755488372948992, 610609791073931266, ...,
       613744184495894529, 610873494910443520, 610741907426267136],
      dtype=int64)

In [17]:
# Tag each row with the split its id landed in, then count rows per
# category / label / split.
for split_name, split_ids in (('train', X_train), ('val', X_val)):
    df.loc[split_ids, 'data_type'] = split_name
df.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,train,48
angry,2,val,9
disgust,3,train,5
disgust,3,val,1
happy,0,train,966
happy,0,val,171
not-relevant,1,train,182
not-relevant,1,val,32
sad,4,train,27
sad,4,val,5


In [18]:
#Loading Tokenizer and Encoding our data
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
# Tokenizer for the uncased BERT checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Boolean mask marking the rows that belong to the training split.
df['data_type'] == 'train'

id
614484565059596288    True
614746522043973632    True
614877582664835073    True
611932373039644672    True
611570404268883969    True
                      ... 
611258135270060033    True
612214539468279808    True
613678555935973376    True
615246897670922240    True
613016084371914753    True
Name: data_type, Length: 1481, dtype: bool

In [19]:
df[df.data_type=='train']

Unnamed: 0_level_0,text,category,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0,train
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0,train
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0,train
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0,train
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0,train
...,...,...,...,...
611258135270060033,@_TheWhitechapel @Campaignforwool @SlowTextile...,not-relevant,1,train
612214539468279808,“@britishmuseum: Thanks for ranking us #1 in @...,happy,0,train
613678555935973376,MT @AliHaggett: Looking forward to our public ...,happy,0,train
615246897670922240,@MrStuchbery @britishmuseum Mesmerising.,happy,0,train


In [20]:
def _encode_split(split_name):
    """Tokenize all tweets of one split into padded PyTorch tensors.

    Args:
        split_name: value of the ``data_type`` column to select
            ('train' or 'val').

    Returns:
        The tokenizer's batch encoding, containing ``input_ids`` and
        ``attention_mask`` as PyTorch tensors (``return_tensors='pt'``).
    """
    # Pass a plain list of strings, the batch input form the tokenizer expects.
    texts = df.loc[df.data_type == split_name, 'text'].tolist()
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors='pt'
    )


# Encode both splits with identical settings.
encoded_data_train = _encode_split('train')
encoded_data_val = _encode_split('val')

In [21]:
# Rows of each split, used to pull the labels for that split.
train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

# Training tensors: token ids, attention masks, integer labels.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows['label'].values)

# Validation tensors, built the same way.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows['label'].values)

In [22]:
# Bundle ids, masks and labels into one TensorDataset per split; each item
# is the tuple (input_ids, attention_mask, label).
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [23]:
print(len(dataset_train),len(dataset_val))

1258 223


In [24]:
#Setting up BERT Pretrained Model
from transformers import BertForSequenceClassification
# Pretrained BERT with a sequence-classification head sized to our label set.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),      # one output per category
    output_attentions=False,         # attention maps are not needed
    output_hidden_states=False,      # nor are intermediate hidden states
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [25]:
#Creating Data Loaders
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

#In Google Colab -- GPU Instance (k80)
#batch_size =32
#epoch =10

batch_size = 4 #32

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(dataset_train)
val_sampler = SequentialSampler(dataset_val)

dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, sampler=val_sampler, batch_size=batch_size)

In [26]:
#Setting Up Optimizer and Scheduler
#AdamW is a variant of the optimizer Adam that has an improved implementation of weight decay
from transformers import get_linear_schedule_with_warmup

# Use PyTorch's AdamW: the AdamW class in transformers is deprecated and no
# longer available in newer releases. weight_decay=0.0 keeps the default of
# the transformers implementation (torch's own default is 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,  # typical fine-tuning range is 2e-5 to 5e-5
    eps=1e-8,
    weight_decay=0.0,
)

epochs = 10

# Linear learning-rate decay with no warmup; the schedule length is one
# scheduler step per training batch over all epochs.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*epochs
)

In [27]:
#Defining the Performance Metrics
import numpy as np
from sklearn.metrics import f1_score

#preds=[0.9 0.05 0.05 0 0 0]
#preds = [1 0 0 0 0]

def f1_score_func(preds,labels):
    """Return the weighted F1 score of the argmax predictions.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: array of true class ids; flattened before scoring.

    Returns:
        The value of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels, label_names=None):
    """Print, for each class present in ``labels``, how many samples were predicted correctly.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the
            predicted class is the argmax of each row.
        labels: array of true integer class ids; flattened before use.
        label_names: optional mapping of class name -> integer id used to
            print readable class names. Defaults to the notebook-level
            ``label_dict``.

    Returns:
        None. Results are printed as ``Class:<name>`` followed by
        ``Accuracy:<correct>/<total>``.
    """
    if label_names is None:
        label_names = label_dict
    id_to_name = {class_id: name for name, class_id in label_names.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        in_class = labels_flat == label
        n_correct = int((preds_flat[in_class] == label).sum())
        n_total = int(in_class.sum())
        print(f'Class:{id_to_name[label]}')
        print(f'Accuracy:{n_correct}/{n_total}\n')

In [28]:
#Creating our Training Loop
import random

seed_val = 17
# Seed every RNG in play: Python, NumPy, PyTorch and all CUDA devices.
# NOTE(review): the model was created in an earlier cell, before this
# seeding runs — confirm whether its initialisation needs to be covered too.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)
print(device)

#cuda

cpu


In [38]:
'''to save memory during evaluation and test, 
you could wrap the validation and test code into a with torch.no_grad() block.'''
# with torch.no_grad():
#     model.eval()
#     y_pred = model(valX)
#     val_loss = criterion(y_pred, valY)
#NameError: name 'valX' is not defined

# with torch.no_grad():
#     model.eval()
#     y_pred = model(test)
#     test_loss = criterion(y_pred, testY)

#NameError: name 'test' is not defined

'to save memory during evaluation and test, \nyou could wrap the validation and test code into a with torch.no_grad() block.'

In [29]:
def evaluate(dataloader_val):
    """Run the model over a validation loader without computing gradients.

    Args:
        dataloader_val: iterable of batches; each batch is indexed as
            (input_ids, attention_mask, labels).

    Returns:
        Tuple ``(loss_val_avg, predictions, true_vals)``: the summed batch
        losses divided by the number of batches, the logits of all batches
        concatenated along axis 0, and the matching labels, both as NumPy
        arrays.
    """
    model.eval()
    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in tqdm(dataloader_val):
        on_device = [tensor.to(device) for tensor in batch]
        # Forward pass only; no graph is built for evaluation.
        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=on_device[2],
            )

        # outputs[0] is the loss, outputs[1] the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(on_device[2].cpu().numpy())

    return (
        running_loss / len(dataloader_val),
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )

In [40]:
import os

# Fine-tune for `epochs` passes over the training data, reporting the
# training loss, validation loss and weighted F1 after every epoch.
for epoch in tqdm(range(1,epochs+1)):
    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids':batch[0],
            'attention_mask':batch[1],
            'labels':batch[2]
        }
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        # Show the batch loss as-is. It was previously divided by len(batch),
        # but `batch` is the 3-tuple of tensors, so that divided by 3 rather
        # than by the number of samples.
        progress_bar.set_postfix(
            {'training_loss': '{:.3f}'.format(loss.item())})

    # f-string so the epoch number is interpolated (it used to print the
    # literal text "{epoch}").
    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss:{loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

# Save the weights once, after the last epoch. Create the target folder first
# so the save cannot fail on a missing directory at the end of a long run.
os.makedirs('Models', exist_ok=True)
torch.save(model.state_dict(), f'Models/BERT_fit_epoch{epoch}.model')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss:0.8254267480993082


  0%|          | 0/56 [00:00<?, ?it/s]

Validation0.6086587038423333
F1 Score (weighted): 0.7870316200233131


Epoch 2:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss:0.4992410193389607


  0%|          | 0/56 [00:00<?, ?it/s]

Validation0.6581442244600372
F1 Score (weighted): 0.78784939594867


Epoch 3:   0%|          | 0/315 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [41]:
#Evaluating our Model
accuracy_per_class(predictions, true_vals)

Class:happy
Accuracy:165/171

Class:not-relevant
Accuracy:19/32

Class:angry
Accuracy:0/9

Class:disgust
Accuracy:0/1

Class:sad
Accuracy:0/5

Class:surprise
Accuracy:0/5

