# Sentiment Classification with BERT

In [1]:
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

device

device(type='cuda')

## Load Dataset

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('./data/train.csv')
X = df['text']
Y = df['suicide']

In [3]:
sentences = [sen for sen in X]

sentences[: 5]

["It makes me happy to think that I'd rather commit suicide than to live an unhappy lifeAt least that is something that many of the people who are not suicidal cannot do.\n\nI used to worry a lot about my future being worried about if I fuck things up and I create a bad ad unhappy life for me that I'd have to deal for the rest of my life, but now that I know that I can just end it all if things go bad, that makes me feel much better for some reason...",
 "My dad got the Corona Virus... Please pray for hi am ya'll 🥺",
 'the everlasting question why is this art!!!!!!1211111!!!!!',
 'If someone ik finds my reddit acct i am saying someone catfished using my pictures ong Im a loser but i could at least try to hide it 🥴',
 "I lost everything in span of a month.Hey guys, \n\numm...so I'm pretty shook up right now. \n\nLast month the woman who I thought I will marry left me five years into our relationship. \n\nI sort of dealt with it and did my best to move on...got my own place and started t

In [4]:
sen_labels = torch.tensor(Y.values)

sen_labels[: 5]

tensor([1, 0, 0, 0, 1])

## Tokenize

In [5]:
from transformers import BertTokenizer

# get pre-trained tokenizer model
tokenizer = BertTokenizer.from_pretrained('./pretrained/bert-base-uncased')

tokenizer

BertTokenizer(name_or_path='./pretrained/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
max_length = 512

tokenized = tokenizer(sentences, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

print(tokenized['input_ids'][0])
print(tokenized['token_type_ids'][0])
print(tokenized['attention_mask'][0])

tensor([  101,  2009,  3084,  2033,  3407,  2000,  2228,  2008,  1045,  1005,
         1040,  2738, 10797,  5920,  2084,  2000,  2444,  2019, 12511,  2166,
         4017,  2560,  2008,  2003,  2242,  2008,  2116,  1997,  1996,  2111,
         2040,  2024,  2025, 26094,  3685,  2079,  1012,  1045,  2109,  2000,
         4737,  1037,  2843,  2055,  2026,  2925,  2108,  5191,  2055,  2065,
         1045,  6616,  2477,  2039,  1998,  1045,  3443,  1037,  2919,  4748,
        12511,  2166,  2005,  2033,  2008,  1045,  1005,  1040,  2031,  2000,
         3066,  2005,  1996,  2717,  1997,  2026,  2166,  1010,  2021,  2085,
         2008,  1045,  2113,  2008,  1045,  2064,  2074,  2203,  2009,  2035,
         2065,  2477,  2175,  2919,  1010,  2008,  3084,  2033,  2514,  2172,
         2488,  2005,  2070,  3114,  1012,  1012,  1012,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [7]:
print(tokenized['input_ids'].size())
print(tokenized['token_type_ids'].size())
print(tokenized['attention_mask'].size())

torch.Size([185659, 512])
torch.Size([185659, 512])
torch.Size([185659, 512])


In [8]:
sen_ids = tokenized['input_ids']
attention_mask = tokenized['attention_mask']

- Or using code below to do steps separately

``` python
# tokenize
sentences = ["[CLS] " + sen + " [SEP]" for sen in sentences]
sen_tokens = [tokenizer.tokenize(sen) for sen in sentences]
print(sen_tokens[0])

# transform tokens to ids
sen_ids = [tokenizer.convert_tokens_to_ids(sen) for sen in sen_tokens]
print(sen_ids[0])

# padding
max_length = 200
sen_ids = pad_sequences(sen_ids, maxlen=max_length, dtype='long', truncating='post', padding='post')
print(sen_ids[0])

# create attention mask
attention_mask = [[1 if id > 0 else 0 for id in sen] for sen in sen_ids]
print(attention_mask[0])
```

### release memory

In [9]:
del tokenizer
del tokenized
del sentences
del X
del Y
del df

In [14]:
import gc
gc.collect()

7446

## Cross Validation

A stratified 5-fold cross validation will be applied to the model.

### Define training method

In [10]:
# training model
def model_train(model, train_dataloader, optimizer, scheduler, epochs):
    for epoch in range(epochs):
        train_losses = []

        model.train()
        for i, batch_data in enumerate(train_dataloader):
            input_ids, input_masks, input_labels = tuple(data.to(device) for data in batch_data)

            optimizer.zero_grad()
            output =  model(input_ids, attention_mask=input_masks, labels=input_labels, token_type_ids=None)

            loss = output['loss']
            train_losses.append(loss.item())

            loss.backward()
            optimizer.step()
            scheduler.step()
        
        # train_loss.append(np.mean(train_losses))

        print("Epoch: {}/{}".format((epoch + 1), epochs),
              "\n\tTraining Loss: {:.4f}".format(np.mean(train_losses)))
        
    return model

### Define validation method

In [11]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# validation model
def model_validation(model, val_dataloader):
    val_losses = []
    accuracy_score_list, recall_score_list, precision_score_list, f1_score_list = [], [], [], []

    model.eval()
    for batch_data in val_dataloader:
        input_ids, input_masks, input_labels = tuple(data.to(device) for data in batch_data)

        with torch.no_grad():
            preds = model(input_ids, attention_mask=input_masks, labels=input_labels, token_type_ids=None)

        loss = preds['loss']
        val_losses.append(loss.item())
        preds = preds['logits'].detach().to('cpu').numpy()
        labels = input_labels.to('cpu').numpy()

        preds = preds.argmax(1).flatten() # shape = [1, : ]
        labels = labels.flatten()

        # Evaluate model
        AccuracyScore = accuracy_score(labels, preds)
        RecallScore = recall_score(labels, preds)
        PrecisionScore = precision_score(labels, preds)
        F1Score = f1_score(labels, preds)
        
        # Add to lists
        accuracy_score_list.append(AccuracyScore)
        recall_score_list.append(RecallScore)
        precision_score_list.append(PrecisionScore)
        f1_score_list.append(F1Score)

    # val_loss.append(np.mean(val_losses))

    print("Validation Loss: {:.4f}".format(np.mean(val_losses)),
          "Validation Accuracy: {:.4f}%".format(np.mean(accuracy_score_list) * 100))
    
    return np.mean(accuracy_score_list), np.mean(recall_score_list), np.mean(precision_score_list), np.mean(f1_score_list)

In [15]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW


skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=60)
accuracy_score_lists, recall_score_lists, precision_score_lists, f1_score_lists = [], [], [], []

epochs = 4
learning_rate = 5e-5
epsilon = 1e-8

for time, (train_index, val_index) in enumerate(skfolds.split(sen_ids, sen_labels)):
    print('Time: ', time + 1)
    X_train, X_val = sen_ids[train_index], sen_ids[val_index]
    Y_train, Y_val = sen_labels[train_index], sen_labels[val_index]
    mask_train, mask_val = attention_mask[train_index], attention_mask[val_index]

    # pack dataloaders
    batch_size = 32

    train_dataset = TensorDataset(X_train, mask_train, Y_train)
    val_dataset = TensorDataset(X_val, mask_val, Y_val)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    model = BertForSequenceClassification.from_pretrained('./pretrained/bert-base-uncased', num_labels=2)
    model = model.cuda()

    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                                                num_training_steps=len(train_dataloader) * epochs)

    # train model
    model = model_train(model, train_dataloader, optimizer, scheduler, epochs)

    # validate model
    AccuracyScore, RecallScore, PrecisionScore, F1Score = model_validation(model, val_dataloader)

    accuracy_score_lists.append(AccuracyScore)
    recall_score_lists.append(RecallScore)
    precision_score_lists.append(PrecisionScore)
    f1_score_lists.append(F1Score)

print("Accuracy: {:.2%}".format(np.average(accuracy_score_lists)))
print("Recall: {:.2%}".format(np.average(recall_score_lists)))
print("Precision: {:.2%}".format(np.average(precision_score_lists)))
print("F1_score: {:.2%}".format(np.average(f1_score_lists)))

Time:  1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./pretrained/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1/4 
	Training Loss: 0.0916
Epoch: 2/4 
	Training Loss: 0.0426
Epoch: 3/4 
	Training Loss: 0.0169
Epoch: 4/4 
	Training Loss: 0.0043
Validation Loss: 0.1037 Validation Accuracy: 97.9140%
Time:  2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./pretrained/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1/4 
	Training Loss: 0.0913
Epoch: 2/4 
	Training Loss: 0.0418
Epoch: 3/4 
	Training Loss: 0.0169
Epoch: 4/4 
	Training Loss: 0.0045
Validation Loss: 0.1112 Validation Accuracy: 97.7040%
Time:  3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./pretrained/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1/4 
	Training Loss: 0.0920
Epoch: 2/4 
	Training Loss: 0.0431
Epoch: 3/4 
	Training Loss: 0.0179
Epoch: 4/4 
	Training Loss: 0.0055
Validation Loss: 0.0989 Validation Accuracy: 97.7982%
Time:  4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./pretrained/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1/4 
	Training Loss: 0.0899
Epoch: 2/4 
	Training Loss: 0.0408
Epoch: 3/4 
	Training Loss: 0.0157
Epoch: 4/4 
	Training Loss: 0.0045
Validation Loss: 0.1025 Validation Accuracy: 97.9355%
Time:  5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./pretrained/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1/4 
	Training Loss: 0.0898
Epoch: 2/4 
	Training Loss: 0.0418
Epoch: 3/4 
	Training Loss: 0.0168
Epoch: 4/4 
	Training Loss: 0.0049
Validation Loss: 0.1038 Validation Accuracy: 97.8117%
Accuracy: 97.83%
Recall: 97.77%
Precision: 97.89%
F1_score: 97.76%


## Save model

### Train the model with all data and save it

In [16]:
train_dataset = TensorDataset(sen_ids, attention_mask, sen_labels)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model = BertForSequenceClassification.from_pretrained('./pretrained/bert-base-uncased', num_labels=2)
model = model.cuda()

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                                                num_training_steps=len(train_dataloader) * epochs)

# train model
model = model_train(model, train_dataloader, optimizer, scheduler, epochs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./pretrained/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1/4 
	Training Loss: 0.0886
Epoch: 2/4 
	Training Loss: 0.0420
Epoch: 3/4 
	Training Loss: 0.0182
Epoch: 4/4 
	Training Loss: 0.0056


RuntimeError: Parent directory ./model does not exist.

In [17]:
torch.save({
    'state_dict': model.state_dict(),
    'config': model.config
}, './model/Bert_classifier.pth')

using code below to load the model:

``` python
checkpoint = torch.load('./model/Bert_classifier.pth')

model = BertForSequenceClassification.from_pretrained('./pretrained/bert-base-uncased', config=checkpoint['config'])
model.load_state_dict(checkpoint['state_dict'])
```