<a href="https://colab.research.google.com/github/Nielspace/BERT/blob/master/Bert%3D%3EMultitext_Classification_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 8.3MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 25.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 31.1MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |███

In [3]:
import transformers

class config:
    MODEL = 'bert-base-uncased'

    HIDDEN = 768

    MAX_LENGTH = 64
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 32

    EPOCHS = 2

    LR = (2e-5, 3e-5, 5e-5)
    EPS = 1e-8

    SEED = 23



In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import transformers
import torch
from torch.utils.data import Dataset, DataLoader



data = pd.read_json("News_Category_Dataset_v2.json", lines=True)

print(data)

data = data.dropna()
data = data.sample(n=20000)

data['text'] = data.headline + data.short_description
encoder = LabelEncoder()
data["classes"] = encoder.fit_transform((data['category']))
data = data[['text', 'category', 'classes']]


class TextClassificationDataset:

    def __init__(self,
                 texts,
                 labels):

        self.texts = texts
        self.labels = labels
        self.max_seq_length = config.MAX_LENGTH

        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            config.MODEL
        )

        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"]

    def __len__(self):

        return len(self.texts)

    def __getitem__(self, item):

        texts = str(self.texts[item])
        texts = " ".join(texts.split())
        inputs = self.tokenizer.encode_plus(
            texts,
            None,
            add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            max_length=self.max_seq_length,
            return_token_type_ids=False,
            return_tensors="pt",
        )

        true_seq_length = len(inputs['input_ids'][0])
        pad_size = self.max_seq_length - true_seq_length
        pad_ids = torch.Tensor([self.pad_vid] * pad_size).long()
        ids = torch.cat((inputs['input_ids'][0], pad_ids))


        output_dict = {
            "ids": ids.flatten(),
            'attention_mask': inputs["attention_mask"][0].flatten(),
            'target' : torch.tensor(self.labels[item], dtype=torch.long)

        }

        

        return output_dict


SEED = 42

n_classes = len(data['classes'].unique())

train, val = train_test_split(
    data, test_size=0.30, random_state=SEED)


train.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)




             category  ...       date
0               CRIME  ... 2018-05-26
1       ENTERTAINMENT  ... 2018-05-26
2       ENTERTAINMENT  ... 2018-05-26
3       ENTERTAINMENT  ... 2018-05-26
4       ENTERTAINMENT  ... 2018-05-26
...               ...  ...        ...
200848           TECH  ... 2012-01-28
200849         SPORTS  ... 2012-01-28
200850         SPORTS  ... 2012-01-28
200851         SPORTS  ... 2012-01-28
200852         SPORTS  ... 2012-01-28

[200853 rows x 6 columns]


In [10]:
from transformers import BertForSequenceClassification, BertModel
import torch
import torch.nn as nn





class Model(nn.Module):
    def __init__(self, n_classes):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.MODEL)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(config.HIDDEN, n_classes)

            
    
    def forward(self, input_ids, attention_mask=None):
        _, pooled_output = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        # output = self.softmax(output)
        return self.out(output)

In [23]:
from transformers import AdamW, BertConfig, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader
import numpy as np
import time
import datetime
import random
from tqdm import tqdm


traindata = TextClassificationDataset(
    train.text.to_numpy(), 
    train.classes.to_numpy()
    )

trainLoader = DataLoader(
    traindata, 
    shuffle=True, 
    batch_size=config.TRAIN_BATCH_SIZE)


valdata = TextClassificationDataset(
    val.text.to_numpy(), 
    train.classes.to_numpy()
    )

valLoader = DataLoader(
    valdata, 
    shuffle=True, 
    batch_size=config.TRAIN_BATCH_SIZE)



if torch.cuda.is_available():
    device = torch.device("cuda")

    print("Cuda available")

else:
    print("No GPU's available")
    device = torch.device("cpu")

TOTAL_STEPS = len(trainLoader)*config.EPOCHS

model = Model(n_classes)
model.to(device)

optimizer = AdamW(
    model.parameters(),
    lr = config.LR[-1],
    eps=config.EPS
    )
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=TOTAL_STEPS
)

def accuracy_check(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    label_flat = labels.flatten()
    return np.sum(pred_flat == label_flat) / len(label_flat)


def format_time(elasped):
    elasped_rounded = int(round(elasped))

    return str(datetime.timedelta(seconds=elasped_rounded))


random.seed(config.SEED)
np.random.seed(config.SEED)
torch.manual_seed(config.SEED)
torch.cuda.manual_seed(config.SEED)

training_stats = []

total_t0 = time.time()



def loss_fn(outputs, targets):
    return nn.CrossEntropyLoss()(outputs, targets.view(-1, 1))

for epoch in range(0, config.EPOCHS):
    print("")
    print("============EPOCHS {:}/{:}=============".format(epoch + 1, config.EPOCHS))
    print("Training")

    t0 = time.time()

    total_train_loss = 0

    model.train()
    for step, batch in enumerate(trainLoader):
       

        if step % 40 == 0 and not step == 0: 
            elapsed = format_time(time.time() - t0)


            print(" Batch {:>5,} of {:>5,}. Elasped: {:}".format(step, len(trainLoader), elapsed))

        b_input_ids = batch['ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['target'].to(device)
        
        model.zero_grad()

        output = model(
            b_input_ids,
            attention_mask = b_input_mask,
            )

        # output = torch.nn.functional.softmax(output, dim=1)

        # print(output)

        # lossfn = torch.nn.CrossEntropyLoss().to(device)
        loss = lossfn(output, b_labels).to(device)
        
        total_train_loss+=loss.item()



        loss.backward()


        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()


        avg_train_loss = total_train_loss/len(trainLoader)


        training_time = format_time(time.time() - t0)
        

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))


    #=========================================
    #           Validation
    #=========================================

    print("")
    print("Running Validation")
    
    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_step = 0

    for batch in valLoader:

        b_input_ids = batch['ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['target'].to(device)

        with torch.no_grad():
            model.zero_grad()

            output = model(
                b_input_ids,
                attention_mask = b_input_mask,
                )

        # output = torch.nn.functional.softmax(output)

        # lossfn = torch.nn.CrossEntropyLoss().to(device)
        loss = lossfn(output, b_labels).to(device)
        
        total_eval_loss+=loss.item()

        output = output.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()


        total_eval_accuracy += accuracy_check(output, label_ids)

        
    
    avg_val_accuracy = total_eval_accuracy/len(valLoader)

    print("   Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss/len(valLoader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            "epochs":epoch + 1,
            "Trainning Loss":avg_train_loss,
            "Valid Loss": avg_val_loss,
            "Valid Acc": avg_val_accuracy,
            "Training Time": validation_time
        }
    )


print("")
print("Training Complete")
print("Total Time took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
        

            









Cuda available

Training
 Batch    40 of   438. Elasped: 0:00:14
 Batch    80 of   438. Elasped: 0:00:28
 Batch   120 of   438. Elasped: 0:00:43
 Batch   160 of   438. Elasped: 0:00:57
 Batch   200 of   438. Elasped: 0:01:11
 Batch   240 of   438. Elasped: 0:01:26
 Batch   280 of   438. Elasped: 0:01:41
 Batch   320 of   438. Elasped: 0:01:55
 Batch   360 of   438. Elasped: 0:02:10
 Batch   400 of   438. Elasped: 0:02:25

  Average training loss: 2.15
  Training epoch took: 0:02:40

Running Validation
   Accuracy: 0.07
  Validation Loss: 4.79
  Validation took: 0:00:25

Training
 Batch    40 of   438. Elasped: 0:00:15
 Batch    80 of   438. Elasped: 0:00:31
 Batch   120 of   438. Elasped: 0:00:46
 Batch   160 of   438. Elasped: 0:01:02
 Batch   200 of   438. Elasped: 0:01:17
 Batch   240 of   438. Elasped: 0:01:33
 Batch   280 of   438. Elasped: 0:01:49
 Batch   320 of   438. Elasped: 0:02:04
 Batch   360 of   438. Elasped: 0:02:20
 Batch   400 of   438. Elasped: 0:02:36

  Average tra

In [21]:
output.shape

torch.Size([32, 41])