In [10]:
!pip install pytorch-lightning==1.2.3 --quiet
!pip install transformers==3.0.0 --quiet

In [11]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import matplotlib.pyplot as plt
%matplotlib inline
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy
import transformers
from transformers import BertModel, BertConfig
import sklearn
from sklearn.datasets import fetch_20newsgroups
from transformers import AutoModel, BertTokenizerFast
import pandas as pd


In [12]:
print(torch.__version__)
print(transformers.__version__)
print(sklearn.__version__)

1.8.1+cu101
3.0.0
0.22.2.post1


In [13]:
class TextClassifier(pl.LightningModule):

    def __init__(self, max_seq_len=350, batch_size=256, learning_rate = 0.001):
        super().__init__()
        self.learning_rate = learning_rate
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.loss = nn.CrossEntropyLoss()
        self.test_accuracy = pl.metrics.Accuracy()

        self.pretrain_model  = AutoModel.from_pretrained('bert-base-uncased')
        self.pretrain_model.eval()
        for param in self.pretrain_model.parameters():
            param.requires_grad = False


        self.new_layers = nn.Sequential(
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512,20),
            nn.LogSoftmax(dim=1)
        )

        

    def forward(self, encode_id, mask):
        _, output= self.pretrain_model(encode_id, attention_mask=mask)
        output = self.new_layers(output)
        return output

    def prepare_data(self):
      train_data = fetch_20newsgroups(subset='train', shuffle=True)
      test_data = fetch_20newsgroups(subset='test', shuffle=False)
      tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

      # tokenize and encode sequences in the training set
      tokens_train = tokenizer.batch_encode_plus(
          train_data["data"],
          max_length = self.max_seq_len,
          pad_to_max_length=True,
          truncation=True,
          return_token_type_ids=False
      )

      # tokenize and encode sequences in the test set
      tokens_test = tokenizer.batch_encode_plus(
          test_data["data"],
          max_length = self.max_seq_len,
          pad_to_max_length=True,
          truncation=True,
          return_token_type_ids=False
      )

      self.train_seq = torch.tensor(tokens_train['input_ids'])
      self.train_mask = torch.tensor(tokens_train['attention_mask'])
      self.train_y = torch.tensor(train_data["target"])

      self.test_seq = torch.tensor(tokens_test['input_ids'])
      self.test_mask = torch.tensor(tokens_test['attention_mask'])
      self.test_y = torch.tensor(test_data["target"])

    def train_dataloader(self):
      train_dataset = TensorDataset(self.train_seq, self.train_mask, self.train_y)
      self.train_dataloader_obj = DataLoader(train_dataset, batch_size=self.batch_size)
      return self.train_dataloader_obj


    def test_dataloader(self):
      test_dataset = TensorDataset(self.test_seq, self.test_mask, self.test_y)
      self.test_dataloader_obj = DataLoader(test_dataset, batch_size=self.batch_size)
      return self.test_dataloader_obj

    def training_step(self, batch, batch_idx):
      encode_id, mask, targets = batch
      outputs = self(encode_id, mask) 
      preds = torch.argmax(outputs, dim=1)
      train_accuracy = accuracy(preds, targets)
      loss = self.loss(outputs, targets)
      self.log('train_accuracy', train_accuracy, prog_bar=True, on_step=False, on_epoch=True)
      self.log('train_loss', loss, on_step=False, on_epoch=True)
      return {"loss":loss}
    

    def test_step(self, batch, batch_idx):
      encode_id, mask, targets = batch
      outputs = self.forward(encode_id, mask)
      preds = torch.argmax(outputs, dim=1)
      self.test_accuracy(preds, targets)
      loss = self.loss(outputs, targets)
      return {"test_loss":loss, "test_accuracy":self.test_accuracy}

    def test_epoch_end(self, outs):
      total_train_accuracy = self.test_accuracy.compute()
      self.log('total_train_accuracy', total_train_accuracy, on_step=False, on_epoch=True)
      print("Total training accuracy:", total_train_accuracy)

    def configure_optimizers(self):
      params = self.parameters()
      optimizer = optim.Adam(params=params, lr = self.learning_rate)
      return optimizer


    


    

In [14]:
model = TextClassifier()
trainer = pl.Trainer(max_epochs=10, progress_bar_refresh_rate=30, gpus=1)
trainer.fit(model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…





  | Name           | Type             | Params
----------------------------------------------------
0 | loss           | CrossEntropyLoss | 0     
1 | test_accuracy  | Accuracy         | 0     
2 | pretrain_model | BertModel        | 109 M 
3 | new_layers     | Sequential       | 403 K 
----------------------------------------------------
403 K     Trainable params
109 M     Non-trainable params
109 M     Total params
439.545   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1

In [15]:
trainer.test()

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

Total training accuracy: tensor(0.4770, device='cuda:0')

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'total_train_accuracy': 0.47703132033348083}
--------------------------------------------------------------------------------


[{'total_train_accuracy': 0.47703132033348083}]