In [1]:
!pip install pytorch-lightning==1.5.2 --quiet
!pip install transformers==3.1.0 --quiet

[K     |████████████████████████████████| 1.0 MB 5.3 MB/s 
[K     |████████████████████████████████| 134 kB 37.6 MB/s 
[K     |████████████████████████████████| 596 kB 16.3 MB/s 
[K     |████████████████████████████████| 397 kB 47.0 MB/s 
[K     |████████████████████████████████| 829 kB 20.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 52.7 MB/s 
[K     |████████████████████████████████| 94 kB 3.3 MB/s 
[K     |████████████████████████████████| 144 kB 51.6 MB/s 
[K     |████████████████████████████████| 271 kB 39.9 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 884 kB 5.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 50.8 MB/s 
[K     |████████████████████████████████| 895 kB 53.7 MB/s 
[K     |████████████████████████████████| 3.0 MB 53.6 MB/s 
[?25h

In [2]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import matplotlib.pyplot as plt
%matplotlib inline
import pytorch_lightning as pl
from torchmetrics.functional import accuracy
import transformers
from transformers import BertModel, BertConfig
import sklearn
from sklearn.datasets import fetch_20newsgroups
from transformers import AutoModel, BertTokenizerFast
import pandas as pd

In [3]:
print(torch.__version__)
print(transformers.__version__)
print(sklearn.__version__)

1.10.0+cu111
3.1.0
1.0.2


In [4]:
!gdown --id 1eTtRs5cUlBP5dXsx-FTAlmXuB6JQi2qj
!unzip PUBHEALTH.zip

Downloading...
From: https://drive.google.com/uc?id=1eTtRs5cUlBP5dXsx-FTAlmXuB6JQi2qj
To: /content/PUBHEALTH.zip
100% 24.9M/24.9M [00:00<00:00, 89.4MB/s]
Archive:  PUBHEALTH.zip
   creating: PUBHEALTH/
  inflating: PUBHEALTH/train.tsv     
  inflating: PUBHEALTH/dev.tsv       
  inflating: PUBHEALTH/test.tsv      


In [5]:
pub_health_train = pd.read_csv("PUBHEALTH/train.tsv", sep='\t')
pub_health_test = pd.read_csv("PUBHEALTH/test.tsv", sep='\t')

In [6]:
pub_health_train = pub_health_train[pub_health_train['label'] != 'snopes']
pub_health_train = pub_health_train[['main_text','label']]
pub_health_train = pub_health_train.dropna(subset=['main_text', 'label'])

pub_health_train.head()

Unnamed: 0,main_text,label
0,"""Hillary Clinton is in the political crosshair...",false
1,While the financial costs of screening mammogr...,mixture
2,The news release quotes lead researcher Robert...,mixture
3,"The story does discuss costs, but the framing ...",true
4,"""Although the story didn’t cite the cost of ap...",true


In [7]:
pub_health_test = pub_health_test[['main_text','label']]
pub_health_test = pub_health_test.dropna(subset=['main_text', 'label'])

In [8]:
pub_health_train['label'] = pub_health_train['label'].map({"true":0, "false":1, "unproven":2, "mixture":3})
pub_health_test['label'] = pub_health_test['label'].map({"true":0, "false":1, "unproven":2, "mixture":3})

In [11]:
class HealthClaimClassifier(pl.LightningModule):

    def __init__(self, max_seq_len=512, batch_size=128, learning_rate = 0.001):
        super().__init__()
        self.learning_rate = learning_rate
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.loss = nn.CrossEntropyLoss()

        self.pretrain_model  = AutoModel.from_pretrained('bert-base-uncased')
        self.pretrain_model.eval()
        for param in self.pretrain_model.parameters():
            param.requires_grad = False


        self.new_layers = nn.Sequential(
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512,4),
            nn.LogSoftmax(dim=1)
        )

    def prepare_data(self):
      tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

      tokens_train = tokenizer.batch_encode_plus(
          pub_health_train["main_text"].tolist(),
          max_length = self.max_seq_len,
          pad_to_max_length=True,
          truncation=True,
          return_token_type_ids=False
      )

      tokens_test = tokenizer.batch_encode_plus(
          pub_health_test["main_text"].tolist(),
          max_length = self.max_seq_len,
          pad_to_max_length=True,
          truncation=True,
          return_token_type_ids=False
      )

      self.train_seq = torch.tensor(tokens_train['input_ids'])
      self.train_mask = torch.tensor(tokens_train['attention_mask'])
      self.train_y = torch.tensor(pub_health_train["label"].tolist())

      self.test_seq = torch.tensor(tokens_test['input_ids'])
      self.test_mask = torch.tensor(tokens_test['attention_mask'])
      self.test_y = torch.tensor(pub_health_test["label"].tolist())

    def forward(self, encode_id, mask):
        _, output= self.pretrain_model(encode_id, attention_mask=mask)
        output = self.new_layers(output)
        return output

    def train_dataloader(self):
      train_dataset = TensorDataset(self.train_seq, self.train_mask, self.train_y)
      self.train_dataloader_obj = DataLoader(train_dataset, batch_size=self.batch_size)
      return self.train_dataloader_obj


    def test_dataloader(self):
      test_dataset = TensorDataset(self.test_seq, self.test_mask, self.test_y)
      self.test_dataloader_obj = DataLoader(test_dataset, batch_size=self.batch_size)
      return self.test_dataloader_obj

    def training_step(self, batch, batch_idx):
      encode_id, mask, targets = batch
      outputs = self(encode_id, mask) 
      preds = torch.argmax(outputs, dim=1)
      train_accuracy = accuracy(preds, targets)
      loss = self.loss(outputs, targets)
      self.log('train_accuracy', train_accuracy, prog_bar=True, on_step=False, on_epoch=True)
      self.log('train_loss', loss, on_step=False, on_epoch=True)
      return {"loss":loss, 'train_accuracy': train_accuracy}

    def test_step(self, batch, batch_idx):
      encode_id, mask, targets = batch
      outputs = self.forward(encode_id, mask)
      preds = torch.argmax(outputs, dim=1)
      test_accuracy = accuracy(preds, targets)
      loss = self.loss(outputs, targets)
      return {"test_loss":loss, "test_accuracy":test_accuracy}

    def test_epoch_end(self, outputs):
      test_outs = []
      for test_out in outputs:
          out = test_out['test_accuracy']
          test_outs.append(out)
      total_test_accuracy = torch.stack(test_outs).mean()
      self.log('total_test_accuracy', total_test_accuracy, on_step=False, on_epoch=True)
      return total_test_accuracy

    def configure_optimizers(self):
      params = self.parameters()
      optimizer = optim.Adam(params=params, lr = self.learning_rate)
      return optimizer

In [12]:
model = HealthClaimClassifier()

trainer = pl.Trainer(fast_dev_run=True, gpus=-1)
trainer.fit(model)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 1 batch(es).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | loss           | CrossEntropyLoss | 0     
1 | pretrain_model | BertModel        | 109 M 
2 | new_layers     | Sequential       | 395 K 
----------------------------------------------------
395 K     Trainable params
109 M     Non-trainable params
109 M     Total params
439.512   Total estimated model params size (MB)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

In [13]:
model = HealthClaimClassifier()
trainer = pl.Trainer(max_epochs=10, progress_bar_refresh_rate=30, gpus=-1)
trainer.fit(model)

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | loss           | CrossEntropyLoss | 0     
1 | pretrain_model | BertModel        | 109 M 
2 | new_layers     | Sequential       | 395 K 
----------------------------------------------------
395 K     Trainable params
109 M     Non-trainable params
109 M     Total params
439.512   Total estimated model params size (MB)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: 0it [00:00, ?it/s]

In [14]:
trainer.test()


  f"`.{fn}(ckpt_path=None)` was called without a model."
Restoring states from the checkpoint path at /content/lightning_logs/version_0/checkpoints/epoch=9-step=769.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /content/lightning_logs/version_0/checkpoints/epoch=9-step=769.ckpt
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'total_test_accuracy': 0.6095871925354004}
--------------------------------------------------------------------------------


[{'total_test_accuracy': 0.6095871925354004}]