# Installing Dependencies and Imports

In [1]:
!pip install nlp==0.4.0
!pip install transformers==3.5.1
!pip install torch==1.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlp==0.4.0
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 55.9 MB/s 
Installing collected packages: xxhash, nlp
Successfully installed nlp-0.4.0 xxhash-3.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==3.5.1
  Downloading transformers-3.5.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 4.9 MB/s 
Collecting sentencepiece==0.1.91
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 59.3 MB/s 
Collecting tokenizers==0.9.3
  Downloading tokenizers-0.9.3-cp37-cp37m-manylinux1_x86_6

In [2]:
from transformers import BertForSequenceClassification,BertTokenizerFast,Trainer,TrainingArguments
from nlp import load_dataset
import torch
import numpy as np

# Download and load the IMDB Dataset

In [3]:
!gdown https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc- 
dataset = load_dataset('csv',data_files='./imdbs.csv',split='train')

Downloading...
From: https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-
To: /content/imdbs.csv
  0% 0.00/132k [00:00<?, ?B/s]100% 132k/132k [00:00<00:00, 83.5MB/s]


Downloading:   0%|          | 0.00/2.75k [00:00<?, ?B/s]



Downloading and preparing dataset csv/default-11046c2826f07a01 (download: Unknown size, generated: Unknown size, post-processed: Unknown sizetotal: Unknown size) to /root/.cache/huggingface/datasets/csv/default-11046c2826f07a01/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-11046c2826f07a01/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b. Subsequent calls will reuse this data.


In [4]:
type(dataset)

nlp.arrow_dataset.Dataset

In [5]:
dataset = dataset.train_test_split(test_size=0.3)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
dataset

{'train': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 70),
 'test': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 30)}

In [7]:
train_set = dataset['train']
test_set = dataset['test']

# Initializing BERT for Sequence Classification

In [8]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Data Preprocessing

In [10]:
def preprocess(data):
 return tokenizer(data['text'], padding=True, truncation=True)

In [11]:
train_set = train_set.map(preprocess, batched=True,batch_size=len(train_set))
test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set))

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
train_set.set_format('torch',columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch',columns=['input_ids', 'attention_mask', 'label'])

# Initializing Hyperparameters and Trainer arguments

In [18]:
batch_size = 8
epochs = 20
warmup_steps = 500
weight_decay = 0.01

In [19]:
training_args = TrainingArguments(
 output_dir='./results',
 num_train_epochs=epochs,
 per_device_train_batch_size=batch_size,
 per_device_eval_batch_size=batch_size,
 warmup_steps=warmup_steps,
 weight_decay=weight_decay,
 evaluate_during_training=True,
 logging_dir='./logs',
)

In [20]:
trainer = Trainer(
 model=model,
 args=training_args,
 train_dataset=train_set,
 eval_dataset=test_set
)

# Training and Evaluation

In [21]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=180, training_loss=0.37738172743055554)

In [22]:
trainer.evaluate()

{'eval_loss': 0.6592559814453125, 'epoch': 20.0}