In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import torch
torch.cuda.empty_cache()
import pandas as pd
import torch.nn.functional as F
import numpy as np



from collections import defaultdict
from torch import  nn 
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaModel,AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Loading datasets

In [5]:
import datasets
df= datasets.load_dataset('imdb')



  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
df

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = df.map(tokenize_function, batched=True)



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]



In [8]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

In [9]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [10]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))



In [11]:
len(small_train_dataset)

1000

In [12]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [28]:
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [30]:
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW

# This model is equal to BERT + a linear layer for classification. 

model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)
torch.save(model.state_dict(), "drive/MyDrive/odd/Adversarial_Attacks_NLP/best_model_state.bin" )

optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss().to(device)


Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [36]:
from transformers import get_scheduler
from tqdm import tqdm


num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [39]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  progress_bar = tqdm(range(n_examples))

  model.train()
  losses = []
  correct_predictions = 0

  for batch in data_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    targets = batch['labels'].to(device)
    outputs = model(**batch)
    #loss = outputs.loss

    _, preds = torch.max(outputs.logits, dim=1)
    loss = loss_fn(outputs.logits, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()

    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  return correct_predictions.double()/n_examples, np.mean(losses)


###############################################################################

def eval_model(model, data_loader, loss_fn, device, n_examples):
  model.eval()
  losses = []
  correct_predictions = 0

  for i, batch in tqdm(enumerate(data_loader), desc="evaluating", total=data_loader.__len__()):
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
          outputs = model(**batch)

      logits = outputs.logits

      targets = batch['labels']
      loss = loss_fn(logits, targets)

      _, tag_seq  = torch.max(logits, 1)

      correct_predictions += torch.sum(tag_seq == targets)
      losses.append(loss.item())

  return correct_predictions.double()/n_examples, np.mean(losses)

In [40]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(num_epochs):

  print(f'Epoch {epoch + 1}/{num_epochs}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(model,
    train_dataloader,    
    loss_fn, 
    optimizer, 
    device, 
    lr_scheduler, 
    len(small_train_dataset)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    eval_dataloader,
    loss_fn, 
    device, 
    len(small_eval_dataset)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(),'best_model_state.bin')
    best_accuracy = val_acc
#%%


Epoch 1/3
----------


 12%|█▎        | 125/1000 [01:17<09:04,  1.61it/s]


Train loss 0.6923195271492004 accuracy 0.511


evaluating: 100%|██████████| 125/125 [00:25<00:00,  4.92it/s]


Val   loss 0.6899438853263855 accuracy 0.507

Epoch 2/3
----------


 12%|█▎        | 125/1000 [01:15<08:47,  1.66it/s]


Train loss 0.6850934090614319 accuracy 0.566


evaluating: 100%|██████████| 125/125 [00:34<00:00,  3.62it/s]


Val   loss 0.6822845611572266 accuracy 0.535

Epoch 3/3
----------


 12%|█▎        | 125/1000 [01:24<09:49,  1.48it/s]


Train loss 0.669013958454132 accuracy 0.604


evaluating: 100%|██████████| 125/125 [00:23<00:00,  5.41it/s]


Val   loss 0.6730518712997436 accuracy 0.609

CPU times: user 4min 55s, sys: 8.94 s, total: 5min 4s
Wall time: 5min 20s
