In [1]:
!pip install datasets















[0m

In [2]:
!pip install transformers









[0m

In [3]:
import datasets
import seaborn as sns
import matplotlib.pyplot as plt
import torch
torch.cuda.empty_cache()
import pandas as pd
import torch.nn.functional as F
import numpy as np



from collections import defaultdict
from torch import  nn 
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaModel,AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

### Load dataset

In [None]:
# Load the IMDB dataset; later cells read its 'train' and 'test' splits from `df`.
df = load_dataset('imdb')

### Data preprocessing

In [None]:
# Checkpoint name shared by the tokenizer and every RobertaModel loaded in this notebook.
PRE_TRAINED_MODEL_NAME = 'roberta-base'

In [None]:
# Tokenizer matching the pretrained checkpoint named above.
tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Short example sentence used to demonstrate tokenization and a forward pass.
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

In [None]:
# Split the sample sentence into tokens, then map each token to its vocabulary id.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the sentence next to its tokens and ids.
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

### Choosing Sequence Length

RoBERTa, like BERT, works with fixed-length sequences. We'll use a simple strategy to choose the max length. Let's store the token length of each of the first 1,000 training reviews:

In [None]:
# Encoded length of each of the first 1,000 training reviews, capped at 1024 ids.
# truncation=True states the cap explicitly instead of relying on the tokenizer's
# implicit (warning-emitting) fallback when only max_length is given.
# A comprehension is used so the global `tokens` from the demo cell is not overwritten.
token_lens = [
  len(tokenizer.encode(txt, max_length=1024, truncation=True))
  for txt in df['train']['text'][:1000]
]

and plot the distribution:

In [None]:
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay and density
# scaling draws the equivalent histogram-plus-curve figure.
sns.histplot(token_lens, kde=True, stat='density')
plt.xlabel('Token count')
plt.show()

Most of the reviews seem to contain fewer than 400 tokens, but we'll be on the safe side and choose a maximum length of 512.

In [None]:
# Length every encoded review is padded or truncated to when building the data loaders.
MAX_LEN = 512

### Building a sentiment classifier

We have all building blocks required to create a PyTorch dataset. Let's do it:

In [None]:
class GPReviewDataset(Dataset):
  """Map-style dataset that tokenizes one review per item.

  Args:
    reviews: Indexable collection of review texts.
    targets: Indexable collection of integer class labels aligned with `reviews`.
    tokenizer: Tokenizer exposing `encode_plus` (a Hugging Face tokenizer in this notebook).
    max_len: Length every encoded sequence is padded or truncated to.
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Encode the review at position `item`.

    Returns:
      dict with the raw `review_text`, 1-D `input_ids` and `attention_mask`
      tensors, and `targets` as a 0-d long tensor.
    """
    review = str(self.reviews[item])
    target = self.targets[item]

    # padding='max_length' replaces the deprecated pad_to_max_length flag, and
    # truncation=True makes the cut at max_len explicit, so every item has the
    # same length and the default collate function can stack a batch.
    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      padding='max_length',
      truncation=True,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'review_text': review,
      # return_tensors='pt' yields a leading batch axis of size 1; flatten drops it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

The tokenizer is doing most of the heavy lifting for us. We also return the review texts, so it'll be easier to evaluate the predictions from our model. Let's split the data:

In [None]:
# Merge the dataset's train and test splits into one frame; the next cell re-splits it.
# ignore_index=True gives the combined frame a fresh 0..N-1 index instead of two
# overlapping runs of row labels.
# NOTE: this rebinds `df` from the loaded dataset to a DataFrame, so the cell
# cannot be re-run without reloading the dataset first.
df_train = pd.DataFrame(df['train'])
df_test = pd.DataFrame(df['test'])
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:

# 90% of the rows go to training; the held-out 10% is halved into validation and test.
df_train, df_holdout = train_test_split(df, test_size=0.1, random_state=42)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=42)

In [None]:
# Show the (rows, columns) shape of each of the three splits.
df_train.shape, df_val.shape, df_test.shape

We also need to create a couple of data loaders. Here's a helper function to do it:

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a review DataFrame in a GPReviewDataset and return a DataLoader over it.

  Args:
    df: DataFrame with a `text` column (reviews) and a `label` column (targets).
    tokenizer: Tokenizer handed to GPReviewDataset.
    max_len: Sequence length handed to GPReviewDataset.
    batch_size: Number of examples per batch.
    shuffle: Whether the loader reshuffles the data every epoch. Defaults to
      False, the previous fixed behaviour; pass True for a training loader.
    num_workers: Number of loader worker processes. Defaults to 4, the
      previously hard-coded value.

  Returns:
    A DataLoader yielding the batched dicts produced by GPReviewDataset.
  """
  ds = GPReviewDataset(
    reviews=df.text.to_numpy(),
    targets=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# One loader per split, all using the same tokenizer, MAX_LEN and batch size.
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

Let's have a look at an example batch from our training data loader:

In [None]:
# Pull the first batch from the training loader and list the fields it carries.
data = next(iter(train_data_loader))
data.keys()

In [None]:
# Print the tensor shape of each batched field, in the same order as before.
for field in ('input_ids', 'attention_mask', 'targets'):
  print(data[field].shape)

### Sentiment classification with RoBERTa

In [None]:
# Plain RobertaModel loaded from the pretrained checkpoint; used for the demo forward pass in the next cell.
robert_model = RobertaModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Encode the sample sentence to a fixed length of 32 tokens.
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add RoBERTa's '<s>' and '</s>' markers
  return_token_type_ids=False,
  padding='max_length',  # replaces the deprecated pad_to_max_length=True
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)
# return_dict=False makes the model return a plain tuple. Without it, recent
# transformers releases return a dict-like output object, and tuple-unpacking
# that binds the field names (strings) instead of the tensors.
# no_grad: this is a demo forward pass only, so no autograd graph is needed.
with torch.no_grad():
  last_hidden_state, pooled_output = robert_model(
    input_ids=encoding['input_ids'],
    attention_mask=encoding['attention_mask'],
    return_dict=False
  )

In [None]:
class SentimentClassifier(nn.Module):
  """Pretrained RoBERTa encoder followed by dropout and a linear output layer.

  Args:
    n_classes: Number of output classes (width of the final linear layer).
  """

  def __init__(self, n_classes):
    super().__init__()
    # return_dict=False: the encoder returns a tuple, which forward() unpacks.
    self.bert = RobertaModel.from_pretrained(
      PRE_TRAINED_MODEL_NAME,
      return_dict=False,
    )
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the output layer's scores for a batch of encoded reviews."""
    _sequence_output, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
    )
    dropped = self.drop(pooled_output)
    logits = self.out(dropped)
    return logits

In [None]:
# Download the fine-tuned weights that the next cell loads from 'best_model_state.bin'.
# NOTE(review): the long path segment looks like a signed, presumably time-limited token - confirm the link still resolves.
# The training loop further down saves its checkpoint under this same file name.
!wget https://www.kaggleusercontent.com/kf/122039412/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..P92S6OWaPhm65AZ8Urq0ng.i0Dm1rOafJeZtcJRc-LusN7PqT2VXhxz5dBa-V_dQAblqaoxc6S8sM5IWZvziRfqeRK9DMUbvKTvO35CDjsU4NzrMuuy3qhjj2B9PaijcgzdNPPfKi-hOxI52vvpgdOQFeD6cIua6MaYwjkd_k-WbNbhezKqhE8suesFTW_jQ05zPnx20JLF5ElmvYYByfyfh16cBE44yEwafGZqpdlAfUyb8pem5D-vv8zYbyFLPK-juSSWPZHha8J5ekvyyjEo2rYlAqzFby4oU7AgGW1v53vn8l_hXK15iBPPqPf4jJDvwwr2AYVJg1p0c2_rN8pp3YMAJSzgklyKhOn6uHooiuou3C7tdlN-bTdC6F9QdfqpbYcDGl1tZUzw3bv7z4eh4XTlaRBbEAspWU2gr9AA5mGUOtF3KeJ_ZL4FwTdXvAlXg8nzzGysJUipstBX0efTATaj6FImKQeLSS2rf1ddwXoJt3sYmiWfy3Ob2iezvLNAu-_M5PuqVcWzmFHMjeV4RLS6rZ7MOWRKyULoc25EbwyiUOJBnXqoXvwCdikncMK6c58oOlcyyL1Lq4lxybE4WjzMP0xpEQffyfHoSPLuO3Kxo-WqLQrfE7i9oTas1_F6YDn1NywE6DrfMarT1ME2S6x33xHG78ioK-lO0RHHuw.y5z-YvcIMOJmez72ZczcnQ/best_model_state.bin

In [None]:
# Two-class classifier initialised from the saved checkpoint.
model = SentimentClassifier(2)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on
# a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model = model.to(device)

In [None]:
# Move the sample batch's inputs to the same device as the model.
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

We can show the softmax output that we will use for our OOD (out-of-distribution) detection:

In [None]:
# F.softmax(model(input_ids, attention_mask), dim=1)

### Training

To reproduce the training procedure from the BERT paper, we'll use the [AdamW](https://huggingface.co/transformers/main_classes/optimizer_schedules.html#adamw) optimizer provided by Hugging Face. It corrects weight decay, so it's similar to the original paper. We'll also use a linear scheduler with no warmup steps:

In [None]:

# Number of full passes over the training loader.
EPOCHS = 10

# NOTE(review): AdamW here is the implementation imported from transformers at the
# top of the notebook; newer transformers releases may deprecate it - confirm
# against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch, so the schedule spans batches-per-epoch * EPOCHS steps.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:

def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over `data_loader`.

  Args:
    model: Module called as model(input_ids=..., attention_mask=...) and
      returning per-class scores of shape (batch, n_classes).
    data_loader: Iterable of batches, each a dict with "input_ids",
      "attention_mask" and "targets" tensors.
    loss_fn: Callable taking (outputs, targets) and returning a scalar loss tensor.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler stepped once per batch.
    n_examples: Denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): accuracy is a 0-d float64 tensor equal to the number
    of correct predictions divided by n_examples; mean_loss is the mean of the
    per-batch losses, or nan when the loader yields no batches.
  """
  model.train()

  losses = []
  # Tensor accumulator (not a Python int) so .double() below also works when
  # the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    # Clear gradients before the forward/backward pass so nothing left over
    # from earlier work leaks into this step.
    optimizer.zero_grad()

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()
    # Rescale gradients so their global norm is at most 1.0 before the update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double() / n_examples, mean_loss

:

### Evaluation of model

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` over `data_loader` without computing gradients.

  Args:
    model: Module called as model(input_ids=..., attention_mask=...) and
      returning per-class scores of shape (batch, n_classes).
    data_loader: Iterable of batches, each a dict with "input_ids",
      "attention_mask" and "targets" tensors.
    loss_fn: Callable taking (outputs, targets) and returning a scalar loss tensor.
    device: Device the batch tensors are moved to.
    n_examples: Denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): accuracy is a 0-d float64 tensor equal to the number
    of correct predictions divided by n_examples; mean_loss is the mean of the
    per-batch losses, or nan when the loader yields no batches.
  """
  model.eval()

  losses = []
  # Tensor accumulator (not a Python int) so .double() below also works when
  # the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double() / n_examples, mean_loss

Using those two, we can write our training loop. We'll also store the training history:

In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn, 
    device, 
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(),'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
# Accuracy of the current in-memory model on the held-out test split; the loss is discarded.
test_acc, _ = eval_model(
  model=model,
  data_loader=test_data_loader,
  loss_fn=loss_fn,
  device=device,
  n_examples=len(df_test),
)

test_acc.item()

#### OOD DETECTION WITH MAX SOFTMAX

In [None]:
# Read the attack CSV from the Kaggle input directory and keep only the two
# columns used below: the text and its result_type label.
attack_csv = pd.read_csv('/kaggle/input/clean-attack-imbb-fool/0')
clean_adversarial_sample = attack_csv[['text', 'result_type']]

In [None]:
# Display the loaded frame (text and result_type columns).
clean_adversarial_sample

In [None]:
def create_attack_data_loader(df, tokenizer, max_len, batch_size):
  """Build a DataLoader over the attack samples.

  Args:
    df: DataFrame with a `text` column (inputs) and a `result_type` column (targets).
    tokenizer: Tokenizer handed to GPReviewDataset.
    max_len: Sequence length handed to GPReviewDataset.
    batch_size: Number of examples per batch.

  Returns:
    A DataLoader yielding the batched dicts produced by GPReviewDataset.
  """
  attack_texts = df.text.to_numpy()
  attack_labels = df.result_type.to_numpy()
  attack_dataset = GPReviewDataset(attack_texts, attack_labels, tokenizer, max_len)
  return DataLoader(attack_dataset, batch_size=batch_size)
# Loader over the attack samples, using the same MAX_LEN and BATCH_SIZE as the other loaders.
attack_data_loader = create_attack_data_loader(clean_adversarial_sample, tokenizer, MAX_LEN, BATCH_SIZE)

Note that we're storing the state of the best model, indicated by the highest validation accuracy.

Whoo, this took some time! We can look at the training vs validation accuracy:

In [None]:
def get_predictions(model, data_loader, device=None):
  """Run `model` over `data_loader` and collect its predictions.

  Args:
    model: Module called as model(input_ids=..., attention_mask=...) and
      returning per-class scores of shape (batch, n_classes).
    data_loader: Iterable of batches, each a dict with "review_text",
      "input_ids", "attention_mask" and "targets".
    device: Device the inputs are moved to. Defaults to the device of the
      model's first parameter (CPU when the model has no parameters), so the
      function no longer depends on a notebook-level `device` variable.

  Returns:
    (review_texts, predictions, prediction_probs, real_values): the list of
    review texts, then CPU tensors holding the predicted class per example,
    the softmax probabilities per example, and the targets. All are empty when
    the loader yields no batches.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

  model = model.eval()

  review_texts = []
  pred_batches = []
  prob_batches = []
  target_batches = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      probs = F.softmax(outputs, dim=1)

      review_texts.extend(d["review_text"])
      # Keep whole batches (moved to the CPU) and concatenate once at the end
      # instead of splitting every batch into per-example tensors.
      pred_batches.append(preds.cpu())
      prob_batches.append(probs.cpu())
      target_batches.append(d["targets"].cpu())

  if not pred_batches:
    # Nothing to concatenate: return empty results rather than raising.
    empty_labels = torch.empty(0, dtype=torch.long)
    return review_texts, empty_labels, torch.empty(0, 0), empty_labels.clone()

  predictions = torch.cat(pred_batches)
  prediction_probs = torch.cat(prob_batches)
  real_values = torch.cat(target_batches)
  return review_texts, predictions, prediction_probs, real_values

In [None]:
# Collect texts, predicted classes, softmax probabilities and labels for the attack loader.
attack_outputs = get_predictions(model, attack_data_loader)
y_review_texts, y_pred, y_pred_probs, attack_label_real = attack_outputs

Get the maximum softmax probability of each sample, which serves as the OOD score:

In [None]:
 # Per-sample maximum over the class axis of the softmax probabilities.
 prob = y_pred_probs.numpy().max(axis=1)
 prob

In [None]:
# Display the targets returned by get_predictions for the attack loader.
attack_label_real

In [None]:
from sklearn import metrics

# ROC curve of the max-softmax score, treating label 0 as the positive class,
# then the area under that curve.
fpr, tpr, thresholds = metrics.roc_curve(attack_label_real, prob, pos_label=0)
auroc = metrics.auc(fpr, tpr)
print('AUROC:', auroc)

In [None]:
from sklearn.metrics import average_precision_score

# Average precision of the max-softmax score, treating label 0 as the positive class.
aupr = average_precision_score(attack_label_real, prob, pos_label=0)
print('AUPR:', aupr)