In [1]:
!pip install transformers



### Necessary packages

In [2]:
import torch 
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader,SubsetRandomSampler
import torch.optim as optim

import os
import copy
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pylab import rcParams
import csv
import time
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from transformers import AutoTokenizer,AutoModel,AutoModelForSequenceClassification,AdamW,get_linear_schedule_with_warmup

# Seed every random number generator this notebook imports so runs are repeatable.
seed_val = 42
random.seed(seed_val)  # Python's `random` is imported above but was never seeded
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [3]:
# Pick the compute device once; `device` is a torch.device in both branches
# so downstream code can rely on attributes such as `device.type`.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Running on gpu",torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('No GPU found Running on cpu')

Running on gpu Tesla P100-PCIE-16GB


In [4]:
# Mount Google Drive at /content/drive (Colab-only); the dataset CSV is read from there.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# NOTE: despite its name, `dataset_dir` holds the path of a single CSV file, not a directory.
dataset_dir = "/content/drive/MyDrive/Amazon/dataset/new3.csv"

# index_col=None: no CSV column is promoted to the DataFrame index.
train_df = pd.read_csv(dataset_dir,index_col=None)

In [6]:
# Rebind instead of mutating in place, and tolerate already-removed columns,
# so re-running this cell does not raise KeyError.
train_df = train_df.drop(columns=['0', 'Unnamed: 0'], errors='ignore')

In [7]:
# Give the numerically named CSV columns meaningful names.
# errors='raise' makes a missing source column fail loudly.
column_names = {'1': 'TITLE', '3': 'BROWSE_NODE_ID'}
train_df = train_df.rename(columns=column_names, errors='raise')

In [8]:
train_df.shape

(264781, 3)

In [9]:
# Remove exact duplicate rows, then display the resulting shape.
train_df = train_df.drop_duplicates()
train_df.shape

(264781, 3)

In [10]:
# Keep only the rows that actually have a title.
train_df = train_df.dropna(subset=['TITLE'])

In [11]:
train_df['TITLE'].isnull().sum()

0

In [12]:
# Replace the BROWSE_NODE_ID values with integer class indices;
# `le` keeps the mapping back to the original ids.
le = LabelEncoder().fit(train_df['BROWSE_NODE_ID'])
train_df['BROWSE_NODE_ID'] = le.transform(train_df['BROWSE_NODE_ID'])
train_df['BROWSE_NODE_ID'].max()

9907

In [13]:
# Pull the model inputs (titles) and targets (encoded node ids) out as NumPy arrays.
sentences = train_df['TITLE'].to_numpy()
labels = train_df['BROWSE_NODE_ID'].to_numpy()

In [14]:
print(sentences.shape,labels.shape)

(264781,) (264781,)


In [15]:
# NOTE(review): repeats the Drive mount from the earlier cell; the recorded
# output reports the drive as already mounted.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Hold out 10% of the samples for validation; seeded so the split is repeatable.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences,
    labels,
    test_size=0.1,
    random_state=seed_val,
)

In [17]:
print(f"No. of training sentences {len(train_sentences)}")
print(f"No. of validation sentences {len(val_sentences)}")

No. of training sentences 238302
No. of validation sentences 26479


In [18]:
train_df.memory_usage(deep= True)*(1e-6)

Index               2.118248
TITLE             266.718228
2                  38.295362
BROWSE_NODE_ID      2.118248
dtype: float64

In [19]:
# indices , cnts = np.unique(labels,return_counts=True)

In [20]:
# sns.countplot(y = cnts[ (cnts >=10) & (cnts <=100)] )

In [21]:
# Checkpoint name handed to AutoTokenizer / AutoModel.from_pretrained.
model_name = 'bert-base-multilingual-cased'
# max_length passed to the tokenizer for every title.
max_input_length = 128
# Batch size used for both the training and validation DataLoader.
batch_size = 64

### Data Preprocessing

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [23]:
# Inspect how one title is split into tokens and mapped to vocabulary ids.
idx = 100000
sample_text = sentences[idx]
# tokenize() applies no max_length here; the recorded output below warns
# that this sample exceeds the model's maximum sequence length.
tokens =tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print('Sample text {}'.format(sample_text))
print('Tokens {}'.format(tokens))
print('Token IDS {}'.format(token_ids))

Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors


Sample text 0.057975 0.026596 -0.010622 0.061523 0.085003 0.045561 -0.025591 -0.11164 -0.078361 0.089149 0.091196 0.078409 0.07734 0.042945 -0.003196 0.04448 -0.01521 -0.059665 -0.00032686 -0.052502 0.047911 -0.0087723 -0.036279 0.025851 0.034307 -0.049543 0.058067 0.061216 0.013107 -0.045237 0.030591 -0.022425 0.031103 -0.0013939 0.034915 -0.030469 -0.0092505 -0.0098639 0.0096247 -0.028933 -0.072186 0.081712 0.03693 -0.051753 0.045296 0.035664 0.067193 -0.023226 0.0229 -0.055481 -0.12465 0.013372 -0.010935 -0.013963 -0.093246 -0.069558 -0.033261 0.032664 0.063967 0.0063827 -0.013734 0.043319 -0.11166 0.029856 0.036687 0.045824 0.054759 0.02211 -0.024031 0.08427 0.10079 0.02853 -0.01582 -0.056987 0.068676 0.1076 -0.020674 -0.12757 0.066519 0.0030416 0.0016169 0.018566 -0.15164 -0.035377 0.0036236 0.054584 0.043646 -0.061214 -0.047801 0.0044398 -0.10973 0.019909 0.070821 0.046859 -0.039432 -0.00045641 0.011947 -0.055183 0.042343 -0.044559 
Tokens ['0', '.', '05', '##7', '##97', '##5', '

In [24]:
tokenizer.sep_token,tokenizer.sep_token_id

('[SEP]', 102)

In [25]:
tokenizer.cls_token,tokenizer.cls_token_id

('[CLS]', 101)

In [26]:
tokenizer.pad_token,tokenizer.pad_token_id

('[PAD]', 0)

In [27]:
tokenizer.unk_token,tokenizer.unk_token_id

('[UNK]', 100)

In [28]:
# Encode the sample exactly as the dataset does: pad to max_input_length and
# truncate anything longer. `padding=`/`truncation=` replace the deprecated
# `pad_to_max_length` flag and the implicit truncation warned about below.
encoding = tokenizer(
    sample_text,
    max_length=max_input_length,
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt',
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [29]:
encoding

{'input_ids': tensor([[   101,    121,    119,  10831,  11305, 100595,  11166,    121,    119,
          75302, 108710,  11211,    118,    121,    119,  49470,  11211,  71793,
            121,    119, 106084,  92161,  10884,    121,    119,  11052,  28847,
          10929,  10884,    121,    119,  98603,  11166,  11211,  10759,    118,
            121,    119,  73887, 108710,  10759,    118,    121,    119,  15821,
          51658,    118,    121,    119, 107573,  60878,  10759,    121,    119,
          11052,  74178,  99808,    121,    119,  11035,  37115,  11373,  11211,
            121,    119, 107573,  38109,  11373,    121,    119,  10878,  11305,
          78301,    121,    119,  97672,  11373,  76977,    118,    121,    119,
          26861,  54055,  11211,    121,    119, 101679,  32168,    118,    121,
            119,  75737,  47499,    118,    121,    119,  10831,  11373,  87372,
          11166,    118,    121,    119,  10259,  68430,  11211, 103450,    118,
            12

In [30]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [31]:
base_model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
base_model(**encoding)['pooler_output']

tensor([[ 5.1511e-02,  6.5430e-02,  1.1095e-01,  9.7697e-02,  8.1404e-02,
          1.1452e-01,  1.8644e-02,  1.4287e-01, -8.6418e-02,  5.2267e-02,
          1.5464e-01, -1.2392e-02,  5.3117e-02, -3.7797e-02,  1.1085e-01,
          1.2588e-02,  8.3484e-02, -7.0417e-02, -3.5017e-02, -5.4347e-02,
         -9.9998e-01, -2.6268e-02,  1.0919e-01, -1.7575e-02, -1.4697e-01,
         -1.2892e-01,  5.2459e-02, -3.4963e-02, -1.0806e-02,  8.1087e-02,
         -1.2304e-01, -9.9997e-01, -2.7054e-01,  1.1235e-01, -2.7089e-03,
          7.0055e-03, -7.7862e-02,  3.2846e-03,  4.5763e-02, -2.1460e-01,
         -3.2374e-02, -8.4805e-02,  7.0363e-02, -3.5499e-03, -2.8374e-03,
         -4.9908e-02, -8.6526e-02,  9.8377e-03, -1.3654e-01,  9.1327e-02,
          1.1719e-01,  7.8216e-02,  2.1621e-01,  9.7141e-02, -1.3358e-03,
         -1.2015e-01,  7.6611e-02,  5.7900e-02,  1.0300e-01,  8.0755e-02,
         -6.0743e-02, -2.6530e-02, -8.9517e-02, -7.0763e-03, -3.2200e-02,
         -1.0450e-01, -8.8776e-02,  4.

## Choosing token length

In [33]:
# token_lens = []
# for txt in sentences:
#     tokens = tokenizer.encode(txt,max_length=512)
#     token_lens.append(len(tokens))

In [34]:
# sns.displot(token_lens)

In [35]:
class AmazonDataset(Dataset):
  """Map-style dataset that tokenizes one sentence per item on demand.

  Args:
    sentences: indexable collection of texts; each item is passed through ``str``.
    labels: indexable collection of integer class ids aligned with ``sentences``.
      Only read when ``with_labels`` is true.
    tokenizer: callable tokenizer accepting the Hugging Face style keyword
      arguments used in ``__getitem__`` and returning a mapping with
      ``'input_ids'`` and ``'attention_mask'`` tensors.
    max_length: length every sequence is padded or truncated to.
    with_labels: when true, each item also carries a ``'labels'`` tensor.
  """

  def __init__(self, sentences, labels, tokenizer, max_length,with_labels=True):
    self.sentences = sentences
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.with_labels = with_labels

  def __len__(self):
    """Return the number of sentences."""
    return len(self.sentences)

  def __getitem__(self, idx):
    """Tokenize sentence ``idx``.

    Returns:
      dict with ``'sentence'`` (str), flattened ``'input_ids'`` and
      ``'attention_mask'`` tensors and, if ``with_labels`` is set,
      ``'labels'`` as a ``torch.long`` scalar tensor.
    """
    sentence = str(self.sentences[idx])
    # Explicit padding/truncation replaces the deprecated `pad_to_max_length`
    # flag and the implicit (warning-emitting) truncation, so every item has
    # the same length and can be batched by the default collate function.
    encoding = self.tokenizer(
      sentence,
      add_special_tokens=True,
      max_length=self.max_length,
      padding='max_length',
      truncation=True,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt',
    )

    # return_tensors='pt' yields a leading batch dimension; flatten drops it.
    item = {
      'sentence': sentence,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
    }
    if self.with_labels:
      item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
    return item

In [36]:
def create_data_loaders(sentences,labels,tokenizer,max_input_length,batch_size,with_labels):
    """Wrap the given texts in an AmazonDataset and return a DataLoader over it.

    The loader batches `batch_size` items at a time; all other DataLoader
    options are left at their defaults.
    """
    dataset = AmazonDataset(
        sentences,
        labels,
        tokenizer,
        max_length=max_input_length,
        with_labels=with_labels,
    )
    loader = DataLoader(dataset, batch_size=batch_size)
    return loader

In [37]:
# The training loader is built directly so it can reshuffle every epoch;
# create_data_loaders has no shuffle option and always iterates in dataset
# order, which would replay the exact same batches each epoch.
train_loader = DataLoader(
    AmazonDataset(
        sentences=train_sentences,
        labels=train_labels,
        tokenizer=tokenizer,
        max_length=max_input_length,
        with_labels=True,
    ),
    batch_size=batch_size,
    shuffle=True,
)

# Validation keeps a fixed order.
val_loader = create_data_loaders(
    val_sentences,
    val_labels,
    tokenizer,
    max_input_length=max_input_length,
    batch_size=batch_size,
    with_labels=True,
)

In [38]:
class AmazonClassifier(nn.Module):
  """Pretrained encoder followed by dropout and a linear classification layer."""

  def __init__(self,base_model_name, n_classes):
    """Load the encoder named `base_model_name` and size the head for `n_classes`."""
    super().__init__()
    encoder = AutoModel.from_pretrained(base_model_name)
    self.base_model = encoder
    self.drop = nn.Dropout(p=0.3)
    # The head maps the encoder's hidden size to one logit per class.
    self.out = nn.Linear(encoder.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the classification logits computed from the encoder's pooled output."""
    encoder_outputs = self.base_model(
      input_ids=input_ids,
      attention_mask=attention_mask,
    )
    regularized = self.drop(encoder_outputs['pooler_output'])
    logits = self.out(regularized)
    return logits

In [39]:
# One output logit per distinct encoded label.
model = AmazonClassifier(model_name, np.unique(labels).size)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
model.to(device)

AmazonClassifier(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [41]:
num_epochs = 3

# Multi-class objective applied to the classifier's raw logits.
loss_fn = nn.CrossEntropyLoss().to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps,
)

In [42]:
def train_epoch(
  model, 
  data_loader, 
  loss_fn, 
  optimizer, 
  device, 
  scheduler, 
  n_examples
):
  """Run one optimisation pass over `data_loader`.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` and
      returning per-class scores of shape (batch, n_classes).
    data_loader: sized iterable of dicts with "input_ids", "attention_mask"
      and "labels" tensors.
    loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): accuracy is a 0-dim float64 tensor on the CPU,
    mean_loss is the mean of the per-batch losses (NaN if there were no batches).
  """
  model = model.train()

  losses = []
  # Plain Python int: an empty loader no longer crashes on `int.double()`,
  # and the returned accuracy does not stay on the GPU.
  correct_predictions = 0
  n_batches = len(data_loader)

  for i,d in enumerate(data_loader):
    if i%100 == 0:
        print(f"Processing batch {i+1}/{n_batches}")
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    labels = d["labels"].to(device)

    # Start every step from clean gradients, whatever ran before this call.
    optimizer.zero_grad()

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)

    correct_predictions += int((preds == labels).sum().item())
    losses.append(loss.item())

    loss.backward()
    # Clip the global gradient norm before the parameter update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

  accuracy = torch.tensor(correct_predictions, dtype=torch.double) / n_examples
  mean_loss = np.mean(losses) if losses else float('nan')
  return accuracy, mean_loss

In [43]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score `model` on `data_loader` without updating any weights.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` and
      returning per-class scores of shape (batch, n_classes).
    data_loader: sized iterable of dicts with "input_ids", "attention_mask"
      and "labels" tensors.
    loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): accuracy is a 0-dim float64 tensor on the CPU,
    mean_loss is the mean of the per-batch losses (NaN if there were no batches).
  """
  model = model.eval()

  losses = []
  # Plain Python int: an empty loader no longer crashes on `int.double()`,
  # and the returned accuracy does not stay on the GPU.
  correct_predictions = 0
  n_batches = len(data_loader)

  with torch.no_grad():
    for i,d in enumerate(data_loader):
      if i%100 == 0:
          print(f"Processing batch {i+1}/{n_batches}")
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      labels = d["labels"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, labels)

      correct_predictions += int((preds == labels).sum().item())
      losses.append(loss.item())

  accuracy = torch.tensor(correct_predictions, dtype=torch.double) / n_examples
  mean_loss = np.mean(losses) if losses else float('nan')
  return accuracy, mean_loss

In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in tqdm(range(num_epochs)):

  print(f'Epoch {epoch + 1}/{num_epochs}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(train_labels)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    val_loader,
    loss_fn, 
    device, 
    len(val_labels)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc



Epoch 1/3
----------
Processing batch 1/3724
Processing batch 101/3724
Processing batch 201/3724
Processing batch 301/3724
Processing batch 401/3724
Processing batch 501/3724
Processing batch 601/3724
Processing batch 701/3724
Processing batch 801/3724
Processing batch 901/3724
Processing batch 1001/3724
Processing batch 1101/3724
Processing batch 1201/3724
Processing batch 1301/3724
Processing batch 1401/3724
Processing batch 1501/3724
Processing batch 1601/3724
Processing batch 1701/3724
Processing batch 1801/3724


In [None]:
# Convert every entry to float first: the history may hold 0-dim tensors on
# the GPU, which matplotlib cannot turn into NumPy values.
fig, ax = plt.subplots()
ax.plot([float(acc) for acc in history['train_acc']], label='train accuracy')
ax.plot([float(acc) for acc in history['val_acc']], label='validation accuracy')

ax.set_title('Training history')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend()
ax.set_ylim([0, 1]);

In [None]:
def get_predictions(model, data_loader, device=None):
  """Run `model` over `data_loader` and collect its predictions.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` and
      returning per-class scores of shape (batch, n_classes).
    data_loader: iterable of dicts with "sentence", "input_ids",
      "attention_mask" and "labels" entries.
    device: device the inputs are moved to. Defaults to the device of the
      model's first parameter (CPU if the model has no parameters).

  Returns:
    (sentences, predictions, prediction_probs, real_values):
      sentences is a list of the input texts; predictions holds the argmax
      class per example; prediction_probs the softmax over classes per
      example; real_values the labels. All tensors are on the CPU.
  """
  model = model.eval()

  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

  sentences = []
  pred_batches = []
  prob_batches = []
  label_batches = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      probs = torch.softmax(outputs, dim=1)

      sentences.extend(d["sentence"])
      # Move each batch to the CPU right away so the full probability matrix
      # never has to be held in device memory.
      pred_batches.append(preds.cpu())
      prob_batches.append(probs.cpu())
      label_batches.append(d["labels"].cpu())

  if not pred_batches:
    # Nothing to concatenate for an empty loader.
    empty_ids = torch.empty(0, dtype=torch.long)
    return sentences, empty_ids, torch.empty(0, 0), empty_ids.clone()

  predictions = torch.cat(pred_batches)
  prediction_probs = torch.cat(prob_batches)
  real_values = torch.cat(label_batches)
  return sentences, predictions, prediction_probs, real_values