In [1]:
! pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2


In [2]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import torch.optim as optim
 
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
#import logging
#logging.basicConfig(level=logging.INFO)
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
from argparse import Namespace
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from tqdm import tqdm as tq
from tqdm import notebook
 



In [3]:
dat = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
dat

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
#Данные сбалансированы по классам.
(dat.sentiment=='positive').sum()

25000

In [5]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(dat, test_size=0.20, random_state=42)
data_train['split']='train'
data_val['split']='val'
data_with_split=pd.concat([data_train, data_val], ignore_index=True)
data_with_split

Unnamed: 0,review,sentiment,split
0,That's what I kept asking myself during the ma...,negative,train
1,I did not watch the entire movie. I could not ...,negative,train
2,A touching love story reminiscent of In the M...,positive,train
3,This latter-day Fulci schlocker is a totally a...,negative,train
4,"First of all, I firmly believe that Norwegian ...",negative,train
...,...,...,...
49995,Although Casper van Dien and Michael Rooker ar...,negative,val
49996,I liked this movie. I wasn't really sure what ...,positive,val
49997,Yes non-Singaporean's can't see what's the big...,positive,val
49998,"As far as films go, this is likable enough. En...",negative,val


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 2602489.70B/s]


In [7]:
class IMDBDataset(Dataset):
    def __init__(self, IMDB_df, max_seq_length):
        """
        Args:
            IMDB_df (pandas.DataFrame): the dataset with bert_tokens
            
        """
        self.IMDB_df = IMDB_df 
        
        self._max_seq_length = max_seq_length

        self.train_df = self.IMDB_df[self.IMDB_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.IMDB_df[self.IMDB_df.split=='val']
        self.validation_size = len(self.val_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 
                             'val': (self.val_df, self.validation_size)}

        self.set_split('train')
       

    def set_split(self, split="train"):
        self._data_split = split
        self._data_df, self._data_size = self._lookup_dict[split]

    def __len__(self):
        return self._data_size

    def __getitem__(self, index):
        
        row = self._data_df.iloc[index]
        review=row['review']
        tokens=tokenizer.tokenize(review)
        if len(tokens)<self._max_seq_length:
            token_index=tokenizer.convert_tokens_to_ids(tokens)
            pad=[0]*(self._max_seq_length-len(tokens))
            token_index=token_index+pad
        else:
            token_index=tokenizer.convert_tokens_to_ids(tokens[:self._max_seq_length])
            
        data_vector = torch.LongTensor(token_index) 
        
        target = int(row.sentiment=='positive')
        
            

        return {'x_data': data_vector, 
                'y_target': torch.LongTensor([target]),
                'x_length': len(tokens)}

    def get_num_batches(self, batch_size):
        
        return len(self) // batch_size


In [8]:
data=IMDBDataset(data_with_split, 256)

In [9]:
(data.__getitem__(2)).items()

dict_items([('x_data', tensor([ 1037,  7244,  2293,  2466, 14563,  1997,  1999,  1996,  6888,  2005,
         2293,  1005,  1012,  5059,  4600,  2006,  2822,  4623,  1998,  2129,
         2023,  2003,  2109,  2011,  2789,  2111,  2000, 10639,  5346,  2000,
         2169,  2060,  1010,  1996,  2466,  7679,  2006,  1037,  2082, 24741,
         2040,  4122,  2061,  2172,  2000,  2022,  1037,  2944,  3836,  2004,
         2092,  2004,  1037,  2204,  3129,  1998,  2269,  1012,  1037,  3026,
         3076,  2003,  2200,  6296,  2000,  2032,  1012,  2004,  1996,  2466,
         4895, 10371,  2015,  2057,  2156,  1996,  6699,  2917,  1996,  3302,
         1999,  2010,  2322,  2095,  3510,  1998,  2129,  2002, 24665, 23804,
         2015,  2007,  1996,  7191, 21883,  2015,  2008,  2227,  2032,  1012,
         1037,  3376,  1998,  3048,  2466,  1012,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0

In [10]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
            
        yield out_data_dict

In [11]:
gen=generate_batches(data, 16)

In [12]:
next(gen)

{'x_data': tensor([[ 1045,  2245,  8909,  ...,  2187,  2077,  1996],
         [ 1045,  2031,  2000,  ...,  2572,  2025,  1037],
         [ 1045,  2387,  1062,  ...,  2013,  1996,  4378],
         ...,
         [ 1996, 13972,  3727,  ...,     0,     0,     0],
         [ 3172,  2001,  1037,  ...,  1997,  1037,  2843],
         [ 2111,  3422,  5691,  ...,  1005, 11598,  2015]]),
 'y_target': tensor([[0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1]]),
 'x_length': tensor([512, 381, 791, 276, 167, 418, 268, 255, 158, 189, 160, 752, 380,  63,
         628, 335])}

In [13]:
class BertForSequenceClassification(nn.Module):
  
    def __init__(self, num_labels=1):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels 
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(config.hidden_dropout_prob) 
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) 
        pooled_output = self.dropout(pooled_output) 
        logits = self.classifier(pooled_output)
        
        return logits

In [14]:
BertModel.from_pretrained('bert-base-uncased')

100%|██████████| 407873900/407873900 [00:08<00:00, 48785224.63B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (den

In [15]:
config=Namespace(
hidden_dropout_prob=0.1,
hidden_size=768
)

In [16]:
model=BertForSequenceClassification()
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=Fa

In [17]:
def compute_accuracy(y_prd, y_trgt, device):
    y_trgt=y_trgt.to(device)
        
    preds = (torch.sigmoid(y_prd)>=0.5).to(torch.LongTensor()).to(device)
    all_cor=torch.sum(preds*y_trgt, dim=0)
    
    return all_cor/torch.sum(y_trgt, dim=0)

In [18]:
args = Namespace(
    # Training hyper parameter
    num_epochs=5,
    learning_rate=1e-3,
    batch_size=16,
    seed=1337,
    #early_stopping_criteria=5,
    cuda=True
    )

In [19]:
def make_train_state(args):
    return {'stop_early': False,
            #'early_stopping_step': 0,
            #'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': []
            }
            

In [20]:
make_train_state(args)

{'stop_early': False,
 'learning_rate': 0.001,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': []}

In [21]:
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))

Using CUDA: True


In [22]:
args.device

device(type='cuda')

Обучение и валидация модели.

In [23]:
classifier = model.to(args.device)    
#loss_func = nn.CrossEntropyLoss()
loss_func=torch.nn.BCEWithLogitsLoss()
#optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
lrlast = .001
lrmain = .00001
optimizer = optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},
       
   ])

sig=nn.Sigmoid()
                                

train_state = make_train_state(args)

epoch_bar = notebook.tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

data.set_split('train')
train_bar = notebook.tqdm(desc='split=train',
                          total=data.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
data.set_split('val')
val_bar = notebook.tqdm(desc='split=val',
                        total=data.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        data.set_split('train')
        batch_generator = generate_batches(data, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            n_batch=batch_index
            # the training routine is these 5 steps:

            # --------------------------------------    
            # step 1. zero the gradients
                        
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])
            
            # step 3. compute the loss
            
            loss = loss_func(y_pred, batch_dict['y_target'].float())
    
            #running_loss += (loss.item() - running_loss) / (batch_index + 1)
            running_loss += loss.item()

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], args.device)
            #running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_acc += acc_t

            # update bar
            train_bar.set_postfix(loss=running_loss/(batch_index + 1), acc=running_acc/(batch_index + 1), epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss/(n_batch+1))
        train_state['train_acc'].append(running_acc/(n_batch+1))

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on

        data.set_split('val')
        batch_generator = generate_batches(data, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()
        

        for batch_index, batch_dict in enumerate(batch_generator):
            n_batch=batch_index
            # compute the output
            y_pred = classifier(batch_dict['x_data'])
            
            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'].float())
            #running_loss += (loss.item() - running_loss) / (batch_index + 1)
            running_loss += loss.item()
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], args.device)
            #running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_acc += acc_t
            
            val_bar.set_postfix(loss=running_loss/(batch_index + 1), acc=running_acc/(batch_index + 1), epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss/(n_batch+1))
        train_state['val_acc'].append(running_acc/(n_batch+1))

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

        if train_state['stop_early']:
            break
            
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/5 [00:00<?, ?it/s]

split=train:   0%|          | 0/2500 [00:00<?, ?it/s]

split=val:   0%|          | 0/625 [00:00<?, ?it/s]

Exiting loop


In [24]:
train_state['val_acc']

[tensor([0.9211], device='cuda:0'),
 tensor([0.9143], device='cuda:0'),
 tensor([0.9320], device='cuda:0')]