# Prediciton with BERT Models

In [16]:
%store -r original_sentences
%store -r modified_sentences

1. Model import

In [17]:
import torch
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
bert = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

In [30]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

In [28]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [21]:
import torch.nn as nn
class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        super().__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, text):
        #text = [batch size, sent len]    
        with torch.no_grad():
            embedded = self.bert(text)[0]      
            #embedded = [batch size, sent len, emb dim]
        _, hidden = self.rnn(embedded)
        #hidden = [n layers * n directions, batch size, emb dim]
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        #hidden = [batch size, hid dim]
        output = self.out(hidden)
        #output = [batch size, out dim]
        return output

In [22]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [39]:
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    x = model(tensor)
    # prediction = torch.sigmoid(model(tensor))
    # prediction = torch.sigmoid(x)
    # return prediction.item()
    return x
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



Was ich habe:
- liste mit original Sätzen
- liste mit modifizierten Sätzen

Was ich brauche
- Dict:

Wie mach ich das:
{
  'original': ''
  'modified': ''
  'prediction_origial: ''
  'predictions_modified: ''
}



In [40]:
original_predictions = []

In [32]:
?predict_sentiment

[0;31mSignature:[0m [0mpredict_sentiment[0m[0;34m([0m[0mmodel[0m[0;34m,[0m [0mtokenizer[0m[0;34m,[0m [0msentence[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/code/adv-absa/helper/<ipython-input-23-d36f4d6fafd8>
[0;31mType:[0m      function


In [41]:
for sentence in original_sentences:
    original_predictions.append(predict_sentiment(bert, tokenizer, sentence))

KeyboardInterrupt: 

In [42]:
print(predict_sentiment(bert, tokenizer, 'hello ich bin nora'))

(tensor([[-0.4498, -1.0759, -0.1929,  0.1885,  1.2633]],
       grad_fn=<AddmmBackward>),)


In [53]:
def listtodict(original_sentences, original_predictions, modified_sentences, modified_predictions):
    predictions = zip(original_sentences, original_predictions, modified_sentences, modified_predictions)
    return predictions

In [54]:
listtodict(original_sentences)

TypeError: listtodict() missing 3 required positional arguments: 'original_predictions', 'modified_sentences', and 'modified_predictions'

In [None]:
for sentence in original_sentences:
    original_predictions.append(predict_sentiment(model, tokenizer, sentence))

In [None]:
def listtodict(original_sentences, modified_sentences):
    predictions = zip(original_sentences, modified_sentences)
    return predictions