# Important Word Detection

1. Load the dataset
2. Pick the first 60 documents for now
3. Split the review texts into sentences
4. Drop each word of a sentence in turn

In [3]:
import pandas as pd
import json
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

## 1. Load Data

In [4]:
def load_jsonline(filename, limit):
    """Read the leading records of a JSON-lines file.

    Each line of `filename` is parsed with `json.loads` and collected in order.

    Note: reading stops only once the 1-based line number exceeds `limit`,
    *after* that line has been stored, so up to `limit + 1` records are
    returned (fewer if the file is shorter).

    Args:
        filename: path of the JSON-lines file.
        limit: threshold for the line counter, see the note above.

    Returns:
        list of the decoded Python objects.
    """
    records = []
    with open(filename) as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            records.append(json.loads(raw_line))
            if line_number > limit:
                break
    return records

## 2. Pick first 60 examples for now

In [5]:
# limit=59 yields 60 records: load_jsonline stops only after its line counter exceeds the limit.
data = load_jsonline('../data/items_reviews_18.jl', 59)

In [6]:
print(data[0])

{'target_id': 3321611, 'source_id': 277605655, 'title': 'This place is incredible!', 'text': 'I visited this b&b during a short trip to ride the famous Belgian pavé and it was perfect. The owners were really lovely people, the room was very comfortable and the breakfast was a delicious feast- ideal for big days out on the bicycle! It is in a really good location for riding or driving into Oudenaarde (approx 10 mins) and there are some brilliant restaurants close by. I cannot recommend this place enough!', 'user_rating': 5, 'lang': 'en', '_type': 'TripAdvisorHotelReviewItem'}


## 3. Sentence Splitting
- build a list with the review texts
- build a list with the split sentences

In [8]:
# Collect the raw review text of every loaded record
sentences = [record["text"] for record in data]

In [10]:
# List of single sentences found in all available review texts.
# Every review is split on its own: gluing the reviews together with ". "
# first would inject a spurious period after reviews that already end in
# punctuation and would make sentence boundaries depend on the neighbouring review.
sentence_list = [
    sentence
    for review_text in sentences
    for sentence in sent_tokenize(review_text)
]

In [11]:
sentence_list[:2]

['I visited this b&b during a short trip to ride the famous Belgian pavé and it was perfect.',
 'The owners were really lovely people, the room was very comfortable and the breakfast was a delicious feast- ideal for big days out on the bicycle!']

In [12]:
print('Lenght sentences:', len(sentences),'\n', 'Length sentence_list:', len(sentence_list))

Lenght sentences: 60 
 Length sentence_list: 435


## 4. Input Reduction

- Make a list of lists holding the tokenized sentences
- Check its length
- For each sentence, go over every token position and remove the token at that index
- Append each reduced sentence to a list

In [13]:
# List of lists: one token list per sentence, split on single spaces
# (iterates the sentences directly instead of maintaining a manual index)
tok_sentences = [sentence.split(' ') for sentence in sentence_list]

len(tok_sentences)


435

In [14]:
print(tok_sentences[:2])

[['I', 'visited', 'this', 'b&b', 'during', 'a', 'short', 'trip', 'to', 'ride', 'the', 'famous', 'Belgian', 'pavé', 'and', 'it', 'was', 'perfect.'], ['The', 'owners', 'were', 'really', 'lovely', 'people,', 'the', 'room', 'was', 'very', 'comfortable', 'and', 'the', 'breakfast', 'was', 'a', 'delicious', 'feast-', 'ideal', 'for', 'big', 'days', 'out', 'on', 'the', 'bicycle!']]


In [15]:
def detokenize(tok_sentence):
    """Join a sequence of tokens back into one string, separated by single spaces."""
    return ' '.join(tok_sentence)

In [16]:
def get_token_dropped_sentence_at_pos(sent, token):
    """Return a copy of the token list `sent` with the element at index `token` removed.

    The input list is not modified. Despite its name, `token` is a position
    in the list, not the word itself.
    """
    remaining = sent.copy()
    del remaining[token]
    return remaining

In [None]:
# For every tokenized sentence, build one "package" holding the original
# sentence plus all variants in which exactly one token has been dropped.
sentence_packages = []
for tokens in tok_sentences:
    original_sentence = detokenize(tokens)
    # One (dropped word, sentence without that word) pair per token position
    modified_sentences = [
        (dropped_word, detokenize(get_token_dropped_sentence_at_pos(tokens, position)))
        for position, dropped_word in enumerate(tokens)
    ]
    sentence_packages.append(
        {
            'original_sentence': original_sentence,
            'modified_sentences': modified_sentences,
        }
    )

In [18]:
len(sentence_packages)

435

In [19]:
print(sentence_packages[-1])

{'original_sentence': 'All in all, a gem!', 'modified_sentences': [('All', 'in all, a gem!'), ('in', 'All all, a gem!'), ('all,', 'All in a gem!'), ('a', 'All in all, gem!'), ('gem!', 'All in all, a')]}


Juhuuuuu :D

## 5. Predict with BERT for Sentiment Classification

In [22]:
import torch
import random
import numpy as np

SEED = 42

# Seed every random number generator in use so repeated runs are reproducible.
# (A stray copy of the `sentence_packages.append(...)` call used to sit here and
# appended a duplicate of the last package each time this cell ran.)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [23]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
len(tokenizer.vocab)

30522

In [24]:
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')
print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [25]:
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [26]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [27]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [28]:
# Look up the maximum input size registered for the 'bert-base-uncased' checkpoint.
# NOTE(review): `max_model_input_sizes` looks like a legacy attribute in newer
# transformers releases; presumably `tokenizer.model_max_length` is the
# replacement — confirm before upgrading the library.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

512


In [29]:
def tokenize_and_cut(sentence):
    """Tokenize `sentence` with the module-level `tokenizer` and truncate the result.

    At most `max_input_length - 2` tokens are kept, which leaves room for two
    extra tokens within `max_input_length`.
    """
    return tokenizer.tokenize(sentence)[:max_input_length - 2]

In [30]:
from transformers import BertTokenizer, BertModel
bert = BertModel.from_pretrained('bert-base-uncased')

In [31]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """Classifier that feeds BERT embeddings through a GRU and a linear output layer.

    Args:
        bert: model called as ``bert(text)``; element 0 of its output is used as
            the embedded sequence and ``bert.config.to_dict()['hidden_size']``
            provides the embedding width.
        hidden_dim: size of the GRU hidden state.
        output_dim: size of the final linear output.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability applied to the final hidden state and,
            when ``n_layers >= 2``, between the GRU layers.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']

        # Dropout between GRU layers is only enabled with at least two layers.
        rnn_dropout = dropout if n_layers >= 2 else 0
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=rnn_dropout,
        )

        # A bidirectional GRU contributes a forward and a backward final state.
        classifier_inputs = hidden_dim * 2 if bidirectional else hidden_dim
        self.out = nn.Linear(classifier_inputs, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Compute the output for a batch of token ids.

        Args:
            text: tensor of shape [batch size, sent len].

        Returns:
            tensor of shape [batch size, out dim].
        """
        # The BERT call runs without gradient tracking.
        with torch.no_grad():
            embedded = self.bert(text)[0]  # [batch size, sent len, emb dim]

        # hidden: [n layers * n directions, batch size, hid dim]
        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Concatenate the last two entries of `hidden` along the feature axis
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_state = hidden[-1]

        # Dropout on the final state, then project to [batch size, out dim]
        return self.out(self.dropout(final_state))

In [32]:
# Hyperparameters of the GRU head that sits on top of BERT.
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

# NOTE(review): no `load_state_dict` call is visible in this notebook, so the GRU
# and output layer appear to keep their random initial weights — confirm that
# trained weights are loaded before interpreting the predictions.
model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [None]:
def predict_sentiment(model, tokenizer, sentence):
    """Score a single sentence with `model` and return the sigmoid of its output.

    The sentence is tokenized, truncated to `max_input_length - 2` tokens and
    wrapped in the tokenizer's CLS / SEP ids before being fed to the model as a
    batch of size one.

    Args:
        model: torch module mapping a [1, sent len] LongTensor to a single value.
        tokenizer: object providing `tokenize`, `convert_tokens_to_ids`,
            `cls_token_id` and `sep_token_id`.
        sentence: the text to score.

    Returns:
        float in [0, 1]: the sigmoid of the model output.
    """
    model.eval()

    # Run on whatever device the model lives on instead of relying on a global
    # `device` (defined in a later cell) that the model was never moved to.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    tokens = tokenizer.tokenize(sentence)[:max_input_length - 2]
    # Take the special-token ids from the tokenizer that was actually passed in.
    indexed = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokens) + [tokenizer.sep_token_id]
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(model_device)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# For every sentence, find the word whose removal shifts the prediction the most.
important_words = []

for package in sentence_packages:
    # Reference score of the untouched sentence
    original_result = predict_sentiment(model, tokenizer, package['original_sentence'])
    highest_relative = 0
    highest_relative_word = None

    for word, sentence in package['modified_sentences']:
        # Absolute change in the score caused by dropping `word`
        relative = abs(original_result - predict_sentiment(model, tokenizer, sentence))
        # Careful: the comparison is non-strict (>=), so on ties the later word wins
        if relative >= highest_relative:
            highest_relative, highest_relative_word = relative, word

    important_words.append(highest_relative_word)

# Exactly one word (or None) per sentence package
assert len(important_words) == len(sentence_packages)
print(important_words)

In [37]:
%store important_words
%store sentence_packages

Stored 'important_words' (list)
Stored 'sentence_packages' (list)
