# Important Word Detection

1. Load Dataset 
2. pick 50 Documents for now
3. make sentence splitting
4. drop each word after the other

In [11]:
import pandas as pd
import json
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize
import os

import utils.text_processing as tp

## 1. Load Data

In [9]:
def load_jsonline(filename, limit):
    data = []
    with open(filename) as f:
        counter = 0
        for line in f:
            counter += 1
            py_obj = json.loads(line)
            data.append(py_obj)
            if counter > limit:
                break
    return data

## 2. Pick first 60 examples for now

In [12]:
data = load_jsonline(os.path.join('data', 'items_reviews_18.jl'), 59)

In [13]:
print(data[0])

{'target_id': 3321611, 'source_id': 277605655, 'title': 'This place is incredible!', 'text': 'I visited this b&b during a short trip to ride the famous Belgian pavé and it was perfect. The owners were really lovely people, the room was very comfortable and the breakfast was a delicious feast- ideal for big days out on the bicycle! It is in a really good location for riding or driving into Oudenaarde (approx 10 mins) and there are some brilliant restaurants close by. I cannot recommend this place enough!', 'user_rating': 5, 'lang': 'en', '_type': 'TripAdvisorHotelReviewItem'}


## 3. Sentence Splitting
- have list with text items
- have list with splitted sentences

In [14]:
# List of Text form Reviews
sentences = []
for obj in data:
    sentences.append(obj["text"])

In [15]:
# List of single Sentences found in all available Text
sentence_list = sent_tokenize(". ".join(sentences))

In [16]:
sentence_list[:2]

['I visited this b&b during a short trip to ride the famous Belgian pavé and it was perfect.',
 'The owners were really lovely people, the room was very comfortable and the breakfast was a delicious feast- ideal for big days out on the bicycle!']

In [17]:
print('Lenght sentences:', len(sentences),'\n', 'Length sentence_list:', len(sentence_list))

Lenght sentences: 60 
 Length sentence_list: 435


## 4. Input Reduction

- Make a List with List with tokenized sentences
- check length
- go over one item (length) many times and remove item at index
- append item to list

In [18]:
# List of Lists of tokenized sentences
tok_sentences = []
i = 0
for sentence in sentence_list:
    tok_sentences.append(sentence_list[i].split(' '))
    i += 1

len(tok_sentences)


435

In [19]:
print(tok_sentences[:2])

[['I', 'visited', 'this', 'b&b', 'during', 'a', 'short', 'trip', 'to', 'ride', 'the', 'famous', 'Belgian', 'pavé', 'and', 'it', 'was', 'perfect.'], ['The', 'owners', 'were', 'really', 'lovely', 'people,', 'the', 'room', 'was', 'very', 'comfortable', 'and', 'the', 'breakfast', 'was', 'a', 'delicious', 'feast-', 'ideal', 'for', 'big', 'days', 'out', 'on', 'the', 'bicycle!']]


In [20]:
def detokenize(tok_sentence):
    sentence = ' '.join(tok_sentence)
    return sentence

In [21]:
def get_token_dropped_sentence_at_pos(sent,token):
    tok_mod_sentence = sent.copy()    
    tok_mod_sentence.pop(token)
    return tok_mod_sentence

In [22]:
# go over the list of tokens in a sentence
# and drop each word after the other
# go over sentences in list of tokenized sentences
sentence_packages = []
for sent in range(len(tok_sentences)):
    original_sentence = detokenize(tok_sentences[sent])
    modified_sentences = []
# go over token in sentence
    for token in range(len(tok_sentences[sent])):
        tok_mod_sentence = get_token_dropped_sentence_at_pos(tok_sentences[sent], token)
        modified_sentences.append((tok_sentences[sent][token], detokenize(tok_mod_sentence)))
    sentence_packages.append(
        {
            'original_sentence':original_sentence,
            'modified_sentences':modified_sentences
        }        
    )

In [23]:
len(sentence_packages)

435

In [24]:
print(sentence_packages[-1])

{'original_sentence': 'All in all, a gem!', 'modified_sentences': [('All', 'in all, a gem!'), ('in', 'All all, a gem!'), ('all,', 'All in a gem!'), ('a', 'All in all, gem!'), ('gem!', 'All in all, a')]}


Juhuuuuu :D

# 5. Predict with BERT for Sentiment Classification

In [25]:
import torch
import random
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import utils.text_processing as tp


tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model.eval();
len(tokenizer.vocab)

105879

In [26]:
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')
print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [27]:
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[29155, 10228, 12548, 10320, 10855, 136]


In [28]:
important_words = []

for package in sentence_packages:
    original_sentence = package['original_sentence']
    # print('new package: ' + original_sentence)

    original_result = tp.predict_sentiment(model, tokenizer, original_sentence)
    highest_relative = 0
    highest_relative_word = None

    for item in package['modified_sentences']:
        word = item[0]
        sentence = item[1]
        modified_result = tp.predict_sentiment(model, tokenizer, sentence)
        relative = abs(original_result - modified_result)
# OBACHT >=
        if relative >= highest_relative:
            highest_relative = relative
            highest_relative_word = word
            
    important_words.append(highest_relative_word)

assert(len(important_words)==len(sentence_packages))    
print(important_words)

['perfect.', 'bicycle!', 'some', 'cannot', 'in', 'expect', 'fantastic', 'superb', "couldn't", 'have', 'Materke.', 'riding.', 'perfect', 'again!.', 'as', 'already', 'very', 'options.', 'safe', '(and', 'enough', 'confusion', 'all', 'impossible...they', 'late.', 'city.', 'busy', 'ideal.', 'worth', 'Preto.', 'bathrooms', 'Great', 'Staff', '&', 'work.', 'eggs.', 'Wonderful', 'wait', 'star', 'fabulous', 'included.', 'welcome,', 'and', 'damage.', 'floors.', 'bed.', 'garden.', 'no', 'gratefully.', 'if', 'wanted', 'castle.', 'friendly', 'coffee.', 'again', 'wanted', 'castle.', 'friendly', 'coffee.', 'again', 'and', 'process', 'woman', 'and', 'excellent', 'recommend', 'high', 'offer.', 'but', 'although', 'serviceable.', 'bit', 'never', 'slightly', 'find.', 'located.', 'and', 'disappointed!!', 'rooms!!', 'and', 'comfortable', 'Stayed', 'hard', 'so', 'here.', 'and', 'grounds', 'of', 'Basically', 'discordant', 'average.', 'disappointing', 'comfortable,friendly', 'beautiful.', 'special.', 'very', 't

In [29]:
%store important_words
%store sentence_packages

Stored 'important_words' (list)
Stored 'sentence_packages' (list)
