In [1]:
# Three plain sentences, each terminated by exactly one period.
feedback_simple = (
    'The schnitzel tastes good. '
    'The soup was too hot. '
    'The waiter was quick and polite.'
)

In [2]:
# Naive sentence splitting: cut the text at every period.
sentences = feedback_simple.split('.')

# Show one fragment per output line.
print(*sentences, sep='\n')

The schnitzel tastes good
 The soup was too hot
 The waiter was quick and polite



In [3]:
# Naive word splitting: cut the first fragment at every space.
words = sentences[0].split(' ')
print(*words, sep='\n')

The
schnitzel
tastes
good


In [4]:
# Multi-line feedback in which not every period ends a sentence:
# the abbreviation "e.g." contains two of them.
feedback_rude = '''The waiter was very rude, 
e.g. when I accidentally opened the wrong door
he screamed "Private!".'''

In [5]:
# Same naive period split as before; it also cuts inside "e.g.".
sentences = feedback_rude.split('.')

# Show one fragment per output line.
print(*sentences, sep='\n')

The waiter was very rude, 
e
g
 when I accidentally opened the wrong door
he screamed "Private!"



In [6]:
import spacy

# Load spaCy's English pipeline; every later cell goes through ``nlp_en``.
# NOTE(review): the short name 'en' presumably relies on a spaCy 2.x shortcut
# link; spaCy 3 expects a full package name such as 'en_core_web_sm' - confirm
# against the installed version before changing this.
nlp_en = spacy.load('en')

In [7]:
# Run the pipeline on the simple feedback; the resulting document offers
# its sentences through ``sents``.
document = nlp_en(feedback_simple)

# Each printed sentence keeps its closing period.
for sentence in document.sents:
    print(sentence)

The schnitzel tastes good.
The soup was too hot.
The waiter was quick and polite.


In [8]:
# Same text that the naive split cut apart at "e.g."; processed by the
# pipeline, the abbreviation stays intact in the printed output.
document_rude = nlp_en(feedback_rude)

for sentence in document_rude.sents:
    print(sentence)

The waiter was very rude, 
e.g. when I accidentally opened the wrong door
he screamed "Private!".


In [9]:
# ``sents`` is consumed with next(), so this takes the first sentence only.
first_sent = next(document.sents)

# Iterating a sentence yields its tokens; the closing period is a token too.
for word in first_sent:
    print(word)

The
schnitzel
tastes
good
.


In [10]:
# Tokens of a sentence can be addressed by position; index 2 is "tastes".
tastes_token = first_sent[2]

print(tastes_token)

tastes


In [11]:
# Lemma: the basic (dictionary) form of the word, here 'taste' for "tastes".
tastes_token.lemma_

'taste'

In [12]:
# "Part of speech" = role of the word in the sentence, as a string.
tastes_token.pos_

'VERB'

In [13]:
# Same value as above; print() shows the string without quotes.
print(tastes_token.pos_)

VERB


In [14]:
# Without the trailing underscore the attribute is an integer ID
# instead of the string name.
tastes_token.pos

99

In [15]:
from spacy.symbols import ADJ, NOUN, VERB

# The imported symbol is an integer: the same value ``tastes_token.pos`` showed.
print(VERB)

99


In [16]:
# Every part-of-speech symbol has its own integer ID.
print(NOUN)

91


In [17]:
from spacy.symbols import NAMES

# NAMES maps a symbol ID back to its name. Index with the imported VERB
# constant rather than a literal ID, which could differ between spaCy versions.
print(NAMES[VERB])

VERB


In [18]:
from spacy.symbols import IDS

# IDS is the reverse mapping: symbol name -> integer ID.
print(IDS['VERB'])

99


In [19]:
from enum import Enum

class Topic(Enum):
    """
    What a piece of feedback is about.

    The lexicon assigns topics to words, e.g. 'waiter' to ``SERVICE``.
    """
    AMBIENCE = 1
    FOOD = 2
    HYGIENE = 3
    SERVICE = 4
    VALUE = 5

In [20]:
class Rating(Enum):
    """
    How good or bad something is judged.

    Negative values are bad, positive values are good, and a larger absolute
    value means a stronger opinion. There is deliberately no member for 0,
    so ``Rating(0)`` raises ``ValueError``.
    """
    VERY_BAD = -3
    BAD = -2
    SOMEWHAT_BAD = -1
    SOMEWHAT_GOOD = 1
    GOOD = 2
    VERY_GOOD = 3

In [21]:
import re
from spacy.tokens import Token

In [22]:
class LexiconEntry:
    """
    A lemma (or regular expression) together with the topic and/or rating
    it stands for.

    ``lemma`` is treated as a regular expression if it contains any of the
    characters ``.``, ``+``, ``*``, ``[``, ``$``, ``^`` or a backslash;
    otherwise it is compared literally. ``topic`` and ``rating`` may each be
    ``None``.
    """

    # Matches any text that contains at least one regex meta character.
    _IS_REGEX_REGEX = re.compile(r'.*[.+*\[$^\\]')

    def __init__(self, lemma: str, topic: Topic, rating: Rating):
        assert lemma is not None
        self.lemma = lemma
        # Lower-cased once so the case-insensitive comparisons in
        # matching() do not have to recompute it.
        self._lower_lemma = lemma.lower()
        self.topic = topic
        self.rating = rating
        self.is_regex = bool(LexiconEntry._IS_REGEX_REGEX.match(self.lemma))
        # Only compiled for regex entries; ``None`` for literal entries.
        self._regex = re.compile(lemma, re.IGNORECASE) if self.is_regex else None

    def matching(self, token: Token) -> float:
        """
        A weight between 0.0 and 1.0 on how much ``token`` matches this entry.

        Literal entries: 1.0 exact text, 0.9 text ignoring case, 0.8 exact
        lemma, 0.7 lemma ignoring case. Regex entries: 0.6 if the pattern
        matches at the start of the text, 0.5 if it matches at the start of
        the lemma. 0.0 if nothing matches.
        """
        assert token is not None
        result = 0.0
        if self.is_regex:
            if self._regex.match(token.text):
                result = 0.6
            elif self._regex.match(token.lemma_):
                result = 0.5
        else:
            if token.text == self.lemma:
                result = 1.0
            # Compare lower-case to lower-case; comparing against the raw
            # lemma would never match an entry containing upper-case letters.
            elif token.text.lower() == self._lower_lemma:
                result = 0.9
            elif token.lemma_ == self.lemma:
                result = 0.8
            elif token.lemma_.lower() == self._lower_lemma:
                result = 0.7
        return result

    def __str__(self) -> str:
        """Compact description that lists only the attributes that are set."""
        parts = [self.lemma]
        if self.topic is not None:
            parts.append('topic=%s' % self.topic.name)
        if self.rating is not None:
            parts.append('rating=%s' % self.rating.name)
        if self.is_regex:
            parts.append('is_regex=%s' % self.is_regex)
        return 'LexiconEntry(%s)' % ', '.join(parts)

    def __repr__(self) -> str:
        return self.__str__()

In [31]:
from math import isclose

class Lexicon:
    """
    An ordered collection of ``LexiconEntry`` objects that can be searched
    for the entry that best matches a token.
    """

    def __init__(self):
        # Entries in the order they were appended.
        self.entries = []

    def append(self, lemma: str, topic: Topic, rating: Rating):
        """Build an entry from the arguments and add it at the end."""
        self.entries.append(LexiconEntry(lemma, topic, rating))

    def lexicon_entry_for(self, token: Token) -> LexiconEntry:
        """
        Entry in lexicon that best matches ``token``.

        Returns ``None`` if no entry matches with a weight above 0.0. When
        several entries share the best weight, the earliest one wins. The
        search stops as soon as a weight of 1.0 is reached.
        """
        best_entry = None
        best_weight = 0.0
        for candidate in self.entries:
            weight = candidate.matching(token)
            if weight > best_weight:
                best_entry = candidate
                best_weight = weight
                if isclose(best_weight, 1.0):
                    # A perfect match cannot be improved on.
                    break
        return best_entry

In [32]:
# Demo lexicon: each word contributes a topic, a rating, or both.
lexicon = Lexicon()
lexicon.append(lemma='waiter', topic=Topic.SERVICE, rating=None)
lexicon.append(lemma='waitress', topic=Topic.SERVICE, rating=None)
lexicon.append(lemma='wait', topic=None, rating=Rating.BAD)
lexicon.append(lemma='quick', topic=None, rating=Rating.GOOD)
# Contains regex meta characters, so it is matched as a pattern.
lexicon.append(lemma='.*schnitzel', topic=Topic.FOOD, rating=None)
lexicon.append(lemma='music', topic=Topic.AMBIENCE, rating=None)
lexicon.append(lemma='loud', topic=None, rating=Rating.BAD)
lexicon.append(lemma='tasty', topic=Topic.FOOD, rating=Rating.GOOD)
lexicon.append(lemma='polite', topic=Topic.SERVICE, rating=Rating.GOOD)

In [35]:
feedback_text = 'The music was very loud.'
feedback = nlp_en(feedback_text)

# Look up every token of the first sentence in the lexicon.
# NOTE(review): the lookup result is not shown here - only the token itself
# is printed; the next cell prints what was found.
for token in next(feedback.sents):
    lexicon_entry = lexicon.lexicon_entry_for(token)
    print(token)

The
music
was
very
loud
.


In [37]:
feedback_text = 'The music was very loud.'
feedback = nlp_en(feedback_text)

# For each sentence, print it followed by the indented topic and/or rating
# of every token that has a lexicon entry.
for sent in feedback.sents:
    print(sent)
    for token in sent:
        lexicon_entry = lexicon.lexicon_entry_for(token)
        # Tokens without a lexicon entry contribute nothing.
        if lexicon_entry is not None:
            if lexicon_entry.topic is not None:
                print('    ', lexicon_entry.topic)
            if lexicon_entry.rating is not None:
                print('    ', lexicon_entry.rating)

The music was very loud.
     Topic.AMBIENCE
     Rating.BAD


In [38]:
# Lemmas that strengthen a rating.
INTENSIFIERS = {'really', 'terribly', 'very'}


def is_intensifier(token: Token) -> bool:
    """``True`` if the lower-cased lemma of ``token`` is in ``INTENSIFIERS``."""
    lowered_lemma = token.lemma_.lower()
    return lowered_lemma in INTENSIFIERS


# Lemmas that weaken a rating.
DIMINISHERS = {'barely', 'slightly', 'somewhat'}


def is_diminisher(token: Token) -> bool:
    """``True`` if the lower-cased lemma of ``token`` is in ``DIMINISHERS``."""
    lowered_lemma = token.lemma_.lower()
    return lowered_lemma in DIMINISHERS

In [40]:
# Token at index 3 of the first sentence of ``feedback_text``
# ('The music was very loud.'), i.e. "very".
very_token = next(nlp_en(feedback_text).sents)[3]

print(very_token)

very


In [41]:
# 'very' is listed in INTENSIFIERS, so this is True.
is_intensifier(very_token)

True

In [42]:
def signum(value) -> int:
    """Sign of ``value``: 1 if positive, -1 if negative, otherwise 0."""
    is_positive = int(value > 0)
    is_negative = int(value < 0)
    return is_positive - is_negative

# Numeric bounds of the rating scale.
_MIN_RATING_VALUE = Rating.VERY_BAD.value
_MAX_RATING_VALUE = Rating.VERY_GOOD.value


def _ranged_rating(rating_value: int) -> Rating:
    """
    The ``Rating`` for ``rating_value`` after clamping it to the scale.

    Values beyond either end map to VERY_BAD or VERY_GOOD. A value of 0 has
    no ``Rating`` member and raises ``ValueError``.
    """
    clamped_value = rating_value
    if clamped_value < _MIN_RATING_VALUE:
        clamped_value = _MIN_RATING_VALUE
    elif clamped_value > _MAX_RATING_VALUE:
        clamped_value = _MAX_RATING_VALUE
    return Rating(clamped_value)

def diminished(rating: Rating) -> Rating:
    """
    The rating one step closer to neutral, e.g. BAD -> SOMEWHAT_BAD.

    SOMEWHAT_* ratings are returned unchanged because there is no
    neutral rating to step to.
    """
    if abs(rating.value) <= 1:
        return rating
    return _ranged_rating(rating.value - signum(rating.value))

def intensified(rating: Rating) -> Rating:
    """
    The rating one step further away from neutral, e.g. BAD -> VERY_BAD
    and SOMEWHAT_GOOD -> GOOD.

    VERY_* ratings are returned unchanged because ``_ranged_rating`` clamps
    the value to the ends of the scale. No ``abs(...) > 1`` guard is needed
    here (unlike in ``diminished``): stepping away from neutral can never
    produce the non-existent rating 0, and such a guard would wrongly keep
    SOMEWHAT_* ratings from being intensified.
    """
    return _ranged_rating(rating.value + signum(rating.value))

# Quick check: BAD weakens to SOMEWHAT_BAD, SOMEWHAT_BAD cannot weaken
# further, and BAD strengthens to VERY_BAD.
print(diminished(Rating.BAD))
print(diminished(Rating.SOMEWHAT_BAD))
print(intensified(Rating.BAD))

Rating.SOMEWHAT_BAD
Rating.SOMEWHAT_BAD
Rating.VERY_BAD


In [43]:
# Lemmas that turn a rating into its opposite.
NEGATIONS = {'no', 'not', 'none'}


def is_negation(token: Token) -> bool:
    """``True`` if the lower-cased lemma of ``token`` is in ``NEGATIONS``."""
    lowered_lemma = token.lemma_.lower()
    return lowered_lemma in NEGATIONS

In [44]:
# Rating that results from negating each rating. Negation always flips the
# side (bad <-> good). The VERY_* ratings only become "somewhat" the opposite,
# while the plain and SOMEWHAT_* ratings become the plain opposite.
_RATING_TO_NEGATED_RATING_MAP = {
    Rating.VERY_BAD     : Rating.SOMEWHAT_GOOD,
    Rating.BAD          : Rating.GOOD,
    Rating.SOMEWHAT_BAD : Rating.GOOD,  # hypothetical?
    Rating.SOMEWHAT_GOOD: Rating.BAD,  # hypothetical?
    Rating.GOOD         : Rating.BAD,
    Rating.VERY_GOOD    : Rating.SOMEWHAT_BAD,
}

def negated_rating(rating: Rating) -> Rating:
    """
    The rating that ``rating`` turns into when it is negated, as defined
    by ``_RATING_TO_NEGATED_RATING_MAP``.
    """
    assert rating is not None
    opposite_rating = _RATING_TO_NEGATED_RATING_MAP[rating]
    return opposite_rating

# Quick check of two map entries.
print(Rating.GOOD, ' -> ', negated_rating(Rating.GOOD))
print(Rating.VERY_BAD, ' -> ', negated_rating(Rating.VERY_BAD))

Rating.GOOD  ->  Rating.BAD
Rating.VERY_BAD  ->  Rating.SOMEWHAT_GOOD


In [45]:
def _register_token_extension(name, default):
    """
    Register the custom token attribute ``name`` unless it already exists.

    Registering an existing extension again raises an error in spaCy, so the
    check keeps this cell re-runnable within the same kernel session.
    """
    if not Token.has_extension(name):
        Token.set_extension(name, default=default)


# Custom attributes, available on every token as ``token._.<name>``.
_register_token_extension('topic', None)
_register_token_extension('rating', None)
_register_token_extension('is_negation', False)
_register_token_extension('is_intensifier', False)
_register_token_extension('is_diminisher', False)

In [46]:
# Single-word document: take the only token of its first sentence.
token = next(nlp_en('schnitzel').sents)[0]
print(token.lemma_)
# Custom attributes live under ``token._`` and can be assigned by hand.
token._.topic = Topic.FOOD
print(token._.topic)

schnitzel
Topic.FOOD


In [47]:
def debugged_token(token: Token) -> str:
    """
    Human-readable description of ``token`` for debugging.

    Always shows text and lemma. Topic, rating and the modifier flags are
    appended only if they are set.
    """
    extensions = token._
    parts = ['Token(%s, lemma=%s' % (token.text, token.lemma_)]
    if extensions.topic is not None:
        parts.append(', topic=' + extensions.topic.name)
    if extensions.rating is not None:
        parts.append(', rating=' + extensions.rating.name)
    flag_labels = (
        (extensions.is_diminisher, ', diminisher'),
        (extensions.is_intensifier, ', intensifier'),
        (extensions.is_negation, ', negation'),
    )
    for is_set, label in flag_labels:
        if is_set:
            parts.append(label)
    parts.append(')')
    return ''.join(parts)

# ``token`` is the 'schnitzel' token whose topic was set by hand above.
print(debugged_token(token))

Token(schnitzel, lemma=schnitzel, topic=FOOD)


In [48]:
def opinion_matcher(doc):
    """
    Pipeline component that annotates the tokens of ``doc`` for opinion mining.

    Each token is flagged as an intensifier, diminisher or negation (checked
    in that order, first hit wins). Any other token gets the topic and
    rating of its best matching entry in the global ``lexicon``, if there
    is one. Returns the same ``doc``.
    """
    for sentence in doc.sents:
        for token in sentence:
            if is_intensifier(token):
                token._.is_intensifier = True
                continue
            if is_diminisher(token):
                token._.is_diminisher = True
                continue
            if is_negation(token):
                token._.is_negation = True
                continue
            # Not a modifier: consult the lexicon.
            matched_entry = lexicon.lexicon_entry_for(token)
            if matched_entry is None:
                continue
            token._.rating = matched_entry.rating
            token._.topic = matched_entry.topic
    return doc

In [49]:
# Keep the cell re-runnable: drop a previously added component of the same
# name before adding it again.
# NOTE(review): passing the bare function to add_pipe() looks like the
# spaCy 2.x API - confirm against the installed version.
if nlp_en.has_pipe('opinion_matcher'):
    nlp_en.remove_pipe('opinion_matcher')
nlp_en.add_pipe(opinion_matcher)

In [50]:
def is_essential(token: Token) -> bool:
    """
    Whether ``token`` carries opinion information: a topic, a rating or
    one of the modifier flags.
    """
    extensions = token._
    if extensions.topic is not None or extensions.rating is not None:
        return True
    return extensions.is_diminisher \
        or extensions.is_intensifier \
        or extensions.is_negation


def essential_tokens(tokens):
    """List of the essential tokens among ``tokens``, in their original order."""
    return list(filter(is_essential, tokens))


In [51]:
# The pipeline now includes opinion_matcher, so tokens arrive annotated.
document = nlp_en('The schnitzel is not very tasty.')

# Only the tokens that matter for the opinion remain.
opinion_essence = essential_tokens(document)
for token in opinion_essence:
    print(debugged_token(token))

Token(schnitzel, lemma=schnitzel, topic=FOOD)
Token(not, lemma=not, negation)
Token(very, lemma=very, intensifier)
Token(tasty, lemma=tasty, topic=FOOD, rating=GOOD)


In [52]:
def is_rating_modifier(token: Token):
    """
    Whether ``token`` is flagged as something that changes a rating:
    a diminisher, an intensifier or a negation.
    """
    extensions = token._
    return extensions.is_diminisher \
        or extensions.is_intensifier \
        or extensions.is_negation

In [53]:
def combine_ratings(tokens):
    """
    Fold the modifiers directly left of the first rated token into its rating.

    ``tokens`` is a list and is modified in place: starting next to the
    first token that has a rating and moving left, every intensifier,
    diminisher or negation is applied to the rating and then removed from
    the list. The walk stops at the first token that is not a modifier.
    The combined rating is stored on the rated token. Nothing happens if
    no token has a rating.
    """
    # Find the first rating (if any).
    rating_token_index = None
    for candidate_index, candidate in enumerate(tokens):
        if candidate._.rating is not None:
            rating_token_index = candidate_index
            break
    if rating_token_index is None:
        return

    rated_token = tokens[rating_token_index]
    combined_rating = rated_token._.rating
    left_index = rating_token_index - 1
    while left_index >= 0:
        left_token = tokens[left_index]
        if is_intensifier(left_token):
            combined_rating = intensified(combined_rating)
        elif is_diminisher(left_token):
            combined_rating = diminished(combined_rating)
        elif is_negation(left_token):
            combined_rating = negated_rating(combined_rating)
        else:
            # No more modifiers to the left of this rating.
            break
        # Discard the applied modifier and move on to the token on the left.
        del tokens[left_index]
        left_index -= 1
    rated_token._.rating = combined_rating

In [54]:
document = nlp_en('The schnitzel is not very tasty.')

# Before: the modifiers "not" and "very" are separate tokens.
opinion_essence = essential_tokens(document)
print('essential tokens:')
for token in opinion_essence:
    print('  ', debugged_token(token))

# After: combine_ratings() changed the list in place - the modifiers are
# gone and their effect is part of the rating of "tasty".
combine_ratings(opinion_essence)
print('combined tokens:')
for token in opinion_essence:
    print('  ', debugged_token(token))

essential tokens:
   Token(schnitzel, lemma=schnitzel, topic=FOOD)
   Token(not, lemma=not, negation)
   Token(very, lemma=very, intensifier)
   Token(tasty, lemma=tasty, topic=FOOD, rating=GOOD)
combined tokens:
   Token(schnitzel, lemma=schnitzel, topic=FOOD)
   Token(tasty, lemma=tasty, topic=FOOD, rating=SOMEWHAT_BAD)


In [55]:
from typing import List, Tuple  # for fancy type hints

def topic_and_rating_of(tokens: List[Token]) -> Tuple[Topic, Rating]:
    """
    The first topic and the first rating found among ``tokens``.

    The tokens are reduced to the essential ones and their ratings are
    combined with any modifiers before the search. Either element of the
    result is ``None`` if no token provides it.
    """
    opinion_essence = essential_tokens(tokens)
    combine_ratings(opinion_essence)
    result_topic = next(
        (token._.topic for token in opinion_essence if token._.topic is not None),
        None
    )
    result_rating = next(
        (token._.rating for token in opinion_essence if token._.rating is not None),
        None
    )
    return result_topic, result_rating

# Try it on the first (and only) sentence of the example feedback.
sentence = next(nlp_en('The schnitzel is not very tasty.').sents)

print(sentence)
print(topic_and_rating_of(sentence))

The schnitzel is not very tasty.
(<Topic.FOOD: 2>, <Rating.SOMEWHAT_BAD: -1>)


In [57]:
def opinions(feedback_text: str):
    """
    Generate one ``(topic, rating)`` tuple per sentence of ``feedback_text``.

    Either element is ``None`` if the sentence does not provide it.
    """
    for sentence_tokens in nlp_en(feedback_text).sents:
        yield topic_and_rating_of(sentence_tokens)

In [58]:
# Two sentences with an opinion and one without any lexicon words.
feedback_text = """
The schnitzel was not very tasty. 
The waiter was polite.
The football game ended 2:1."""

# A sentence without a recognisable opinion yields (None, None).
for topic, rating in opinions(feedback_text):
    print(topic, rating)

Topic.FOOD Rating.SOMEWHAT_BAD
Topic.SERVICE Rating.GOOD
None None
