In [1]:
%load_ext autoreload
%autoreload 2

## Evaluation de la classification

### Importation des bibliothèques

In [2]:
import os
os.chdir("..")

In [3]:
import re
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
from collections import Counter

In [4]:
tagged_hotels_corpus = open("data/tagged_hotels_corpus.txt", "r")

In [5]:
tagged_hotels_corpus.seek(3)
current_review_number = 1
review_text = []
temp_review_text = ""
for line in tagged_hotels_corpus.readlines():
    splited = line.split("###")
    sentence_text=splited[1]
    sentence_meta_data = splited[0]
    meta_data_items = sentence_meta_data.strip().split(" ")
    regex_groups = re.search(r'\[c([0-9]+)\]\[s([0-9]+)\]', meta_data_items[0]).groups()
    review_id = int(regex_groups[0])
    sentence_number = int(regex_groups[1])
    if current_review_number == review_id:
        temp_review_text += " " + sentence_text
    elif review_id == current_review_number+1:
        review_text.append(temp_review_text)
        temp_review_text = sentence_text
        current_review_number = review_id
# add add the last review
review_text.append(temp_review_text)

In [6]:
df = pd.DataFrame(review_text, columns=["review"])
df

Unnamed: 0,review
0,this is quite the most delightful hotel i ha...
1,the actual hotel accomodations were very luxu...
2,"very good, but i am going back and won't stay..."
3,"we stayed here for one night, in december 201..."
4,the family that runs this hostel are wonderfu...
...,...
95,we arrived really late on friday evening and ...
96,bed and breakfast/ hostel style accommodation...
97,we stayed at cabanas del lago for one night f...
98,we found this place on the internet and staye...


In [7]:
# Load opinion lexicon
neg_file = open("data/opinion-lexicon-English/negative_words.txt",encoding = "ISO-8859-1")
pos_file = open("data/opinion-lexicon-English/positive_words.txt",encoding = "ISO-8859-1")
neg = [line.strip() for line in neg_file.readlines()]
pos = [line.strip() for line in pos_file.readlines()]
opinion_words = neg + pos

In [8]:
def feature_sentiment(sentence):
    '''
    input: dictionary and sentence
    function: appends dictionary with new features if the feature did not exist previously,
              then updates sentiment to each of the new or existing features
    output: updated dictionary
    '''

    sent_dict = Counter()
    debug = 0
    for token in sentence:
    #    print(token.text,token.dep_, token.head, token.head.dep_)
        # check if the word is an opinion word, then assign sentiment
        if token.text in opinion_words:
            sentiment = 1 if token.text in pos else -1
            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            if (token.dep_ == "advmod"):
                continue
            elif (token.dep_ == "amod"):
                for child in token.children:
                    # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
                    # This could be better updated for modifiers that either positively or negatively emphasize
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if child.dep_ == "neg":
                        sentiment *= -1
                sent_dict[token.head.lemma_.lower()] += sentiment
            # for opinion words that are adjectives, adverbs, verbs...
            else:
                for child in token.children:
                    # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
                    # This could be better updated for modifiers that either positively or negatively emphasize
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if child.dep_ == "neg":
                        sentiment *= -1
                for child in token.children:
                    # if verb, check if there's a direct object
                    if (token.pos_ == "VERB") & (child.dep_ == "dobj"):
                        sent_dict[child.lemma_.lower()] += sentiment
                        # check for conjugates (a AND b), then add both to dictionary
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj=1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.lemma_.lower())
                                conj = 0
                        for subchild in subchildren:
                            sent_dict[subchild] += sentiment

                # check for negation
                for child in token.head.children:
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if (child.dep_ == "neg"):
                        sentiment *= -1
                
                # check for nouns
                for child in token.head.children:
                    noun = ""
                    if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                        noun = child.lemma_.lower()
                        # Check for compound nouns
                        for subchild in child.children:
                            if subchild.dep_ == "compound":
                                noun = subchild.lemma_.lower() + " " + noun
                        sent_dict[noun] += sentiment
                    debug += 1
    return sent_dict

**Extraction des aspects et leurs sentiment avec les aspects réels et leurs sentiment**

In [9]:
tagged_hotels_corpus.seek(3)
current_review_number = 1
review_text = []
temp_review_text = ""

global_true_apsects = []
global_predicted_aspects = []

for line in tagged_hotels_corpus.readlines():
    splited = line.split("###")
    sentence_text=splited[1]
    sentence_meta_data = splited[0]
    aspects_metadata = splited[0].strip().split(" ", maxsplit=1)
    true_aspects = []
    if len(aspects_metadata) > 1:
        aspects_metadata = aspects_metadata[1].split(",")
        for aspect in aspects_metadata:
            if "[p]" in aspect or "[u]" in aspect or aspect == "" :
                continue
            else:
                true_aspects.append(aspect.strip())
    extracted_aspects_sentiment = dict(feature_sentiment(nlp(sentence_text)))
    print(true_aspects)
    predicted_aspects = []
    for __aspect in extracted_aspects_sentiment.keys():
        __sentiment = extracted_aspects_sentiment[__aspect]
        if __sentiment >= 0:
            predicted_aspects.append(f"{__aspect}[+]")
        else:
            predicted_aspects.append(f"{__aspect}[-]")
    global_true_apsects.append(true_aspects)
    global_predicted_aspects.append(predicted_aspects)
    print(true_aspects, predicted_aspects)
    print(sentence_text)
    print("=================")

['hotel[+]']
['hotel[+]'] ['hotel[+]']
 this is quite the most delightful hotel i have stayed in for a very long time.

['driveway[+]', 'entrance[+]', 'staff[+]']
['driveway[+]', 'entrance[+]', 'staff[+]'] ['driveway[+]']
 charming and eventful driveway and entrance and very welcoming staff.

['owner[+]']
['owner[+]'] []
 the owner is on hand much of the time to offer help and advice.

['room[+]', 'view lake[+]', 'view[+]']
['room[+]', 'view lake[+]', 'view[+]'] ['room[+]', 'view[+]']
 the rooms are great and all have a decent view of the lake.

['room[+]', 'aspect[+]']
['room[+]', 'aspect[+]'] ['aspect[+]']
 the public rooms have superb aspects of the surrounding country.

['garden[+]', 'pool[+]', 'lakeside[+]']
['garden[+]', 'pool[+]', 'lakeside[+]'] []
 the gardens are beautifully kept and the pool and lakeside are so inviting.

['dinning[+]', 'wine choice[+]', 'wine[+]']
['dinning[+]', 'wine choice[+]', 'wine[+]'] ['dining[+]', 'choice[+]', 'wine[+]']
 excellent dining and good cho

In [46]:
from spacy import displacy

In [48]:
displacy.render(nlp("!, but whats really horrible is the way the staff treat tourists, rud, bad face and very impolite!!"))

In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

In [13]:
multilabel_binaraizer = MultiLabelBinarizer()
multilabel_binaraizer = multilabel_binaraizer.fit(global_true_apsects)

In [15]:
b_global_true_apsects = multilabel_binaraizer.transform(global_true_apsects)

In [16]:
b_global_predicted_aspects = multilabel_binaraizer.transform(global_predicted_aspects)



In [33]:
from sklearn.metrics import classification_report

In [45]:
print(classification_report(y_pred=b_global_predicted_aspects, y_true=b_global_true_apsects, target_names=tt))

                             precision    recall  f1-score   support

                     A/C[-]       0.00      0.00      0.00         1
            accomodation[+]       1.00      0.67      0.80         3
        air-conditioning[-]       0.00      0.00      0.00         1
                 amenity[+]       0.00      0.00      0.00         1
                  animal[+]       0.00      0.00      0.00         1
               apartment[+]       1.00      1.00      1.00         1
                  aspect[+]       0.50      1.00      0.67         1
              atmosphere[+]       1.00      1.00      1.00         1
               attendant[+]       0.00      0.00      0.00         1
               attendant[-]       0.00      0.00      0.00         1
               attention[+]       0.00      0.00      0.00         1
               attention[-]       0.00      0.00      0.00         1
               back room[+]       0.00      0.00      0.00         1
               back room[-]      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
