**Libraries and dataset importation**

In [43]:
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
import time
import pandas as pd
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk import WordNetLemmatizer
import re

In [58]:
reviews_df = pd.read_csv("../data/tripadvisor_hotel_reviews.csv")
reviews_df['Review'] = reviews_df['Review'].apply(lambda r: r.strip())
reviews_df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


**Remove objective sentences with textblob**

In [59]:
def remove_objective_sentences_textblob(reviews):
    """
    methode to remove objective sentences from reviews.

    :return:
    """
    for index, review in enumerate(reviews):
        sentences = sent_tokenize(review)

        for index_s, sentence in enumerate(sentences):
            subjective_score = TextBlob(sentence).subjectivity
            if subjective_score < 0.4:
                del sentences[index_s]

        reviews[index] = " ".join(sentences)
    return reviews

In [76]:
start_time = time.time()
cleaned_reviews_textblob = remove_objective_sentences_textblob(reviews_df["Review"].tolist())
print("--- %s seconds ---" % (time.time() - start_time))

--- 18.47479271888733 seconds ---


In [86]:
reviews_df["Cleaned_reviews_textblob"] = cleaned_reviews_textblob

**Proposed method in the paper**

In [37]:
__lemmatizer = WordNetLemmatizer()
def __get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

def __get_word_sentiment(word, tag):
    global __lemmatizer
    wn_tag = __get_wordnet_pos(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []
    
    lemma = __lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []

    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return [swn_synset.pos_score(), swn_synset.neg_score(), swn_synset.obj_score()]

def __count_pos_neg_words(words):
    pos_score, neg_score = (0, 0)
    for word, pos_tag in nltk.pos_tag(words):
        sentiment = __get_word_sentiment(word, pos_tag)
        if len(sentiment) == 0:
            continue
        elif sentiment[0] > 0.1:
            neg_score += sentiment[0]
        elif sentiment[1] > 0.1:
            neg_score += sentiment[1]

    return pos_score, neg_score

def remove_objective_sentences_paper_method(__reviews):
    """
    methode to remove objective sentences from reviews.

    :return:
    """
    for index, review in enumerate(__reviews):
        sentences = sent_tokenize(review)

        for index_s, sentence in enumerate(sentences):
            words = re.findall("[\w'']+", sentence)
            pos_score, neg_score = __count_pos_neg_words(words)
            count_words = len(words)
            subjective_score = (pos_score + neg_score)/count_words
            if subjective_score < 0.4:
                del sentences[index_s]

        __reviews[index] = " ".join(sentences)
    return __reviews

In [88]:
start_time = time.time()
cleaned_reviews_paper_method = remove_objective_sentences_paper_method(reviews_df["Review"].tolist())
print("--- %s seconds ---" % (time.time() - start_time))

--- 196.27033042907715 seconds ---


In [89]:
reviews_df["Cleaned_reviews_paper_method"] = cleaned_reviews_paper_method

In [97]:
reviews_df

Unnamed: 0,Review,Rating,Cleaned_reviews_textblob,Cleaned_reviews_paper_method
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,decided book mediterranean suite 3 night weeke...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...",
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...",
...,...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5,"best kept secret 3rd time staying charm, not 5...",
20487,great location price view hotel great quick pl...,4,great location price view hotel great quick pl...,
20488,"ok just looks nice modern outside, desk staff ...",2,"ok just looks nice modern outside, desk staff ...",
20489,hotel theft ruined vacation hotel opened sept ...,1,hotel theft ruined vacation hotel opened sept ...,


In [96]:
print(f"number of null reviews after cleaning with paper method : {reviews_df[reviews_df['Cleaned_reviews_paper_method'] == '']['Cleaned_reviews_paper_method'].count()}")

number of null reviews after cleaning with paper method : 20171


**Test on real review from tripadvisor**

**First review**

We spend two nights here during our Morocco tour. The place is amazing. We really enjoyed our time here , friendly people , nice restaurant and very peaceful.

In [102]:
remove_objective_sentences_textblob(["We spend two nights here during our Morocco tour. The place is amazing. We really enjoyed our time here , friendly people , nice restaurant and very peaceful."])

['The place is amazing. We really enjoyed our time here , friendly people , nice restaurant and very peaceful.']

In [103]:
remove_objective_sentences_paper_method(["We spend two nights here during our Morocco tour. The place is amazing. We really enjoyed our time here , friendly people , nice restaurant and very peaceful."])

['The place is amazing.']

**Second review**

This hotel is outstanding. The staff is friendly and attentive. It’s a great location that is quiet yet close to the subway to get anywhere you need to go quickly and it feels very safe. The price is right too! Highly recommend and would stay again. Thanks so much to the entire staff for making our stay amazing!

In [115]:
remove_objective_sentences_textblob(["This hotel is outstanding. The staff is friendly and attentive. It’s a great location that is quiet yet close to the subway to get anywhere you need to go quickly and it feels very safe. The price is right too! Highly recommend and would stay again. Thanks so much to the entire staff for making our stay amazing!"])

['This hotel is outstanding. The staff is friendly and attentive. It’s a great location that is quiet yet close to the subway to get anywhere you need to go quickly and it feels very safe. The price is right too! Highly recommend and would stay again. Thanks so much to the entire staff for making our stay amazing!']

In [108]:
remove_objective_sentences_paper_method(["This hotel is outstanding. The staff is friendly and attentive. It’s a great location that is quiet yet close to the subway to get anywhere you need to go quickly and it feels very safe. The price is right too! Highly recommend and would stay again. Thanks so much to the entire staff for making our stay amazing!"])

['The staff is friendly and attentive. The price is right too! Thanks so much to the entire staff for making our stay amazing!']

In [119]:
__get_word_sentiment("outstanding", "J")

[0.75, 0.0, 0.25]

In [120]:
0.75/4 # less than 0.4

0.1875