In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to the project packages are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
#ignore pandas warning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
import os

# The later cells import the `Aspects` package and read `data/...` relative to
# the working directory, so the notebook has to run from the folder that
# contains them (presumably the project root, one level above this notebook).
# Only step up when `Aspects` is not already visible: an unconditional
# os.chdir("..") climbs one more level every time this cell is re-run.
if not os.path.isdir("Aspects"):
    os.chdir("..")

In [5]:
import pandas as pd
from Aspects.ExplicitAspectExtractor import ExplicitAspectExtractor
from Preprocessors.ReviewPreprocessor import ReviewPreprocessor
from Aspects.ImplicitAspectExtractor import ImplicitAspectExtractor
from Aspects.CoRefAspectIdentGrouping import CoRefAspectIdentGrouping
import spacy
# Load the "en_core_web_sm" spaCy pipeline once; the same `nlp` object is
# handed to the preprocessor, the extractor, the groupers and the matcher below.
nlp = spacy.load("en_core_web_sm")

In [17]:
# Read the TripAdvisor review CSV (the file is decoded as UTF-16) and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index -- confirm whether it should be read with index_col=0.
data = pd.read_csv("data/trip_advisor_data_chunk_10000k.csv", encoding="utf-16")
data.head()

Unnamed: 0,hotel_url,author,date,rating,title,review
0,Hotel_Review-g194775-d1121769-Reviews-Hotel_Ba...,Lagaiuzza,2016-01-01T00:00:00,5.0,"Baltic, what else?",We have spent in this hotel our summer holiday...
1,Hotel_Review-g194775-d1121769-Reviews-Hotel_Ba...,ashleyn763,2014-10-01T00:00:00,5.0,Excellent in every way!,I visited Hotel Baltic with my husband for som...
2,Hotel_Review-g194775-d1121769-Reviews-Hotel_Ba...,DavideMauro,2014-08-01T00:00:00,5.0,The house of your family's holiday,I've travelled quite a numbers of hotels but t...
3,Hotel_Review-g303503-d1735469-Reviews-Pousada_...,TwoMonkeysTravel,2017-03-01T00:00:00,5.0,Natural Luxury,"The property is surrounded by trees, which are..."
4,Hotel_Review-g303503-d1735469-Reviews-Pousada_...,analuizade,2016-09-01T00:00:00,5.0,Very cozy!,I had a very pleasant stay at this hotel! All ...


In [18]:
# Build the preprocessor on the raw review column. The third argument is a list
# of extra words ("rif", "riad", "dar") -- presumably domain vocabulary the
# preprocessor must treat specially; TODO confirm against ReviewPreprocessor.
preprocessor = ReviewPreprocessor(data["review"], nlp, ["rif", "riad", "dar"])
# Each step's return value overwrites the same `cleaned_review` column, so only
# the result of the last call is kept in `data`.
# NOTE(review): this assumes the preprocessor applies the steps cumulatively on
# its own internal state -- verify, otherwise the first two calls have no effect.
data['cleaned_review'] = preprocessor.remove_tags()
data["cleaned_review"] = preprocessor.lowercase_transformation()
data['cleaned_review'] = preprocessor.remove_objective_sentences()

3000it [01:40, 29.80it/s]


In [19]:
# Run explicit-aspect extraction over the cleaned review text.
# NOTE(review): threshold=50 is presumably a minimum-frequency cut-off for
# keeping an aspect -- confirm against ExplicitAspectExtractor.start.
explicit_aspects_extractor = ExplicitAspectExtractor(data["cleaned_review"], nlp)
explicit_aspects = explicit_aspects_extractor.start(threshold=50)

In [None]:
# First grouper: receives the full DataFrame, the explicit aspects converted to
# a plain dict, and the shared spaCy pipeline. Its embedding model is whatever
# CoRefAspectIdentGrouping sets up by itself (not overridden here).
# NOTE(review): this cell has no execution count, yet `co_ref_id_grop` is used
# by the next code cell -- re-run it on a fresh kernel before trusting the output.
co_ref_id_grop = CoRefAspectIdentGrouping(data, dict(explicit_aspects), nlp)

### Ancien modèle.

L'ancien modèle est entraîné sur 19'472 phrases.

In [35]:
# Group the aspects with the first grouper and display the resulting mapping
# (group name -> list of member aspects).
# NOTE(review): threshold=0.67 is presumably a similarity cut-off -- confirm
# against CoRefAspectIdentGrouping.get_co_reference_aspects_groups.
co_ref_aspects = co_ref_id_grop.get_co_reference_aspects_groups(threshold=0.67)
co_ref_aspects

{'night': ['night'],
 'door': ['door'],
 'restaurant': ['restaurant'],
 'room': ['bathroom', 'water', 'bed', 'shower', 'room'],
 'location': ['city', 'location'],
 'pool': ['pool'],
 'bit': ['bit'],
 'parking': ['parking'],
 'hotel': ['riad', 'hotel', 'place', 'resort'],
 'breakfast': ['choice',
  'dinner',
  'meal',
  'breakfast',
  'morning',
  'food',
  'evening',
  'coffee',
  'drink'],
 'year': ['year'],
 'way': ['way'],
 'experience': ['experience', 'family', 'kid'],
 'thing': ['thing'],
 'floor': ['floor'],
 'stay': ['stay'],
 'time': ['day', 'time'],
 'minute': ['minute'],
 'price': ['price'],
 'bar': ['bar'],
 'lot': ['lot'],
 'beach': ['beach'],
 'desk': ['reception', 'desk'],
 'trip': ['trip'],
 'view': ['view'],
 'staff': ['staff'],
 'people': ['guest', 'people'],
 'area': ['area'],
 'service': ['service']}

### Nouveau modèle.

Le nouveau modèle Word2Vec est entraîné sur 90’000 phrases.

In [3]:
from gensim.models import Word2Vec

In [13]:
# Second grouper over the same inputs; its `model_wv` attribute is then replaced
# with a Word2Vec model loaded from a local file.
# NOTE(review): Word2Vec.load unpickles the file -- only load model files that
# come from a trusted source.
# NOTE(review): the attribute name suggests word vectors, while the full
# Word2Vec model is assigned -- confirm what the grouper expects.
co_ref_id_grop2 = CoRefAspectIdentGrouping(data, dict(explicit_aspects), nlp)
co_ref_id_grop2.model_wv = Word2Vec.load("90K_model_sg_hs_10.pkl")

In [26]:
# Group the aspects with the second grouper and display the mapping.
# This rebinds `co_ref_aspects`, so the result obtained with the first grouper
# is no longer available under that name after this cell runs.
# The threshold used here (0.45) differs from the 0.67 used with the first grouper.
co_ref_aspects = co_ref_id_grop2.get_co_reference_aspects_groups(threshold=0.45)
co_ref_aspects

{'view': ['view'],
 'guest': ['guest'],
 'breakfast': ['food',
  'meal',
  'dinner',
  'evening',
  'day',
  'drink',
  'time',
  'night',
  'restaurant',
  'bar',
  'breakfast',
  'morning'],
 'pool': ['pool'],
 'thing': ['thing'],
 'parking': ['parking'],
 'door': ['door'],
 'room': ['bed', 'bathroom', 'room', 'shower'],
 'beach': ['beach', 'resort'],
 'minute': ['minute'],
 'stay': ['experience', 'trip', 'stay'],
 'bit': ['bit'],
 'staff': ['service', 'staff'],
 'people': ['people'],
 'hotel': ['riad', 'hotel', 'location', 'place'],
 'way': ['way'],
 'price': ['price'],
 'desk': ['desk', 'reception'],
 'water': ['water'],
 'family': ['family'],
 'kid': ['kid'],
 'floor': ['floor'],
 'year': ['year'],
 'area': ['area'],
 'lot': ['lot'],
 'choice': ['choice'],
 'coffee': ['coffee'],
 'city': ['city']}

**Remarques:**
 
- Les deux modèles donnent presque les mêmes résultats, notamment pour les groupes qui contiennent au moins deux aspects, comme breakfast, room, hotel et desk.
- Dans le groupe breakfast, nous remarquons la présence des aspects evening et morning ; cela est dû à la similarité entre l'aspect dinner et le moment evening, ainsi qu'à la similarité entre morning et breakfast.

In [18]:
from spacy.matcher import Matcher
# Single-token pattern: matches any token whose coarse part-of-speech tag is ADJ.
pattern = [{"POS": "ADJ"}]
# The matcher shares the vocabulary of the already-loaded `nlp` pipeline.
matcher = Matcher(nlp.vocab)
# Register the pattern under the key "SENTIMENT_WORDS" (a list of patterns is expected).
matcher.add("SENTIMENT_WORDS", [pattern])

In [20]:
# Try the matcher on a sample sentence; each result is a
# (match_id, start_token, end_token) tuple, one per matched adjective.
matcher(nlp("there were many other little issues."))

[(10905816194214514640, 2, 3),
 (10905816194214514640, 3, 4),
 (10905816194214514640, 4, 5)]