In [1]:
# Enable IPython's autoreload extension so edits to the project modules
# imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
#ignore pandas warning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import os

# Move the working directory to the parent of the directory the kernel started in,
# so the relative paths used below ("data/...", model files) resolve.
# The start directory is remembered on first execution: a bare os.chdir("..")
# would climb one directory further every time this cell is re-run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [4]:
import pandas as pd
from Aspects.ExplicitAspectExtractor import ExplicitAspectExtractor
from Preprocessors.ReviewPreprocessor import ReviewPreprocessor
# NOTE(review): ImplicitAspectExtractor is not used in the cells visible here.
from Aspects.ImplicitAspectExtractor import ImplicitAspectExtractor
from Aspects.CoRefAspectIdentGrouping import CoRefAspectIdentGrouping
import spacy
# Single spaCy pipeline instance, shared by the preprocessor, the explicit
# aspect extractor and the co-reference grouping objects created below.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Load the TripAdvisor review chunk (UTF-16 encoded CSV, path relative to the
# working directory set above) and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, i.e. a previously saved
# index is read back as a data column; consider index_col=0 if nothing downstream
# relies on that column.
data = pd.read_csv("data/trip_advisor_data_chunk_10000k.csv", encoding="utf-16")
data.head()

Unnamed: 0,hotel_url,author,date,rating,title,review
0,Hotel_Review-g194775-d1121769-Reviews-Hotel_Ba...,Lagaiuzza,2016-01-01T00:00:00,5.0,"Baltic, what else?",We have spent in this hotel our summer holiday...
1,Hotel_Review-g194775-d1121769-Reviews-Hotel_Ba...,ashleyn763,2014-10-01T00:00:00,5.0,Excellent in every way!,I visited Hotel Baltic with my husband for som...
2,Hotel_Review-g194775-d1121769-Reviews-Hotel_Ba...,DavideMauro,2014-08-01T00:00:00,5.0,The house of your family's holiday,I've travelled quite a numbers of hotels but t...
3,Hotel_Review-g303503-d1735469-Reviews-Pousada_...,TwoMonkeysTravel,2017-03-01T00:00:00,5.0,Natural Luxury,"The property is surrounded by trees, which are..."
4,Hotel_Review-g303503-d1735469-Reviews-Pousada_...,analuizade,2016-09-01T00:00:00,5.0,Very cozy!,I had a very pleasant stay at this hotel! All ...


In [6]:
# Clean the raw review texts with the project preprocessor.
# The list ["rif", "riad", "dar"] is passed through to the constructor as-is —
# presumably domain-specific words the preprocessor must handle; TODO confirm.
preprocessor = ReviewPreprocessor(data["review"], nlp, ["rif", "riad", "dar"])
# NOTE(review): every step overwrites the same `cleaned_review` column, so the
# column finally holds whatever remove_objective_sentences() returns. The three
# steps only add up to one pipeline if the preprocessor applies them cumulatively
# to its own internal state — verify in ReviewPreprocessor before reordering.
data['cleaned_review'] = preprocessor.remove_tags()
data["cleaned_review"] = preprocessor.lowercase_transformation()
data['cleaned_review'] = preprocessor.remove_objective_sentences()

3000it [01:43, 28.95it/s]


In [7]:
# Extract the explicit aspects from the cleaned reviews.
# NOTE(review): threshold=50 is presumably a minimum occurrence count for an
# aspect to be kept — confirm in ExplicitAspectExtractor.start.
explicit_aspects_extractor = ExplicitAspectExtractor(data["cleaned_review"], nlp)
explicit_aspects = explicit_aspects_extractor.start(threshold=50)

In [8]:
# Grouping helper that keeps whatever word-vector model the class sets up itself
# (the later cells replace `model_wv` explicitly; this one does not).
# explicit_aspects is converted with dict(), so it is expected to be a mapping
# or an iterable of (key, value) pairs.
co_ref_id_grop = CoRefAspectIdentGrouping(data, dict(explicit_aspects), nlp)

### Ancien modèle.

L'ancien modèle est entraîné sur 19'472 phrases.

In [9]:
# Group the explicit aspects with the helper's built-in model and display the result.
# NOTE(review): 0.67 is presumably a minimum similarity for two aspects to share a
# group — confirm in get_co_reference_aspects_groups. `co_ref_aspects` is rebound
# by each later model run, so this result is not kept for comparison.
co_ref_aspects = co_ref_id_grop.get_co_reference_aspects_groups(threshold=0.67)
co_ref_aspects

3000it [01:38, 30.53it/s]


{'breakfast': ['meal',
  'morning',
  'choice',
  'dinner',
  'drink',
  'food',
  'bar',
  'evening',
  'breakfast',
  'coffee'],
 'location': ['location'],
 'time': ['day', 'time'],
 'people': ['people'],
 'way': ['way'],
 'guest': ['guest'],
 'kid': ['kid', 'family'],
 'door': ['door'],
 'bit': ['bit'],
 'desk': ['reception', 'desk'],
 'experience': ['experience'],
 'room': ['room', 'water', 'shower', 'bed', 'bathroom'],
 'service': ['service'],
 'hotel': ['resort', 'place', 'hotel', 'riad'],
 'thing': ['thing'],
 'trip': ['trip'],
 'night': ['night'],
 'year': ['year'],
 'pool': ['beach', 'pool'],
 'area': ['area'],
 'stay': ['stay'],
 'staff': ['staff'],
 'lot': ['lot'],
 'price': ['price'],
 'minute': ['minute'],
 'view': ['view'],
 'floor': ['floor'],
 'restaurant': ['restaurant'],
 'city': ['city'],
 'parking': ['parking']}

#### Nouveau modèle.

##### 1er modèle

Le nouveau modèle Word2Vec est entraîné sur 90'000 phrases.

In [11]:
from gensim.models import Word2Vec

In [12]:
# Same grouping helper, but with its word-vector model replaced by the Word2Vec
# model stored in "90K_model_sg_hs_10.pkl" (path relative to the working directory).
# SECURITY: gensim's load() unpickles the file — only load model files you trust.
# NOTE(review): Word2Vec.load returns a full Word2Vec model; the attribute name
# `model_wv` suggests keyed vectors may be expected — confirm in CoRefAspectIdentGrouping.
co_ref_id_grop2 = CoRefAspectIdentGrouping(data, dict(explicit_aspects), nlp)
co_ref_id_grop2.model_wv = Word2Vec.load("90K_model_sg_hs_10.pkl")

In [13]:
# Group the aspects with the 90K model and display the result.
# The threshold (0.45) differs from the other runs — presumably tuned per model;
# TODO confirm. Note that `co_ref_aspects` overwrites the previous run's groups.
co_ref_aspects = co_ref_id_grop2.get_co_reference_aspects_groups(threshold=0.45)
co_ref_aspects

{'breakfast': ['meal',
  'day',
  'restaurant',
  'morning',
  'dinner',
  'drink',
  'food',
  'bar',
  'night',
  'evening',
  'time',
  'breakfast'],
 'hotel': ['location', 'place', 'hotel', 'riad'],
 'people': ['people'],
 'way': ['way'],
 'guest': ['guest'],
 'family': ['family'],
 'door': ['door'],
 'bit': ['bit'],
 'desk': ['reception', 'desk'],
 'stay': ['trip', 'stay', 'experience'],
 'room': ['shower', 'bed', 'room', 'bathroom'],
 'staff': ['staff', 'service'],
 'beach': ['resort', 'beach'],
 'thing': ['thing'],
 'year': ['year'],
 'choice': ['choice'],
 'area': ['area'],
 'water': ['water'],
 'lot': ['lot'],
 'coffee': ['coffee'],
 'price': ['price'],
 'minute': ['minute'],
 'pool': ['pool'],
 'view': ['view'],
 'floor': ['floor'],
 'kid': ['kid'],
 'city': ['city'],
 'parking': ['parking']}

##### 2ème modèle

Le nouveau modèle Word2Vec est entraîné sur 411'195 phrases.

In [15]:
# Same grouping helper, but with its word-vector model replaced by the Word2Vec
# model stored in "50K_review_model_sg_hs_10.pkl" (path relative to the working directory).
# SECURITY: gensim's load() unpickles the file — only load model files you trust.
# NOTE(review): Word2Vec.load returns a full Word2Vec model; the attribute name
# `model_wv` suggests keyed vectors may be expected — confirm in CoRefAspectIdentGrouping.
co_ref_id_grop3 = CoRefAspectIdentGrouping(data, dict(explicit_aspects), nlp)
co_ref_id_grop3.model_wv = Word2Vec.load("50K_review_model_sg_hs_10.pkl")

In [16]:
# Group the aspects with the 50K-review model and display the result.
# The threshold (0.48) differs from the other runs — presumably tuned per model;
# TODO confirm. Note that `co_ref_aspects` overwrites the previous run's groups.
co_ref_aspects = co_ref_id_grop3.get_co_reference_aspects_groups(threshold=0.48)
co_ref_aspects

{'breakfast': ['meal',
  'day',
  'restaurant',
  'morning',
  'dinner',
  'food',
  'bar',
  'night',
  'evening',
  'time',
  'breakfast'],
 'hotel': ['location', 'pool', 'resort', 'place', 'hotel', 'beach'],
 'people': ['people'],
 'way': ['way'],
 'guest': ['guest'],
 'family': ['family'],
 'door': ['door'],
 'bit': ['bit'],
 'desk': ['reception', 'desk'],
 'stay': ['trip', 'stay', 'experience'],
 'room': ['room', 'floor', 'shower', 'bed', 'bathroom'],
 'service': ['service'],
 'thing': ['thing'],
 'drink': ['drink'],
 'year': ['year'],
 'choice': ['choice'],
 'riad': ['riad'],
 'area': ['area'],
 'water': ['water'],
 'staff': ['staff'],
 'lot': ['lot'],
 'coffee': ['coffee'],
 'price': ['price'],
 'minute': ['minute'],
 'view': ['view'],
 'kid': ['kid'],
 'city': ['city'],
 'parking': ['parking']}

##### 3ème modèle

Le nouveau modèle Word2Vec est entraîné sur 804'177 phrases.

In [18]:
# Same grouping helper, but with its word-vector model replaced by the Word2Vec
# model stored in "100K_reviews_model_sg_hs_10.pkl" (path relative to the working directory).
# SECURITY: gensim's load() unpickles the file — only load model files you trust.
# NOTE(review): Word2Vec.load returns a full Word2Vec model; the attribute name
# `model_wv` suggests keyed vectors may be expected — confirm in CoRefAspectIdentGrouping.
co_ref_id_grop4 = CoRefAspectIdentGrouping(data, dict(explicit_aspects), nlp)
co_ref_id_grop4.model_wv = Word2Vec.load("100K_reviews_model_sg_hs_10.pkl")

In [19]:
# Group the aspects with the 100K-review model and display the result.
# The threshold (0.50) differs from the other runs — presumably tuned per model;
# TODO confirm. Note that `co_ref_aspects` overwrites the previous run's groups.
co_ref_aspects = co_ref_id_grop4.get_co_reference_aspects_groups(threshold=0.50)
co_ref_aspects

{'breakfast': ['meal',
  'day',
  'restaurant',
  'morning',
  'dinner',
  'drink',
  'food',
  'bar',
  'night',
  'evening',
  'time',
  'breakfast'],
 'hotel': ['location', 'resort', 'place', 'hotel', 'beach'],
 'people': ['people'],
 'way': ['way'],
 'guest': ['guest'],
 'family': ['family'],
 'door': ['door'],
 'bit': ['bit'],
 'desk': ['reception', 'desk'],
 'stay': ['trip', 'stay', 'experience'],
 'room': ['shower', 'room', 'bathroom'],
 'staff': ['staff', 'service'],
 'thing': ['thing'],
 'year': ['year'],
 'choice': ['choice'],
 'riad': ['riad'],
 'area': ['area'],
 'water': ['water'],
 'lot': ['lot'],
 'coffee': ['coffee'],
 'price': ['price'],
 'minute': ['minute'],
 'pool': ['pool'],
 'view': ['view'],
 'floor': ['floor'],
 'kid': ['kid'],
 'bed': ['bed'],
 'city': ['city'],
 'parking': ['parking']}

**Remarques:**
 
- Les quatre modèles donnent presque les mêmes résultats. Les groupes qui contiennent au moins deux aspects sont similaires d'un modèle à l'autre, par exemple breakfast, room, hotel et desk.
- Dans le groupe breakfast, nous remarquons la présence des aspects evening et morning ; cela est dû à la similarité entre l'aspect dinner et le moment evening, ainsi qu'à la similarité entre morning et breakfast.