In [7]:
import gzip
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Word2Vec, LdaMulticore

from gensim.corpora import Dictionary

import numpy as np
import pyLDAvis
from pyLDAvis import gensim

In [9]:
ls -lh data

total 83M
-rwxr--r-- 1 witek witek 83M Sep  3 10:30 [0m[01;32mreviews_data.txt.gz[0m*


In [15]:
with gzip.open('data/reviews_data.txt.gz', 'rb') as f:
    for i, line in enumerate(f):
        print(i, line)
        break

0 b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in 

In [29]:
#sentence = 'Aug 4 2009 \tExcellent value - location not a big problem.\tWe stayed at the Aloft Beijing Haidian for 5 nights from July 17 - 22nd.'

def preprocesing(sentence):
    return [word for word in simple_preprocess(sentence) if word not in STOPWORDS]

def read_sentences(filename):
    with gzip.open(filename, 'rb') as f:
        for line in f:
            yield preprocesing(line)
            
sentences = read_sentences('data/reviews_data.txt.gz')

In [20]:
next(sentences)

b"Aug 4 2009 \tExcellent value - location not a big problem.\tWe stayed at the Aloft Beijing Haidian for 5 nights from July 17 - 22nd. There are lots of reviews that talk about the location being a problem but we knew this ahead of time and found that it really wasn't an issue. The longest we spent in a taxi was about 30 minutes. We never paid more than 50rmb for a taxi ride (which is about $8 Cdn) and that was to the Forbidden City. Given there are 4 in our family, it was no big deal at all.As for the rooms, they were clean, the beds comfortable, the wireless internet connection reliable and it was one of the few hotels we found in Beijing that would accomodate 2 adults and 2 children. We paid about $70 Cdn. per night. That's an amazing price.It's not meant to be a 5 star hotel so you can't go in expecting that. We found the reception staff generally very helpful and friendly. They aren't the fastest in the world but it wasn't unreasonable at all.The hotel manager made an effort to sp

In [21]:
simple_preprocess('Aug 4 2009 \tExcellent value - location not a big problem.\tWe stayed at the Aloft Beijing Haidian for 5 nights from July 17 - 22nd.')

['aug',
 'excellent',
 'value',
 'location',
 'not',
 'big',
 'problem',
 'we',
 'stayed',
 'at',
 'the',
 'aloft',
 'beijing',
 'haidian',
 'for',
 'nights',
 'from',
 'july',
 'nd']

In [22]:
STOPWORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [35]:
%time sentences = list(read_sentences('data/reviews_data.txt.gz'))

CPU times: user 1min 20s, sys: 1.53 s, total: 1min 22s
Wall time: 1min 23s


In [36]:
len(sentences)

255404

## Model

In [37]:
model = Word2Vec(sentences, size=100, window=5, min_count=2)

#model = Word2Vec.load('')

### Reprezentacja wektorowa:

In [38]:
model.wv.most_similar('good')

[('decent', 0.829910397529602),
 ('great', 0.8146896362304688),
 ('excellent', 0.8067136406898499),
 ('ok', 0.6516876816749573),
 ('reasonable', 0.6229607462882996),
 ('fair', 0.6203199625015259),
 ('nice', 0.613592803478241),
 ('terrific', 0.6089853048324585),
 ('average', 0.6056482791900635),
 ('fantastic', 0.6003291606903076)]

In [39]:
model.wv.most_similar(positive=['bad', 'terrible'], negative=['good'])

[('horrible', 0.8362655639648438),
 ('awful', 0.8038921356201172),
 ('dreadful', 0.7090494632720947),
 ('shocking', 0.6991307735443115),
 ('appalling', 0.6802650690078735),
 ('horrendous', 0.6727789640426636),
 ('horrid', 0.6676274538040161),
 ('miserable', 0.6636506915092468),
 ('horrific', 0.6552661657333374),
 ('unpleasant', 0.6383471488952637)]

In [40]:
model.wv.most_similar('london')

[('nyc', 0.8406497240066528),
 ('sf', 0.8280203938484192),
 ('ny', 0.7993742227554321),
 ('manhattan', 0.7976840734481812),
 ('beijing', 0.748022198677063),
 ('chicago', 0.7369435429573059),
 ('dubai', 0.7328219413757324),
 ('montreal', 0.7144181132316589),
 ('manhatten', 0.7056013345718384),
 ('delhi', 0.6972203850746155)]

In [41]:
model.wv.most_similar('hotel')

[('property', 0.7104613780975342),
 ('place', 0.6269699931144714),
 ('hotels', 0.6132397651672363),
 ('resort', 0.5683138966560364),
 ('accommodation', 0.5414091944694519),
 ('establishment', 0.540724515914917),
 ('accomodation', 0.5245465040206909),
 ('accomodations', 0.505657434463501),
 ('location', 0.5014751553535461),
 ('properties', 0.49523359537124634)]

## Topic modeling - LDA

In [46]:
sentences_light = np.random.permutation(sentences)
sentences_light = sentences_light[:1000]

In [47]:
sentences

[['oct',
  'nice',
  'trendy',
  'hotel',
  'location',
  'bad',
  'stayed',
  'hotel',
  'night',
  'fairly',
  'new',
  'place',
  'taxi',
  'drivers',
  'know',
  'want',
  'drive',
  'eventually',
  'arrived',
  'hotel',
  'pleasantly',
  'surprised',
  'decor',
  'lobby',
  'ground',
  'floor',
  'area',
  'stylish',
  'modern',
  'reception',
  'staff',
  'geeting',
  'aloha',
  'bit',
  'place',
  'guess',
  'briefed',
  'coroporate',
  'image',
  'starwood',
  'preferred',
  'guest',
  'member',
  'given',
  'small',
  'gift',
  'check',
  'couple',
  'fridge',
  'magnets',
  'gift',
  'box',
  'nice',
  'gesture',
  'room',
  'nice',
  'roomy',
  'tea',
  'coffee',
  'facilities',
  'room',
  'complimentary',
  'bottles',
  'water',
  'plus',
  'toiletries',
  'bliss',
  'location',
  'great',
  'metro',
  'stop',
  'need',
  'taxi',
  'planning',
  'going',
  'historic',
  'sites',
  'beijing',
  'ok',
  'chose',
  'breakfast',
  'hotel',
  'tasty',
  'good',
  'selection',
 

In [48]:
sentences_light

array([list(['feb', 'multiple', 'properties', 'ok', 'work', 'years', 'travel', 'industry', 'select', 'suppliers', 'airlines', 'hotels', 'large', 'corporations', 'look', 'property', 'extreme', 'details', 'looking', 'hotels', 'business', 'person', 'attracted', 'location', 'amenities', 'offered', 'hotel', 'foremost', 'truly', 'properties', 'address', 'best', 'western', 'shaftesbury', 'premier', 'concrete', 'building', 'shaftesbury', 'executive', 'referred', 'hogarth', 'executive', 'rooms', 'shaftesbury', 'story', 'townhouse', 'street', 'terribly', 'confusing', 'hotel', 'staff', 'treat', 'arriving', 'purposely', 'booked', 'night', 'sure', 'room', 'arrival', 'checking', 'advised', 'receive', 'late', 'check', 'message', 'treated', 'words', 'charged', 'room', 'gave', 'away', 'things', 'happen', 'reservation', 'customers', 'fault', 'acceptable', 'despite', 'claiming', 'absolutely', 'sold', 'offered', 'room', 'main', 'building', 'best', 'western', 'shaftesbury', 'room', 'basement', 'windows', '

In [49]:
%time dictionary = Dictionary(sentences_light)

CPU times: user 140 ms, sys: 3.92 ms, total: 144 ms
Wall time: 147 ms


In [51]:
list(dictionary.items())

[(0, 'absolutely'),
 (1, 'acceptable'),
 (2, 'address'),
 (3, 'advised'),
 (4, 'advising'),
 (5, 'afternoon'),
 (6, 'airlines'),
 (7, 'amenities'),
 (8, 'apparently'),
 (9, 'areas'),
 (10, 'arrival'),
 (11, 'arriving'),
 (12, 'assume'),
 (13, 'assure'),
 (14, 'attracted'),
 (15, 'away'),
 (16, 'bad'),
 (17, 'badly'),
 (18, 'basement'),
 (19, 'bathroom'),
 (20, 'bathtub'),
 (21, 'best'),
 (22, 'booked'),
 (23, 'border'),
 (24, 'broken'),
 (25, 'buds'),
 (26, 'building'),
 (27, 'business'),
 (28, 'cable'),
 (29, 'carpet'),
 (30, 'center'),
 (31, 'centimeters'),
 (32, 'channels'),
 (33, 'charged'),
 (34, 'check'),
 (35, 'checking'),
 (36, 'cigaret'),
 (37, 'claiming'),
 (38, 'clean'),
 (39, 'cleanliness'),
 (40, 'closet'),
 (41, 'coffee'),
 (42, 'common'),
 (43, 'concrete'),
 (44, 'confusing'),
 (45, 'corporations'),
 (46, 'covered'),
 (47, 'customers'),
 (48, 'decision'),
 (49, 'declined'),
 (50, 'deep'),
 (51, 'despite'),
 (52, 'details'),
 (53, 'difference'),
 (54, 'dust'),
 (55, 'edge

In [52]:
len(dictionary)

8728

### BoW - Bag of Words

In [53]:
%time bow_corpus = [dictionary.doc2bow(sent) for sent in sentences_light]

CPU times: user 113 ms, sys: 4 ms, total: 117 ms
Wall time: 117 ms


In [54]:
bow_corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 2),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 3),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 1),
  (26, 7),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 3),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 2),
  (55, 1),
  (56, 5),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 4),
  (82, 2),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1)

In [55]:
%time lda_model = LdaMulticore(bow_corpus, id2word=dictionary, num_topics=100, passes=20, workers=8)

CPU times: user 3min 41s, sys: 1min 47s, total: 5min 28s
Wall time: 2min 44s


## Wizualizacja tematów:

In [56]:
lda_vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(lda_vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
