## Building word2vec model using gensim

In [4]:
import warnings
warnings.filterwarnings('ignore')

#data processing
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stopWords = stopwords.words('english')

#modelling
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser

[nltk_data] Downloading package stopwords to C:\Users\Hridaya
[nltk_data]     Pradhan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
# load data
data = pd.read_csv('data/text.csv',header=None)
data.head()

Unnamed: 0,0
0,room kind clean strong smell dogs. generally a...
1,stayed crown plaza april april . staff friendl...
2,booked hotel hotwire lowest price could find. ...
3,stayed husband sons way alaska cruise. loved h...
4,girlfriends stayed celebrate th birthdays. pla...


In [6]:
# preprocess and prepare data
def pre_process(text):
    
    #convert to lowercase
    text = str(text).lower()
    
    #remove all special characters and keep only alpha numeric characters and spaces
    text = re.sub(r'[^A-Za-z0-9\s.]',r'',text)
    
    #remove new lines
    text = re.sub(r'\n',r' ',text)
    
    # remove stop words
    text = " ".join([word for word in text.split() if word not in stopWords])
    
    return text

pre_process(data[0][50])

'agree fancy. everything needed. breakfast pool hot tub nice shuttle airport later checkout time. noise issue tough sleep through. awhile forget noisy door nearby noisy guests. complained management later email credit compd us amount requested would return.'

In [7]:
data[0] = data[0].map(lambda x: pre_process(x))
data[0].head()

0    room kind clean strong smell dogs. generally a...
1    stayed crown plaza april april . staff friendl...
2    booked hotel hotwire lowest price could find. ...
3    stayed husband sons way alaska cruise. loved h...
4    girlfriends stayed celebrate th birthdays. pla...
Name: 0, dtype: object

In [8]:
data[0][1].split('.')[:5]

['stayed crown plaza april april ',
 ' staff friendly attentive',
 ' elevators tiny ',
 ' food restaurant delicious priced little high side',
 ' course washington dc']

In [9]:
corpus = []
for line in data[0][1].split('.'):
    words = [x for x in line.split()]
    corpus.append(words)

corpus[:2]

[['stayed', 'crown', 'plaza', 'april', 'april'],
 ['staff', 'friendly', 'attentive']]

In [10]:
data = data[0].map(lambda x: x.split('.'))

corpus = []
for i in (range(len(data))):
    for line in data[i]:
        words = [x for x in line.split()]
        corpus.append(words)

corpus[:2]

[['room', 'kind', 'clean', 'strong', 'smell', 'dogs'],
 ['generally', 'average', 'ok', 'overnight', 'stay', 'youre', 'fussy']]

In [11]:
phrases = Phrases(sentences=corpus,min_count=25,threshold=50)
bigram = Phraser(phrases)

for index,sentence in enumerate(corpus):
    corpus[index] = bigram[sentence]

corpus[111]

['connected', 'rivercenter', 'mall', 'downtown', 'san_antonio']

In [12]:
corpus[9]

['course', 'washington_dc']

In [None]:
## build model
size = 100  # Word vector size
window_size = 2
epochs = 100
min_count = 2
workers = 4
sg = 1

model = Word2Vec(corpus, sg=sg, window=window_size, vector_size=size, 
                 min_count=min_count, workers=workers, epochs=epochs)

In [16]:
model.save('model/word2vec.model')
model = Word2Vec.load('model/word2vec.model')
model = Word2Vec.load('model/word2vec.model')

In [18]:
# evaluate embeddings
model.wv.most_similar('san_diego')

[('san_antonio', 0.7854580879211426),
 ('san_francisco', 0.7668282389640808),
 ('phoenix', 0.7567591071128845),
 ('memphis', 0.7417790293693542),
 ('austin', 0.7309386730194092),
 ('dallas', 0.7235692143440247),
 ('indianapolis', 0.7212599515914917),
 ('la', 0.7203453183174133),
 ('boston', 0.7128455638885498),
 ('sf', 0.7112388610839844)]

In [20]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.6996163725852966)]

In [21]:
text = ['los_angeles','indianapolis', 'holiday', 'san_antonio','new_york']

model.wv.doesnt_match(text)

'holiday'

## Visualizing word embeddings with Tensorboard

In [55]:
import warnings
warnings.filterwarnings(action='ignore')


import tensorflow as tf
from tensorboard.plugins import projector
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.compat.v1.disable_eager_execution()

import numpy as np
import gensim
import os

In [56]:
file_name = "model/word2vec.model"
model = gensim.models.keyedvectors.KeyedVectors.load(file_name)

In [57]:
max_size = len(model.wv.index_to_key)
w2v = np.zeros((max_size, model.vector_size))

In [58]:
if not os.path.exists('projections'):
    os.makedirs('projections')
    
with open("projections/metadata.tsv", 'w+') as file_metadata:
    
    for i, word in enumerate(model.wv.index_to_key[:max_size]):
        
        #store the embeddings of the word
        w2v[i] = model.wv[word]
        
        #write the word to a file 
        file_metadata.write(word + '\n')
        
sess = tf.compat.v1.InteractiveSession()

ERROR:tensorflow:An interactive session is already active. This can cause out-of-memory errors or some other unexpected errors (due to the unpredictable timing of garbage collection) in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s). Please use `tf.Session()` if you intend to productionize.


In [59]:
with tf.device("/cpu:0"):
    embedding = tf.compat.v1.Variable(w2v, trainable=False, name='embedding')

In [66]:
sess.run(tf.compat.v1.global_variables_initializer())

In [67]:
saver = tf.compat.v1.train.Saver()

In [68]:
writer = tf.compat.v1.summary.FileWriter('projections', sess.graph)

In [69]:
config = projector.ProjectorConfig()
embed= config.embeddings.add()

In [70]:
embed.tensor_name = 'embedding'
embed.metadata_path = 'metadata.tsv'

In [71]:
projector.visualize_embeddings(writer, config)

saver.save(sess, 'projections/model.ckpt', global_step=max_size)

'projections/model.ckpt-28071'