In [8]:
# Imports.
import io
import sys
import csv
import pandas as pd
import numpy as np
import nltk
import gensim

In [2]:
# Load the dataset.
df = pd.read_csv("../../data/comments_testdata_small.csv")

# Clean comments for any unwanted elements. Consider adjusting based on clustering models.
for i in range(len(df)):
    df.at[i, 'Comment'] = str(df.loc[i]['Comment']).replace('xxxx', '').replace('*', '')
    
# Inspect the dataset.
print("Length dataset: {}".format(len(df)))
print("Example comment:\n{}".format(df.loc[0]['Comment']))

Length dataset: 100
Example comment:
Teach  manners. Better pay less hours. Pay over time for filling in this.


In [3]:
# Word2Vec general model.
vector_path = 'GoogleNewsVectors300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(vector_path, binary=True)

In [4]:
# Mean embeddings for each comment.
mean_embeddings = []
for i in range(len(df)):
    tokens = nltk.tokenize.word_tokenize(df.loc[i]['Comment'])
    embeddings = []
    for token in tokens:
        try:
            embeddings.append(model[token])
        except KeyError as e:
            # Ignore the word if it does not exist.
            pass
    
    mean_embedding = np.array(embeddings).mean(axis=0)
    mean_embeddings.append(mean_embedding)

In [5]:
# Just a quick look at the mean_embeddings to see that they are real.

for vector in mean_embeddings:
    print(vector)

[ 2.36675553e-02  6.84110224e-02  1.14605241e-02  1.28540039e-01
 -3.49684507e-02  1.11459587e-02  3.14565795e-03 -7.71953911e-02
  3.52125913e-02  8.29045251e-02 -5.19972574e-03  7.56929815e-02
  2.99641527e-02  2.06017122e-02 -3.53909992e-02  1.20726362e-01
  9.57219079e-02  1.57752400e-03 -3.22547331e-02  5.28141893e-02
  5.31146713e-02  3.80483791e-02  3.13509442e-02  4.44825962e-02
 -1.87518783e-02  9.81577337e-02 -7.18231201e-02  6.06501661e-02
  2.42966879e-03  6.43780082e-02 -9.79731604e-02  3.25880796e-02
 -6.01243228e-02 -4.08750698e-02  2.74118278e-02 -5.27020954e-02
  8.92052267e-05 -3.78042385e-02 -1.58409700e-02  9.99380276e-02
  6.24812208e-02 -1.04853705e-01  4.15391177e-02 -6.86927214e-02
 -7.27445185e-02 -3.81892286e-02 -6.31432161e-02  8.48576501e-02
 -2.76160613e-02 -2.29116576e-03 -6.69086128e-02 -2.00007507e-03
 -5.25841373e-04  1.39629655e-02  7.67446682e-02  3.29284668e-02
 -1.27280608e-01 -6.22652508e-02  7.10546039e-03 -9.75341797e-02
 -5.62368557e-02  6.90137

  2.01510619e-02 -3.67675791e-03  6.31787106e-02 -6.97509795e-02]
[-0.07741293  0.02735731  0.06395128 -0.01929728 -0.19839986  0.04726834
  0.03955078 -0.10957167  0.149214    0.07393137 -0.08119032 -0.01440769
 -0.02555338  0.09817166 -0.05818685  0.03845893  0.04480658  0.11328125
  0.0393202  -0.06668091 -0.05699327  0.05251736 -0.01888021 -0.0122172
 -0.00713433  0.05037435 -0.09285143  0.05633884  0.02633328  0.00689019
 -0.01536475  0.02803548 -0.07463922 -0.10476346  0.02438354 -0.04631212
  0.04420302  0.07002767 -0.00364855 -0.0097385   0.02094862 -0.02803548
  0.07412667  0.00250922 -0.04603407 -0.04678345 -0.04326715 -0.01713732
 -0.03203668 -0.08173624 -0.10064019  0.05381944  0.01674398  0.01394823
  0.12117513  0.08177355 -0.04456668 -0.08801948  0.11542426 -0.10656738
  0.00219642  0.1003418  -0.10126241 -0.21687825  0.080051    0.05563015
 -0.0790134   0.005405   -0.11284044 -0.00241428  0.00872125  0.04397922
 -0.03173828  0.04281277 -0.20250109 -0.06302897  0.1095538

In [10]:
# Prepare mean embeddings for visualization.
filename = 'mean_embeddings.tsv'
with open(filename, 'w', newline='') as f:
    for vector in mean_embeddings:
        values = []
        for value in vector:
            values.append(value)
    
        tsv_output = csv.writer(f, delimiter='\t')
        tsv_output.writerow(values)

# Visualize the mean embeddings via TensorFlow Projector.
# https://projector.tensorflow.org/