In [28]:
# Imports.
import io
import sys
import csv
import pandas as pd
import numpy as np
import nltk
import gensim
from panns import *

In [4]:
# Load the dataset.
df = pd.read_csv("../../data/comments_testdata_small.csv")

# Clean comments for any unwanted elements. Consider adjusting based on clustering models.
# Every value goes through str() first, so a missing comment (NaN) becomes the
# literal text 'nan' instead of raising. The column is rebuilt in a single pass
# rather than through per-row `df.loc[i]['Comment']` lookups, so the cleaning does
# not depend on the index labels being exactly 0..len(df)-1.
df['Comment'] = [
    str(comment).replace('xxxx', '').replace('*', '')
    for comment in df['Comment']
]

# TODO: Remove stop words.
# TODO: Clean for typos.

# Inspect the dataset: number of rows and the first (cleaned) comment.
print("Length dataset: {}".format(len(df)))
print("Example comment:\n{}".format(df['Comment'].iloc[0]))

Length dataset: 100
Example comment:
Teach  manners. Better pay less hours. Pay over time for filling in this.


In [5]:
# Word2Vec general model: load the word vectors stored in binary word2vec format.
vector_path = 'GoogleNewsVectors300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(fname=vector_path, binary=True)

In [6]:
# Mean embeddings for each comment.
# Each comment is tokenized and represented by the element-wise mean of the
# vectors of those tokens that exist in the Word2Vec vocabulary. The resulting
# list is aligned with the rows of `df` (one vector per comment, same order).
mean_embeddings = []
for comment in df['Comment']:
    tokens = nltk.tokenize.word_tokenize(comment)

    # Ignore the word if it does not exist in the model's vocabulary.
    embeddings = [model[token] for token in tokens if token in model]

    if embeddings:
        mean_embedding = np.array(embeddings).mean(axis=0)
    else:
        # No token of this comment is known to the model (or the comment is empty).
        # Averaging an empty list would yield a scalar NaN, which breaks the
        # per-value TSV export and the cluster averaging further down, so keep
        # the row aligned with `df` by using a zero vector of the model's size.
        mean_embedding = np.zeros(model.vector_size, dtype=np.float32)

    mean_embeddings.append(mean_embedding)

In [7]:
# Sanity check: print the first comment's mean embedding to confirm it holds real values.
print("{}".format(mean_embeddings[0]))

[ 2.36675553e-02  6.84110224e-02  1.14605241e-02  1.28540039e-01
 -3.49684507e-02  1.11459587e-02  3.14565795e-03 -7.71953911e-02
  3.52125913e-02  8.29045251e-02 -5.19972574e-03  7.56929815e-02
  2.99641527e-02  2.06017122e-02 -3.53909992e-02  1.20726362e-01
  9.57219079e-02  1.57752400e-03 -3.22547331e-02  5.28141893e-02
  5.31146713e-02  3.80483791e-02  3.13509442e-02  4.44825962e-02
 -1.87518783e-02  9.81577337e-02 -7.18231201e-02  6.06501661e-02
  2.42966879e-03  6.43780082e-02 -9.79731604e-02  3.25880796e-02
 -6.01243228e-02 -4.08750698e-02  2.74118278e-02 -5.27020954e-02
  8.92052267e-05 -3.78042385e-02 -1.58409700e-02  9.99380276e-02
  6.24812208e-02 -1.04853705e-01  4.15391177e-02 -6.86927214e-02
 -7.27445185e-02 -3.81892286e-02 -6.31432161e-02  8.48576501e-02
 -2.76160613e-02 -2.29116576e-03 -6.69086128e-02 -2.00007507e-03
 -5.25841373e-04  1.39629655e-02  7.67446682e-02  3.29284668e-02
 -1.27280608e-01 -6.22652508e-02  7.10546039e-03 -9.75341797e-02
 -5.62368557e-02  6.90137

In [None]:
# Prepare mean embeddings for visualization.
# Writes one row per comment, with the embedding values separated by tabs.
filename = 'mean_embeddings.tsv'
with open(filename, 'w', newline='') as f:
    # Create the writer once (not once per row) and let it iterate the vectors
    # directly; each vector's values are written in order as one TSV row.
    tsv_output = csv.writer(f, delimiter='\t')
    tsv_output.writerows(mean_embeddings)

# Visualize the mean embeddings via TensorFlow Projector.
# https://projector.tensorflow.org/
# Use t-SNE for approx. 500 iterations.

In [8]:
# The t-SNE visualization already shows some clusters. Let's use this to generate clusters we can analyze.
# For now the clusters are grouped manually, using some indices provided by TensorFlow Projector.
i_cluster_01 = [23, 97, 63, 58, 59, 54, 67, 49]
i_cluster_02 = [27, 85, 60, 32, 14, 82, 36, 99]

# Look up the comment text behind every index of each cluster.
# NOTE(review): the `- 1` presumably maps 1-based indices onto 0-based rows;
# confirm against the numbering TensorFlow Projector actually reports.
c_cluster_01 = [df.loc[position - 1, 'Comment'] for position in i_cluster_01]
c_cluster_02 = [df.loc[position - 1, 'Comment'] for position in i_cluster_02]

# Visualize the comments in each group to see if there are apparent semantic similarities.
print('Cluster 1:\n')
print(*c_cluster_01, sep='\n')

print('\n\n\n\n')

print('Cluster 2:\n')
print(*c_cluster_02, sep='\n')


Cluster 1:

 is a racist and discriminating garage from the management to some of the employees therefore it needs to be investigate.
Health and safety seems so important to the company yet, we don't have enough running time and are not able to go the toilet!!!!
The driver's cabs could be cleaner, and I don't get any information about anything.
The only problem I have had in my workplace are with controllers speaking to you like you are not important to the company.
1- I do the same job as a metroline driver & probably do it better, why should I get paid less than them? 2- Why should I have to wait 8 years to get higher rate when its 2 years in other companies?
Wrong time to hand out survey due to current pay talks. If the company was fair to everyone better responses would apply.  does a lot to look like the correct things are done unfortunately not enough action is done.
No fan or A / C in the summer.  competitiveness looking. Dirty bus.
RTC's are too tight, no consideration for driv

In [32]:
# We may see some similarities between the comments. Let's establish semantic labels to try and 'explain' the clusters.
# Collect the per-comment mean embeddings belonging to each cluster
# (same `- 1` index shift as used when looking up the comment texts).
v_cluster_01 = [mean_embeddings[position - 1] for position in i_cluster_01]
v_cluster_02 = [mean_embeddings[position - 1] for position in i_cluster_02]

# Mean embedding of the mean embeddings: one averaged vector per cluster.
me_cluster_01 = np.mean(v_cluster_01, axis=0)
me_cluster_02 = np.mean(v_cluster_02, axis=0)

# TODO: Get KNN for these new mean embeddings. See spotify:annoy, facebook:faiss, panns
# TODO: See also: https://www.aclweb.org/anthology/W16-1612.pdf
# TODO: Extract cluster labels from the KNN.

In [None]:
# TODO: Label the comments with mean embeddings.