In [None]:
# Imports.
import io
import sys
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Load pre-trained Word2Vec vectors (nothing is trained in this notebook).
# binary=True: the vectors file is read in the binary word2vec format.
# NOTE(review): the file name suggests the 300-d Google News vectors - confirm.
vector_path = 'GoogleNewsVectors300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(vector_path, binary=True)

In [None]:
# Load the dataset (semicolon-delimited CSV; the text lives in the 'Comments' column).
df = pd.read_csv("../../data/pnlp_data_en.csv", delimiter=';')

# Clean comments for any unwanted elements. Consider adjusting based on clustering goals.
stop_words = set(stopwords.words('english'))
wordlist_en = [w for w in nltk.corpus.words.words('en') if w.islower()]
# Set view of the word list: the per-token membership test below becomes a
# constant-time lookup instead of a linear scan over the whole list.
wordset_en = set(wordlist_en)

comments_cleaned = []
# dropna() skips rows whose comment is missing; word_tokenize cannot handle NaN.
for raw_comment in df['Comments'].dropna():
    raw_comment = str(raw_comment)
    tokens = nltk.tokenize.word_tokenize(raw_comment)
    tokens_filtered = [w for w in tokens if w in wordset_en and w not in stop_words]
    # Keep a comment only if it has at least two recognised, non-stop-word tokens.
    if len(tokens_filtered) > 1:
        comments_cleaned.append(raw_comment.replace('xxxx', '').replace('*', '').lower())

# Inspect the dataset.
print("Length dataset: {}".format(len(comments_cleaned)))
# Guard the example: indexing an empty list would raise IndexError.
if comments_cleaned:
    print("Example comment: {}".format(comments_cleaned[0]))

In [None]:
# Refer to the cleaned comments under a shorter name for the rest of the notebook.
# Note: this binds a second name to the same list object; no copy is made.
comments = comments_cleaned

In [None]:
# Mean embeddings for each comment.
# A comment whose tokens are all stop words or missing from the Word2Vec
# vocabulary has nothing to average; the mean of an empty list is a scalar NaN,
# which breaks the clustering step. Such comments are dropped, and `comments`
# is rebound to the kept ones so it stays index-aligned with `mean_embeddings`.
comments_embedded = []
mean_embeddings = []
for comment in comments:
    tokens = nltk.tokenize.word_tokenize(comment)
    # Ignore stop words and any word that does not exist in the model.
    embeddings = [model[w] for w in tokens if w not in stop_words and w in model]
    if not embeddings:
        continue

    comments_embedded.append(comment)
    mean_embeddings.append(np.mean(embeddings, axis=0))

comments = comments_embedded

In [None]:
# Agglomerative clustering with Ward linkage over the comment embeddings,
# using a fixed, pre-chosen number of clusters.
n_clusters = 12
ward = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
ward.fit(mean_embeddings)

In [None]:
# Organize the data: one (comment, mean embedding, cluster label) row per comment.
cluster_labels = ward.labels_
merged = [
    (text, mean_embeddings[idx], cluster_labels[idx])
    for idx, text in enumerate(comments)
]

# Columns are positional: 0 = comment, 1 = mean embedding, 2 = cluster label.
df_merged = pd.DataFrame(merged)

In [None]:
# Define a class to store the cluster data.
class Cluster:
    """Plain record bundling the data kept for one cluster.

    Attributes (as populated elsewhere in this notebook):
        c_cl: the comments assigned to the cluster.
        me_cl: the mean embeddings of those comments.
        i_cl: the cluster index.
        l_cl: the labels proposed for the cluster.
    """

    def __init__(self, c_cl=None, me_cl=None, i_cl=None, l_cl=None):
        # Every field is optional and defaults to None.
        self.c_cl, self.me_cl, self.l_cl, self.i_cl = c_cl, me_cl, l_cl, i_cl

In [None]:
# Generate the list of cluster data objects, one per cluster index in range(n_clusters).
clusters = []
for label in range(n_clusters):
    in_cluster = df_merged[2] == label  # boolean row mask for this cluster
    clusters.append(
        Cluster(
            c_cl=df_merged[0][in_cluster],
            me_cl=df_merged[1][in_cluster],
            i_cl=label,
        )
    )

In [None]:
# We will place the mean cluster embeddings in the Word2Vec vector space. First prep the NearPy embedding query engine.
# The dimensionality is read from the loaded model rather than hard-coded, and the
# random projections are seeded so that the hash buckets - and therefore the
# neighbours used as cluster labels - are the same on every run.
dimension = model.vector_size
rbp = RandomBinaryProjections('rbp', 10, rand_seed=42)
engine = Engine(dimension, lshashes=[rbp])

# Create list of all valid English words to filter out strange tokens in Word2Vec model.
wordlist_en = [w for w in nltk.corpus.words.words('en') if w.islower()]

# Build the engine using the Word2Vec model.
for word in wordlist_en:
    # Skip words that have no vector in the model.
    if word in model:
        engine.store_vector(model[word], word)

In [None]:
# Sanity check: query the engine with everyday words and eyeball the neighbours.
test_fruit = engine.neighbours(model['apple'])
test_man = engine.neighbours(model['man'])

# Index 1 of each result is printed - presumably the word stored with the vector.
for hits in (test_fruit, test_man):
    print([hit[1] for hit in hits])

In [None]:
# Add meaningful labels to the clusters.
for cluster in clusters:
    # Cluster centroid: mean over the cluster members' mean embeddings.
    centroid = np.array(cluster.me_cl).mean(axis=0)
    # Keep only index 1 of every neighbour returned for the centroid.
    cluster.l_cl = [neighbour[1] for neighbour in engine.neighbours(centroid)]

In [None]:
# Print the comment clusters and proposed labels.
for cluster in clusters:
    print("Cluster " + str(cluster.i_cl+1) + ":", end='')
    # Show up to three labels; slicing avoids an IndexError when the neighbour
    # query returned fewer than three words for this cluster.
    print(", ".join(str(label) for label in cluster.l_cl[:3]))
    print('\n' + "Sample Comment: ", end='')
    # First comment of the cluster, without building a list of all of them.
    print(next(iter(cluster.c_cl)))
    print('\n\n\n')

In [None]:
# Prepare mean embeddings for visualization.
# Two row-aligned files are written: one embedding vector per line, and the
# matching (comment, label) metadata per line after a header row.
filedata = 'mean_embeddings_full_clustered.tsv'
filelabels = 'mean_embeddings_full_clustered_labels.tsv'
# Both files are opened in write mode so re-running the cell regenerates them.
# Appending to the labels file would add a second header and duplicate rows,
# leaving it out of step with the vectors file.
with open(filedata, 'w', newline='', encoding='utf-8') as f1, \
        open(filelabels, 'w', newline='', encoding='utf-8') as f2:
    tsv_output1 = csv.writer(f1, delimiter='\t')
    tsv_output2 = csv.writer(f2, delimiter='\t')
    tsv_output2.writerow(['comment', 'label'])
    for cluster in clusters:
        # The label is the same for every row of a cluster, so build it once.
        # Slicing tolerates clusters with fewer than three proposed labels.
        label = ", ".join(str(word) for word in cluster.l_cl[:3])
        # Walk comments and embeddings in lockstep instead of rebuilding both
        # lists for every row.
        for comment, me_values in zip(cluster.c_cl, cluster.me_cl):
            tsv_output1.writerow(me_values)
            tsv_output2.writerow([comment, label])

In [None]:
# Visualize the mean embeddings via TensorFlow Projector.

# https://projector.tensorflow.org/
# Load both the embeddings and labels .tsv files.
# Color by label.
# Use UMAP visualization.
# Play around with other visualization methods.

In [None]:
# TODO: Study data more carefully to get a sense of the kinds of clusters we can expect, e.g. 'pay', 'hiring', 'facilities'.
# TODO: Clean comments for typos.
# TODO: Break longer comments into phrases because topics can be mixed.
# TODO: Identify key words to focus the analysis. Prioritize words such as 'pay', 'hiring', etc.
# TODO: Cluster comments around vectors of keyword categories.
# TODO: Optimize clustering and labeling methods.
# TODO: Optimize all methods to get better final results.
# TODO: Dynamically identify the optimal value of n_clusters.