In [1]:
# Imports.
import csv
import pandas as pd
import numpy as np
import nltk
import collections
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import gensim
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from sklearn.cluster import AgglomerativeClustering
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

In [2]:
# Global variables.
# Lower-case entries of NLTK's English word corpus.
wordlist_en = list(filter(str.islower, nltk.corpus.words.words('en')))
# Number of clusters requested from the clustering step further down.
N_TOPICS = 10

In [3]:
# Class definitions.
class Cluster:
    """Plain container for the data of one cluster.

    Every field is stored as passed in and defaults to ``None``. In this
    notebook the fields are used as follows:

    c_cl  -- the comments assigned to the cluster
    me_cl -- the mean embeddings of those comments
    i_cl  -- the cluster's integer index
    l_cl  -- the label proposed for the cluster (filled in later)
    """

    def __init__(self, c_cl=None, me_cl=None, i_cl=None, l_cl=None):
        self.c_cl, self.me_cl = c_cl, me_cl
        self.i_cl, self.l_cl = i_cl, l_cl

In [4]:
# Read the semicolon-separated comment export into a DataFrame.
df = pd.read_csv("../../data/pnlp_data_en.csv", sep=';')

In [5]:
# Tokenize comments into sentences.
# Drop missing cells (a float NaN has no .lower()) and coerce the rest to str
# so a stray non-text cell cannot abort the whole run.
comment_texts = df['Comments'].dropna().astype(str)

# Build the training corpus with one join instead of growing a string with `+=`.
corpus = ' '.join(text.lower() for text in comment_texts)

# Learn sentence-boundary parameters from the corpus itself.
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(corpus)

tokenizer = PunktSentenceTokenizer(trainer.get_params())

# Flat list of lower-cased sentences across all comments.
# NOTE(review): the trainer sees lower-cased text while the tokenizer is run on
# the original-case comment; kept as it was -- confirm this is intended.
comments_tokenized = [
    sentence.lower()
    for comment in comment_texts
    for sentence in tokenizer.tokenize(comment)
]

In [6]:
# Load pre-trained word vectors from a binary word2vec file (nothing is trained here).
vector_path = 'GoogleNewsVectors300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(fname=vector_path, binary=True)

In [7]:
# Clean comments for any unwanted elements. Consider adjusting based on clustering goals.
stop_words = set(stopwords.words('english'))

# `wordlist_en` (built in the global-variables cell) is a list, so every `in`
# test against it is a linear scan. Build the set of accepted words once:
# dictionary words that are not stop words.
content_words = set(wordlist_en) - stop_words

comments_cleaned = []
for sentence in comments_tokenized:
    tokens = nltk.tokenize.word_tokenize(sentence)
    # Keep a sentence only if more than one of its tokens is an accepted word.
    n_content_tokens = sum(1 for w in tokens if w in content_words)
    if n_content_tokens > 1:
        # Remove 'xxxx' and '*' (presumably redaction placeholders -- confirm).
        comments_cleaned.append(sentence.replace('xxxx', '').replace('*', ''))

# Inspect the dataset.
print("Length dataset: {}".format(len(comments_cleaned)))
# Guard the example so an empty result does not raise IndexError.
if comments_cleaned:
    print("Example comment: {}".format(comments_cleaned[0]))

Length dataset: 24197
Example comment: we do what our customers need, we communicate aperiodically.


In [8]:
# Give the cleaned sentences a shorter name for the remaining cells.
# This binds a second name to the same list object; no copy is made, so a
# change made through one name is visible through the other.
comments = comments_cleaned

In [9]:
# Mean embeddings for each comment.
# Exactly one vector is appended per comment so that mean_embeddings stays
# index-aligned with comments.
mean_embeddings = []
for comment in comments:
    tokens = nltk.tokenize.word_tokenize(comment)
    tokens_filtered = [w for w in tokens if w not in stop_words]

    embeddings = []
    for token in tokens_filtered:
        try:
            embeddings.append(model[token])
        except KeyError:
            # Ignore the word if it does not exist.
            pass

    if embeddings:
        mean_embedding = np.array(embeddings).mean(axis=0)
    else:
        # No token is in the model vocabulary. Averaging an empty array would
        # give a scalar NaN (plus a RuntimeWarning) and leave a non-vector
        # entry in the list passed to clustering, so fall back to a zero
        # vector of the model's dimensionality.
        mean_embedding = np.zeros(model.vector_size, dtype=np.float32)
    mean_embeddings.append(mean_embedding)

In [10]:
# Group the comment vectors into N_TOPICS clusters using agglomerative
# clustering with Ward linkage.
n_clusters = N_TOPICS
ward = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
ward = ward.fit(mean_embeddings)

In [11]:
# Pair every comment with its mean embedding and its assigned cluster label.
cluster_labels = ward.labels_
merged = list(zip(comments, mean_embeddings, cluster_labels))

# Columns: 0 = comment text, 1 = mean embedding, 2 = cluster label.
df_merged = pd.DataFrame(merged)

In [12]:
# Build one Cluster object per cluster label, in label order.
clusters = []
for label in range(n_clusters):
    # Rows of df_merged whose label column (2) matches this cluster.
    in_cluster = df_merged[2] == label
    clusters.append(
        Cluster(
            c_cl=df_merged[0][in_cluster],
            me_cl=df_merged[1][in_cluster],
            i_cl=label,
        )
    )

In [16]:
# Label clusters with frequency analysis of the comments.
for cluster in clusters:
    # Distinct nouns (POS tags NN / NNS) of each comment in the cluster.
    nouns_in_segments = []
    for line in cluster.c_cl:
        text_tagged = nltk.pos_tag(nltk.word_tokenize(line))
        nouns_in_segments.append(
            {word for word, tag in text_tagged if tag in ('NN', 'NNS')}
        )

    # Number of comments each noun occurs in.
    segment_counts = collections.Counter()
    for nouns in nouns_in_segments:
        segment_counts.update(nouns)

    # Score of a noun = number of ordered comment pairs (a comment paired with
    # itself included) that share it, i.e. the square of the number of
    # comments containing it. Deriving it from the per-noun counts avoids
    # intersecting every pair of comments, which is quadratic in cluster size.
    counter = collections.Counter(
        {noun: n * n for noun, n in segment_counts.items()}
    )
    total = sum(counter.values())

    # (noun, share of total score, score, total score) for the 30 top nouns.
    most_frequent = [
        (noun, count / total, count, total)
        for noun, count in counter.most_common(30)
    ]

    # A cluster in which no noun was tagged gets no label instead of raising
    # IndexError.
    cluster.l_cl = most_frequent[0][0] if most_frequent else None

In [17]:
# Print each cluster's 1-based number and proposed label, followed by its
# first five comments.
for cluster in clusters:
    print("Cluster ", cluster.i_cl + 1, ":", cluster.l_cl, sep='')
    print("Sample Comments: \n", end='')
    for sample in list(cluster.c_cl)[:5]:
        print(sample)
    print('\n\n\n')

Cluster 1:company
Sample Comments: 
customs business development continues to grow and expand, through the use of our internal business network & the team of people which are in place at the moment
to change the global forwarding business unit back to "forwarding mode" is very good.
however it is important these roles are distributed to the correct people and not some management level where the customer relationship is not supported
lots of big and positive changes taken place.
working towards new application cw1 also implementing new changes in old and new applications




Cluster 2:company
Sample Comments: 
i am happy to inform that  got monthly salary, uniform, air ticket and weekly fruit on time.
(please refrain from using people’s names or language that m
we need more rest most especially night shift at least deserves 2 days off.
our value adds for larger bc, mnc and csi are really working well,
our office as a whole works very well together.




Cluster 3:team
Sample Comments: 
i

In [18]:
# Prepare mean embeddings for visualization.
# Two tab-separated files are written row-for-row in the same order: one with
# the vector values, one with the comment text and cluster label (plus a
# header row).
filedata = 'mean_embeddings_full_clustered_03.tsv'
filelabels = 'mean_embeddings_full_clustered_labels_03.tsv'

# Both files are opened with 'w' so that re-running the cell rewrites them
# together. Appending to the labels file would stack another header and
# another copy of the rows onto it, leaving it out of step with the vectors
# file.
with open(filedata, 'w', newline='', encoding='utf-8') as f1, \
        open(filelabels, 'w', newline='', encoding='utf-8') as f2:
    tsv_output1 = csv.writer(f1, delimiter='\t')
    tsv_output2 = csv.writer(f2, delimiter='\t')
    tsv_output2.writerow(['comment', 'label'])
    for cluster in clusters:
        label = cluster.l_cl
        # Walk comments and vectors in lockstep instead of rebuilding both
        # lists for every row.
        for comment, me_values in zip(cluster.c_cl, cluster.me_cl):
            tsv_output1.writerow(me_values)
            tsv_output2.writerow([comment, label])

In [None]:
# Visualize the mean embeddings via TensorFlow Projector.

# https://projector.tensorflow.org/
# Load both the embeddings and labels .tsv files.
# Use UMAP visualization.
# Color by label.

In [None]:
# TODO: Write 'comments_cleaned' to a file so I don't have to wait so long.
# TODO: Make a new, clean notebook and push all the new stuff to git.
# TODO: Make a new Word2Vec model with more dimensions and run it overnight.
# https://wikipedia2vec.github.io/wikipedia2vec/pretrained/