In [7]:
# imports
import pandas as pd
import string
import spacy
nlp = spacy.load('en_core_web_sm')
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score




# Load filtered dataframe

In [41]:
# open cleaned dataframe

path = "data/cleaned_customer_dataframe.csv"
filtered_df = pd.read_csv(path)

# extract clean sents for bert and raw sents for evaluation
raw = filtered_df['raw_sentences'].tolist()
clean = filtered_df['sentences'].tolist()

# remove NaN instances from both sent lists
# split sentences into words without losing sentence structure
raw_sents = []
clean_sents = []

i = 0
for item in clean: 
    if isinstance(item, str) == True and len(item) > 5:
        item = item.split()
        sent = []
        for word in item:
            sent.append(word)
        clean_sents.append(sent)
        raw_sents.append(raw[i])
    i += 1

# Train word2vec model 

Here I train a word2vec model on the dataset and create sentence vectors by averaging word vectors per sentence.

In [42]:
# train w2v model

w2v_model = Word2Vec(sentences=clean_sents, vector_size=50, window=4, min_count=1)

In [43]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features
    
vectorized_docs = vectorize(clean_sents, model=w2v_model)
len(vectorized_docs), len(vectorized_docs[0])

(1443, 50)

# Clustering with KMeans

Here I cluster the data based on sentence vectors. Cluster statistics and some examples are printed for inspection.

In [98]:
# cluster with KMeans

# fit and predict clusters
num_clusters = 30
km = KMeans(n_clusters=num_clusters)
km.fit(vectorized_docs)

# print cluster statistics
sample_silhouette_values = silhouette_samples(vectorized_docs, km.labels_)
print(f"Silhouette values:")

silhouette_values = []
for i in range(num_clusters):
    cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
    silhouette_values.append((i, cluster_silhouette_values.shape[0], cluster_silhouette_values.mean(), cluster_silhouette_values.min(), cluster_silhouette_values.max(),))
    
silhouette_values = sorted(silhouette_values, key=lambda tup: tup[2], reverse=True)
for s in silhouette_values:
    print(f"Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}")

Silhouette values:
Cluster 11: Size:17 | Avg:1.00 | Min:1.00 | Max: 1.00
Cluster 14: Size:3 | Avg:1.00 | Min:1.00 | Max: 1.00
Cluster 17: Size:2 | Avg:1.00 | Min:1.00 | Max: 1.00
Cluster 24: Size:4 | Avg:1.00 | Min:1.00 | Max: 1.00
Cluster 28: Size:11 | Avg:0.63 | Min:0.20 | Max: 0.79
Cluster 29: Size:3 | Avg:0.58 | Min:0.28 | Max: 0.73
Cluster 13: Size:4 | Avg:0.39 | Min:0.18 | Max: 0.59
Cluster 7: Size:6 | Avg:0.38 | Min:0.09 | Max: 0.56
Cluster 25: Size:10 | Avg:0.31 | Min:0.03 | Max: 0.54
Cluster 18: Size:13 | Avg:0.07 | Min:-0.09 | Max: 0.31
Cluster 16: Size:176 | Avg:0.07 | Min:0.02 | Max: 0.13
Cluster 21: Size:57 | Avg:0.05 | Min:-0.03 | Max: 0.16
Cluster 19: Size:113 | Avg:0.04 | Min:-0.03 | Max: 0.14
Cluster 23: Size:22 | Avg:0.04 | Min:-0.15 | Max: 0.14
Cluster 0: Size:20 | Avg:0.02 | Min:-0.07 | Max: 0.15
Cluster 27: Size:52 | Avg:0.02 | Min:-0.07 | Max: 0.12
Cluster 20: Size:196 | Avg:0.00 | Min:-0.05 | Max: 0.06
Cluster 26: Size:48 | Avg:-0.00 | Min:-0.10 | Max: 0.18
Clust

In [114]:
# print full cluster overview

clusters = km.labels_.tolist()
clustered_articles ={'raw_sents': raw_sents, 'clean_sents':clean_sents, 'cluster': clusters}
overview = pd.DataFrame(clustered_articles)
overview.head(10)

Unnamed: 0,raw_sents,clean_sents,cluster
0,The correct way to do it is via an OCS Account...,"[correct, way, ocs, account, takeover, email, ...",20
1,My friend is without internet we need to play ...,"[friend, internet, need, play, videogame, skil...",20
2,"I have my phone number and email , that 's it .","[phone, number, email]",12
3,How did I get equipment and service ?,"[equipment, service]",12
4,I 'm literally trying to pay and nobody can fi...,"[literally, try, pay, find]",16
5,Thank you for resolving my issue so quickly ! !,"[thank, resolve, issue, quickly]",23
6,Y’all are the best ☺ ️ # fanforlife .,"[y, all, good, fanforlife]",2
7,So frustrated with 😡 Ordered dinner on Saturda...,"[frustrated, order, dinner, saturday, app]",5
8,Order was wrong AND they charged my credit car...,"[order, wrong, charge, credit, card, twice]",2
9,Pretty much explained my issue in the quoted t...,"[pretty, explain, issue, quote, tweet, drag, i...",9


In [130]:
# print selection of instances per cluster for inspection

df = overview.loc[overview['cluster'] == 25]
df.head(20)

Unnamed: 0,raw_sents,clean_sents,cluster
69,"No refund because it "" does n't qualify "" .","[refund, qualify]",25
114,and so you have provided me the refund for that .,"[provide, refund]",25
321,I would like a refund .,"[like, refund]",25
524,Give me my refund,[refund],25
607,Can you refund me the difference ? .,"[refund, difference]",25
933,i 'd like a full refund 1/2 .,"[like, refund]",25
1023,Can I get a refund ? .,[refund],25
1130,# refund https://t.co/7RoF4wNv9v .,[refund],25
1161,A refund .,[refund],25
1364,How can I get a refund for food that I do n't ...,"[refund, food, like]",25


In [110]:
# checking in which clusters the keywords can be found

df = overview.loc[overview['raw_sents'].str.contains('internet')].head(40)
l = df['cluster'].tolist()

from collections import Counter
c = Counter(list(l))
c.most_common(5)

[(26, 13), (20, 11), (16, 6), (27, 2), (2, 2)]

In [123]:
# save chosen cluster to excel for further inspection
# df.to_excel('results/cluster26orderproblems.xlsx')