In [401]:
# imports
import pandas as pd
import string
import spacy
nlp = spacy.load('en_core_web_sm')
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

## Loading the file

Here I open the dataframe created in the preprocessing notebook.

In [421]:
# open cleaned dataframe

path = "data/cleaned_customer_dataframe.csv"
filtered_df = pd.read_csv(path)

# extract clean sents for bert and raw sents for evaluation
raw = filtered_df['raw_sentences'].tolist()
clean = filtered_df['sentences'].tolist()

# remove NaN instances from both sent lists
# A row is kept only when its cleaned sentence is a string (NaN entries are
# not) with more than MIN_CLEAN_CHARS characters. raw and clean are filtered
# together, pairwise, so raw_sents[k] always belongs to clean_sents[k].
MIN_CLEAN_CHARS = 5

kept_pairs = [
    (raw_item, clean_item)
    for raw_item, clean_item in zip(raw, clean)
    if isinstance(clean_item, str) and len(clean_item) > MIN_CLEAN_CHARS
]
raw_sents = [raw_item for raw_item, _ in kept_pairs]
clean_sents = [clean_item for _, clean_item in kept_pairs]

In [422]:
# preview the first 10 cleaned sentences (the text that is fed to BERT)
clean_sents[:10]

['correct way ocs account takeover email consent form need local office ',
 'friend internet need play videogame skill diminish moment internetz ',
 'phone number email   ',
 'equipment service ',
 'literally try pay find   ',
 'thank resolve issue quickly   ',
 'y all good   fanforlife ',
 'frustrated order dinner saturday app ',
 'order wrong charge credit card twice ',
 'pretty explain issue quote tweet   drag image canvas long center snap ']

# Sentence vectors with BERT

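Here I create sentence vectors with BERT: each sentence is tokenized with the model's own tokenizer and passed through the pretrained model, and the resulting token vectors are then averaged into one vector per sentence.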

In [423]:
# Initialize bert
# The tokenizer and the model are loaded from the same checkpoint name so that
# the token ids produced by the tokenizer match the model's vocabulary.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Loads the bare BertModel; the pretraining-head weights stored in the
# checkpoint are not used, which is what the warning printed below reports.
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [424]:
# vectorize data
# Run every cleaned sentence through BERT and keep, per sentence, the 2-D array
# of token vectors taken from the model's first output.

model.eval()  # turn off dropout layers (once is enough; no need to repeat per sentence)

vectors = []
with torch.no_grad():  # inference only: do not build an autograd graph per sentence
    for sent in clean_sents:
        # add_special_tokens=True wraps the sentence in BERT's special start/end tokens
        tokens = tokenizer.encode(sent, add_special_tokens=True)
        tokens_tensor = torch.tensor(tokens).unsqueeze(0)  # add a batch dimension of 1
        output = model(tokens_tensor)
        # output[0] holds the per-token vectors; the trailing [0] drops the batch dimension
        vector = output[0].detach().numpy()[0]
        vectors.append(vector)

In [425]:
# average token vectors into sentence vectors and compute pairwise cosine similarity

# For each sentence, drop the first and last token vector (the slots taken by
# the special tokens added during encoding) and mean-pool what is left.
sentence_vectors = [
    np.mean(token_matrix[1:-1], axis=0)
    for token_matrix in vectors
]

# square matrix: entry (a, b) is the cosine similarity of sentences a and b
similarity_matrix = cosine_similarity(np.asarray(sentence_vectors))
print(similarity_matrix)

[[0.99999994 0.67447144 0.5205359  ... 0.81180084 0.5860615  0.6518429 ]
 [0.67447144 1.         0.44003856 ... 0.7094057  0.5768237  0.61316013]
 [0.5205359  0.44003856 1.0000001  ... 0.59455526 0.4362641  0.53515774]
 ...
 [0.81180084 0.7094057  0.59455526 ... 0.9999999  0.607898   0.75644267]
 [0.5860615  0.5768237  0.4362641  ... 0.607898   1.0000001  0.6873381 ]
 [0.6518429  0.61316013 0.53515774 ... 0.75644267 0.6873381  1.0000002 ]]


# Clustering with KMeans algorithm

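Here I cluster the sentences with KMeans, using each sentence's cosine similarity to every other sentence (its row of the similarity matrix) as its feature vector. Silhouette statistics are printed per cluster, sorted from best to worst, and the 10 best clusters are saved to a file for further inspection.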

In [426]:
# cluster with KMeans

# fit and predict clusters
# Each sentence is represented by its row of similarity_matrix.
num_clusters = 50
# Fixed seed: KMeans initialisation is random, so without random_state the
# cluster ids change on every run and any later reference to a specific
# cluster number silently points at a different group of sentences.
km = KMeans(n_clusters=num_clusters, random_state=42)
km.fit(similarity_matrix)

# print cluster statistics
sample_silhouette_values = silhouette_samples(similarity_matrix, km.labels_)
print("Silhouette values:")

# one (cluster id, size, mean, min, max) tuple per cluster
silhouette_values = []
for i in range(num_clusters):
    cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
    silhouette_values.append((
        i,
        cluster_silhouette_values.shape[0],
        cluster_silhouette_values.mean(),
        cluster_silhouette_values.min(),
        cluster_silhouette_values.max(),
    ))

# highest average silhouette first
silhouette_values = sorted(silhouette_values, key=lambda tup: tup[2], reverse=True)
for s in silhouette_values:
    print(f"Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}")

Silhouette values:
Cluster 44: Size:18 | Avg:0.92 | Min:0.22 | Max: 0.96
Cluster 3: Size:14 | Avg:0.50 | Min:0.10 | Max: 0.66
Cluster 16: Size:8 | Avg:0.26 | Min:0.06 | Max: 0.40
Cluster 2: Size:20 | Avg:0.22 | Min:0.02 | Max: 0.39
Cluster 47: Size:33 | Avg:0.22 | Min:0.03 | Max: 0.39
Cluster 31: Size:8 | Avg:0.21 | Min:-0.00 | Max: 0.32
Cluster 25: Size:19 | Avg:0.18 | Min:-0.01 | Max: 0.34
Cluster 23: Size:8 | Avg:0.14 | Min:0.02 | Max: 0.27
Cluster 43: Size:15 | Avg:0.13 | Min:-0.08 | Max: 0.34
Cluster 15: Size:9 | Avg:0.13 | Min:-0.00 | Max: 0.31
Cluster 46: Size:19 | Avg:0.12 | Min:0.02 | Max: 0.23
Cluster 38: Size:15 | Avg:0.12 | Min:0.04 | Max: 0.20
Cluster 5: Size:26 | Avg:0.12 | Min:-0.01 | Max: 0.28
Cluster 29: Size:36 | Avg:0.12 | Min:0.03 | Max: 0.25
Cluster 7: Size:59 | Avg:0.10 | Min:0.00 | Max: 0.24
Cluster 36: Size:12 | Avg:0.10 | Min:-0.05 | Max: 0.30
Cluster 24: Size:45 | Avg:0.10 | Min:-0.01 | Max: 0.25
Cluster 9: Size:28 | Avg:0.09 | Min:-0.02 | Max: 0.21
Cluster 42

In [427]:
# print full cluster overview

# One row per sentence: original text, cleaned text and the cluster id KMeans
# assigned to it (the three lists are index-aligned).
clusters = km.labels_.tolist()
clustered_articles = dict(
    raw_sents=raw_sents,
    clean_sents=clean_sents,
    cluster=clusters,
)
overview = pd.DataFrame(clustered_articles)
overview.head(10)

Unnamed: 0,raw_sents,clean_sents,cluster
0,The correct way to do it is via an OCS Account...,correct way ocs account takeover email consent...,24
1,My friend is without internet we need to play ...,friend internet need play videogame skill dimi...,24
2,"I have my phone number and email , that 's it .",phone number email,23
3,How did I get equipment and service ?,equipment service,39
4,I 'm literally trying to pay and nobody can fi...,literally try pay find,8
5,Thank you for resolving my issue so quickly ! !,thank resolve issue quickly,34
6,Y’all are the best ☺ ️ # fanforlife .,y all good fanforlife,42
7,So frustrated with 😡 Ordered dinner on Saturda...,frustrated order dinner saturday app,34
8,Order was wrong AND they charged my credit car...,order wrong charge credit card twice,37
9,Pretty much explained my issue in the quoted t...,pretty explain issue quote tweet drag image ...,37


In [462]:
# print selection of instances per cluster for inspection

cluster_id = 25  # id of the cluster to inspect; change this to look at another cluster
df = overview.loc[overview['cluster'] == cluster_id]
df.head(20)

Unnamed: 0,raw_sents,clean_sents,cluster
38,iPhone 7 Plus 😊 .,iphone plus,25
262,# mobile_CareXI,mobilecarexi,25
401,Which I do n't get since I know we have espnew...,know espnew home,25
472,how is Amazon going to solve this ?,amazon go solve,25
532,# mobile_CareXI,mobilecarexi,25
619,Now we ca n’t get the account back on the xbox .,nt account xbox,25
709,Loosing Photoshop subscription ..,loose photoshop subscription,25
723,My internet SUCKS ! ! ! !,internet suck,25
762,Internet up and down for days in Texas ..,internet day texas,25
817,I am watching CBS all access right now .,watch cbs access right,25


In [430]:
# write the currently inspected cluster (df) to an Excel file -- disabled
# NOTE(review): the file name refers to cluster 35; confirm it matches the
# cluster held in df before re-enabling this line.
# df.to_excel('results/cluster35providedetailsinvestigate.xlsx')