In [None]:
import sys
sys.path.append("..")
import src.utils.regex as regex
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import ktrain
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import hdbscan
! spacy download en_core_web_lg
import spacy

from nltk.stem.porter import PorterStemmer
STEMMER = PorterStemmer()
import nltk
from nltk.corpus import stopwords
from collections import Counter

A lot of entries contain questions about travel advice, often mentioning individual country names.
This meant the clusterer was grouping entries by country name, which wasn't ideal.
The same goes for months and other dates, so both are replaced with a generic placeholder before clustering.

In [None]:
# Loaded once at module level and reused by remove_common_terms for every call.
model = spacy.load('en_core_web_lg')


def remove_common_terms(text):
    """Swap the text of GPE and DATE entities for the entity label itself.

    The pipeline is run once on the incoming text; for each entity it reports
    with label "GPE" or "DATE", every occurrence of that entity's text in the
    string is replaced (via str.replace) by the label.

    Args:
        text: the string to process.

    Returns:
        The string with the matching entity texts replaced by "GPE" / "DATE".
    """
    masked_labels = ("GPE", "DATE")
    for entity in model(text).ents:
        if entity.label_ in masked_labels:
            text = text.replace(entity.text, entity.label_)
    return text

# Sanity check
remove_common_terms("to find out an update for my holiday in mexico in april")

Clean the data. There is a lot going on here; each step is explained in the code comments.

In [None]:
# Read in dataset
df = pd.read_csv("../data/raw/uis_ga_all_cols_20200301_20200327.csv")
q3 = "Q3"
# Keep the untouched response text next to the column that gets cleaned below
df['q3_copy'] = df[q3]

# One slug per line. The context manager closes the file handle, and blank
# lines (e.g. from a trailing newline) are skipped so that the empty string
# never ends up in the slug list and matches an empty page-sequence segment.
with open('../data/raw/coronavirus_page_slugs.txt') as slugs_file:
    corona_slugs = [line.strip() for line in slugs_file if line.strip()]
corona_related_items_regex = regex.coronavirus_misspellings_and_typos_regex() + '|sick pay|ssp|sick|isolation|closures|quarantine|closure|cobra|cruise|hand|isolat|older people|pandemic|school|social distancing|symptoms|cases|travel|wuhan|care|elderly|care home|carehome'

# These are terms that are functionally the same but people use different terms, this standardises them.
# Replacements are applied in insertion order, so longer phrases must stay
# ahead of the shorter phrases they contain (e.g. "self isolation" before "isolation").
same_terms = {
    "travelling": "travel",
    "travellers": "travel",
    "holiday": "travel",
    # NOTE: clean_text strips hyphens before the replacements run, so this
    # hyphenated key cannot match there; "selfisolation" below covers that case.
    "self-isolation": "quarantine",
    "selfisolation": "quarantine",
    "self isolation": "quarantine",
    "isolation": "quarantine",
    # Both the common misspelling and the correct spelling are standardised
    "statuatory sick pay": "ssp",
    "statutory sick pay": "ssp",
    "sick pay": "ssp",
}

def clean_text(text):
    """Normalise one free-text response before vectorising.

    Steps, in order: coerce to str, spell out "111"/"999", drop every character
    that is not a letter, whitespace or colon, replace entities via
    remove_common_terms, lower-case, strip coronavirus spellings and "virus",
    strip common "I want to know" lead-ins, then standardise the synonyms
    listed in same_terms.

    Args:
        text: the raw response; any value is accepted and passed through str().

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(text)
    # We'll be removing non alphabetical characters but we want to keep the non emergency phone number
    # '111' in, so we'll just replace that with text
    text = text.replace("111", "oneoneone")
    # Same for 999
    text = text.replace("999", "nineninenine")
    # Remove non alphabetical or space characters (raw string so \s is a regex
    # escape rather than an invalid Python string escape)
    text = re.sub(r"[^a-zA-Z\s:]", "", text)
    # Use our function from previous cell
    text = remove_common_terms(text)
    # This is done after remove_common_terms because spacy doesn't
    # always recognise country names without a capital letter at the beginning!
    text = text.lower()
    text = re.sub(regex.coronavirus_misspellings_and_typos_regex() + "|virus", "", text)
    # People using different terms for "I want to know", so just remove those
    text = re.sub("wanted to find out|to look up about|to get an update|to find infos|to find info|to find out|to understand|to read the|check on advice|to check|ti get advice|to get advice|for information on", "", text)
    for word_to_replace, word_to_replace_with in same_terms.items():
        # str.replace returns a new string; the result has to be assigned back
        # or the standardisation silently does nothing.
        text = text.replace(word_to_replace, word_to_replace_with)
    return text

# Run every response in the Q3 column through clean_text
df[q3] = df[q3].map(clean_text)

# Remove rows without a page sequence and renumber the index from zero
has_page_sequence = df['PageSequence'].notnull()
df = df.loc[has_page_sequence].reset_index(drop=True)

# We only want to cluster rows that are relevant to corona stuff
# so we have the column 'has_corona_page'
# It is only true if they have visted a corona page AND included a relevant term in the feedback
# (there was some irrelevant stuff about passports, we may want to remove the need for a relevant term
# as people may be using terms not in that list and we might miss out on some insights)
# NOTE(review): clean_text strips the coronavirus spellings out of Q3 before this
# point, so the coronavirus part of corona_related_items_regex presumably never
# matches here - confirm whether 'q3_copy' should be searched instead.
def _is_corona_feedback(feedback, page_sequence):
    """Return True when the feedback has a relevant term AND a corona page was visited.

    Args:
        feedback: cleaned Q3 text, searched with corona_related_items_regex.
        page_sequence: string of page slugs separated by ">>".

    Returns:
        bool - False as soon as the feedback has no relevant term; otherwise
        True if any slug is in corona_slugs or contains "coronavirus".
    """
    if re.search(corona_related_items_regex, feedback) is None:
        return False
    # any() stops at the first matching slug
    return any(
        slug in corona_slugs or "coronavirus" in slug
        for slug in page_sequence.split(">>")
    )

# Build the whole column in one go so it is a proper bool column, instead of
# growing it cell by cell inside an iterrows() loop.
df['has_corona_page'] = pd.Series(
    [
        _is_corona_feedback(feedback, page_sequence)
        for feedback, page_sequence in zip(df[q3], df['PageSequence'])
    ],
    index=df.index,
    dtype=bool,
)
df = df[df['has_corona_page']].reset_index(drop=True)

# Remove duplicate users
df = df.drop_duplicates('intents_clientID')

df.head()

In [None]:
def stem_tokens(tokens):
    """Return a list holding STEMMER.stem(token) for each token, in input order."""
    return list(map(STEMMER.stem, tokens))

def tokenize(text):
    """Split text with nltk.word_tokenize and return the stemmed tokens as a list."""
    return stem_tokens(nltk.word_tokenize(text))

In [None]:
# TF-IDF features over 1- to 3-grams of the stemmed tokens, capped at 100 features.
# The keyword is `ngram_range`; the previous `nngram_range` spelling is not a
# TfidfVectorizer argument and raises a TypeError.
# NOTE(review): the stop-word list is unstemmed while tokenize() stems its output,
# so some stop words may not be filtered - confirm whether that matters here.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    analyzer='word',
    stop_words=stopwords.words('english'),
    max_features=100,
    ngram_range=(1, 3),
)
# Dense array of the TF-IDF matrix, one row per remaining response
X = vectorizer.fit_transform(df[q3]).toarray()
clusterer = hdbscan.HDBSCAN(min_cluster_size=50, min_samples=2, cluster_selection_method='leaf')
clusterer.fit(X)

In [None]:
# This is helpful for debugging
# Newer scikit-learn releases drop get_feature_names() in favour of
# get_feature_names_out(); use whichever this installation provides and print
# a plain list of str either way.
_get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
print([str(name) for name in _get_feature_names()])

In [None]:
# labels_ holds the cluster each row was assigned to.
# probabilities_, quoting the library's own comment:
#   "Cluster membership strengths for each point. Noisy samples are assigned 0."
# Both are attached as columns and the rows are ordered strongest-first.
df = (
    df
    .assign(cluster=clusterer.labels_, probabilities=clusterer.probabilities_)
    .sort_values(by=['probabilities'], ascending=False)
)

In [None]:
# Rows labelled -1 were not placed in any cluster
unclustered_count = int((df['cluster'] == -1).sum())
total_count = len(df)
print(f"{unclustered_count} items out of {total_count} couldn't be put into a cluster")

def words_in_entry(entry):
    """Return the non-empty, non-stop-word pieces of a space-separated entry.

    Args:
        entry: string that is split on single spaces.

    Returns:
        A lazy filter iterator over the remaining words, in their original order.
    """
    # Build the stop-word lookup once per call rather than once per word
    stop_words = set(stopwords.words('english'))
    kept = [word for word in entry.split(" ") if word not in stop_words]
    # Splitting on " " produces empty strings for repeated spaces; drop them
    return filter(None, kept)
     
def most_common_words_in_cluster(cluster_entries):
    """Return the ten most frequent vectoriser features among a cluster's stemmed words.

    Each cleaned response in the q3 column is split on spaces, every piece is
    passed through tokenize(), and the resulting stems are counted when they
    are one of the fitted vectoriser's feature names.

    Args:
        cluster_entries: DataFrame holding the rows of one cluster; must have
            the q3 column.

    Returns:
        A list of up to ten (stem, count) tuples, as produced by
        Counter.most_common(10).
    """
    # Newer scikit-learn releases drop get_feature_names() in favour of
    # get_feature_names_out(); use whichever this installation provides.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_words = {str(name) for name in get_names()}
    common_words = []
    for sentence in cluster_entries[q3]:
        for raw_word in sentence.split(" "):
            for stem in tokenize(raw_word):
                # Stop words are not filtered here: the original author left an
                # untested `and word not in stopwords.words('english')` condition
                # disabled, and it stays disabled.
                if stem in feature_words:
                    common_words.append(stem)
    return Counter(common_words).most_common(10)


errors = []
# Sorting the distinct labels makes the report order deterministic; iterating a
# bare set gives no ordering guarantee.
for topic in sorted(set(clusterer.labels_)):
    # Topic -1 is the cluster for things that don't really have a home
    if topic > -1:
        # Boolean indexing already returns a new frame, so there is no need to
        # copy the whole DataFrame for every cluster.
        cluster_entries = df[df['cluster'] == topic].reset_index(drop=True)
        cluster_count = len(cluster_entries)
        print(cluster_count)
        if cluster_count > 0:
            print("")
            print("")
            print("")
            print("")
            print(f"New cluster with {cluster_count} entries")
            print(f"Most common words are: {most_common_words_in_cluster(cluster_entries)}")
            # Show up to three original (uncleaned) responses; head(3) avoids a
            # KeyError when a cluster has fewer than three rows.
            for rank, example in enumerate(cluster_entries['q3_copy'].head(3), start=1):
                print(f"{rank}. {example}")