# Imports and opening the file

In [38]:
# imports

import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')

import datapurifier as dp
from datapurifier import Mleda, Nlpurifier, NLAutoPurifier, MlReport, Nlpeda
from collections import Counter

from nltk.corpus import stopwords
StopWords = stopwords.words('english')

In [57]:
# open file in df

path = "data/data.csv"
full_data_df = pd.read_csv(path)
full_data_df.columns=["text"]
print(full_data_df.shape)

# create small df for testing
data_df = full_data_df.head(40000)
data_df.columns=["text"]

# print first 10 lines for inspection
data_df.head(10)

(2811773, 1)


Unnamed: 0,text
0,@sprintcare and how do you propose we do that
1,@sprintcare I have sent several private messag...
2,@115712 Please send us a Private Message so th...
3,@sprintcare I did.
4,@115712 Can you please send us a private messa...
5,@sprintcare is the worst customer service
6,@115713 This is saddening to hear. Please shoo...
7,@sprintcare You gonna magically change your co...
8,@115713 We understand your concerns and we'd l...
9,@sprintcare Since I signed up with you....Sinc...


In [97]:
# split costomer questions from employee replies

instance_list = data_df['text'].tolist()

customer_instances = []

for item in instance_list:
    if item[1].isalpha() == True:
        customer_instances.append(item)

In [100]:
joined_customer_instances = ' '.join(customer_instances)

# Pre-preprocessing

The dataset is too large for my computer to handle, but I did not want to lose valuable information by simply cutting it in half. So before I started preprocessing with Spacy and other heavy modules, I manually split the data into words and found the most frequent keywords. I extracted all instances that contain the found keywords to shorten the data. This step is also helpful to already create a dataset focused on a few common topics, which will make it easier for the clustering algorithm to find clusters. 

In [103]:
# shorten and precluster data by frequent keywords

# # open file
# path = "data/data.csv"
# with open(path, "r", encoding='utf-8') as infile:
#     raw_content = infile.read()
    
# preclean and split content
new_content = joined_customer_instances.replace('\n', '. ')
# new_content = raw_content.replace('\n', '. ')
new_content = new_content.replace('..', '.')
new_content = new_content.replace('  ', ' ')
new_content = new_content.replace('"', '')
split_content = new_content.split()

# get most frequent keywords
freq_counts = Counter(list(split_content))

# remove stopwords from frequency dict
for word in StopWords:
    if word in freq_counts:
        del freq_counts[word]
        
most_common = freq_counts.most_common(200)

# get 100 most common words in list for viewing
most_common_words = []
for item in most_common:
    most_common_words.append(item[0])

In [104]:
# view most common words
print(most_common_words)

['I', '@AmazonHelp', '@AppleSupport', 'get', '.', 'still', '@ChipotleTweets', 'service', '@British_Airways', '-', '&amp;', 'time', 'one', 'Thanks', 'got', '@Uber_Support', 'back', 'The', '@sainsburys', 'need', "I'm", '@Tesco', 'Thank', 'phone', 'like', '2', 'help', 'flight', 'know', '@AmericanAir', 'customer', 'It', 'would', '@O2', 'I’m', 'please', 'My', 'No', 'want', 'going', 'new', '@Delta', '@VirginTrains', 'account', 'email', 'call', 'getting', 'This', 'even', '@AskPlayStation', 'it’s', 'app', '@SouthwestAir', '@SpotifyCares', 'You', "can't", '@GWRHelp', '@Ask_Spectrum', 'How', '@AirAsiaSupport', 'u', 'order', '@idea_cares', 'Is', 'What', 'Can', 'day', "I've", 'Why', 'Just', 'see', 'So', '@VerizonSupport', 'last', 'sent', '@Safaricom_Care', '@hulu_support', 'go', 'already', 'check', '@XboxSupport', 'Hey', 'issue', 'update', 'Not', 'since', 'Please', 'iPhone', 'use', 'make', 'thanks', 'Hi', 'guys', 'And', '@SW_Help', 'trying', 'could', 'it.', 'says', 'days', 'number', 'said', '3', '

In [108]:
# define keywords to filter data before heavy preprocessing

keywords = ['phone', 'flight', 'account', 'order', 'issue', 'iphone', 'delivery', 'internet', 'card', 'working', 'train', 'refund']

# keywords = ['phone', 'email', 'account', 'number', 'team', 'order', 'issue', 'address', 'details',
#            'contact', 'flight', 'link', 'booking', 'baggage', 'call', 'cancel', 'delivery', 'app', 'update', 'iphone', 'refund', 'due']

In [109]:
# get instances into list
instance_list = data_df['text'].tolist()

# split costomer questions from employee replies
customer_instances = []

for item in instance_list:
    if item[1].isalpha() == True:
        customer_instances.append(item)

# filter instances by keyword
filtered_instances = []

for instance in customer_instances:
    split_instance = instance.split()
    for word in split_instance:
        if word.lower() in keywords:
            filtered_instances.append(instance)
            break

# join for further preprocessing            
joined_filtered_data = '. '.join(filtered_instances)
len(joined_filtered_data)

388596

# Preprocessing and cleaning

Now that we have a filtered, shortened dataset, we can process it with SpaCy and DataPurifier. After cleaning, the data is saved to use in the clustering pipelines. The following cleaning steps are performed:

- Lemmatize
- Remove stop words
- Remove special characters, numeric characters and punctuation
- Remove links and tokens starting with '@'
- Remove tokens starting with a '-' or '^' as those are employee names
- Strip leading, trailing and duplicate whitespaces
- Remove most frequent and least frequent words with a threshold of 10

In [110]:
# load filtered data in spacy

# shorten data to spacy max length
joined_filtered_data = joined_filtered_data[:100000]
doc = nlp(joined_filtered_data)

# get all spacy sentences in list
sents = []

for sent in doc.sents:
    sent_list = []
    for token in sent:
        token = str(token)
        # removing tokens that datapurifier will not remove
        if token.startswith('-') == False and token.startswith('^') == False and token.startswith('@') == False and token != " " and token != "" and token != '\n':
            sent_list.append(token)
    sent_list = ' '.join(sent_list)
    sents.append(sent_list)
    
# remove super short sentences
sentences = []
for item in sents:
    if len(item) > 3:
        sentences.append(item)
        
# get sentences in dataframe for further processing
d = {'sentences': sentences}
filtered_df = pd.DataFrame(d)
filtered_df.shape

(1595, 1)

In [111]:
# clean data with DataPurifier
cleaned_df = Nlpurifier(filtered_df, "sentences")

GridspecLayout(children=(Checkbox(value=False, description='Drop Null Rows', indent=False, layout=Layout(grid_…

[1m[34m
Convert Word to its Base Form[0m


interactive(children=(RadioButtons(description='Technique:', options=('None', 'Stemming', 'Lemmatization'), va…

[1m[31mRemove Top Common Words[0m


interactive(children=(Checkbox(value=False, description='Remove Top Common Words'), Output()), _dom_classes=('…

[1m[31mRemove Top Rare Words[0m


interactive(children=(Checkbox(value=False, description='Remove Top Rare Words'), Output()), _dom_classes=('wi…

Button(description='Start Purifying', style=ButtonStyle())

In [112]:
# preview cleaned df
cleaned_df = cleaned_df.df
cleaned_df['raw_sentences'] = sentences # add original sentences

cleaned_df.head(10)

Unnamed: 0,sentences,raw_sentences
0,correct way ocs account takeover email consent...,The correct way to do it is via an OCS Account...
1,friend internet need play videogame skill dimi...,My friend is without internet we need to play ...
2,,How ?
3,phone number email,"I have my phone number and email , that 's it ."
4,equipment service,How did I get equipment and service ?
5,literally try pay find,I 'm literally trying to pay and nobody can fi...
6,thank resolve issue quickly,Thank you for resolving my issue so quickly ! !
7,y all good fanforlife,Y’all are the best ☺ ️ # fanforlife .
8,frustrated order dinner saturday app,So frustrated with 😡 Ordered dinner on Saturda...
9,order wrong charge credit card twice,Order was wrong AND they charged my credit car...


In [114]:
# save filtered and cleaned df for use in other notebooks
cleaned_df.to_csv("data/cleaned_customer_dataframe.csv")