# Chatbot Conversations From Customer Service Scripts

In [1]:
import numpy as np
import pandas as pd
import sys, os, re, itertools, collections, string, time
from io import BytesIO
from collections import Counter
from time import time
import datetime
   


In [2]:
# https://catalog.data.gov/dataset/consumer-complaint-database 
complaints_df_raw = pd.read_csv("consumer_complaints.csv", 
                usecols=('Product','Consumer complaint narrative', 'Sub-issue'),
                dtype={'consumer_complaint_narrative': object})
# Only interested in data with consumer complaints
complaints_df_raw=complaints_df_raw[complaints_df_raw['Consumer complaint narrative'].notnull()]
complaints_df_raw=complaints_df_raw[complaints_df_raw['Product'].notnull()]

# remove XXXX from narratives
complaints_df_raw['Consumer complaint narrative'] =  complaints_df_raw['Consumer complaint narrative'].replace({'X':''}, regex=True)

# always seed your random generators for reporducilibity 
complaints_df_raw = complaints_df_raw.sample(200000, replace=False, random_state=1)

# basic sentence prep
# set to lower
complaints_df_raw['Consumer complaint narrative'] = complaints_df_raw['Consumer complaint narrative'].str.lower()
# remove special characters
complaints_df_raw['Consumer complaint narrative'] = complaints_df_raw['Consumer complaint narrative'].str.replace('\W', ' ')

# remove elements with no text
complaints_df_raw= complaints_df_raw[complaints_df_raw['Consumer complaint narrative'] != '']

# any dups
complaints_df_raw = complaints_df_raw.drop_duplicates(subset=['Consumer complaint narrative'])

In [3]:
complaints_df_raw.head()

Unnamed: 0,Product,Sub-issue,Consumer complaint narrative
332635,Bank account or service,,wells fargo 2015 took 16000 00 from my cr...
992324,"Credit reporting, credit repair services, or o...",Credit inquiries on your report that you don't...,computer base system pre approve me for a cred...
902590,Debt collection,Debt is not yours,i have a dept on my credit report which is not...
618944,Credit reporting,Information is not mine,upon getting my credit report i noted an addr...
834643,"Credit reporting, credit repair services, or o...",Information belongs to someone else,i never consented to be a co signer account d...


# Clean Up Data

In [26]:
complaints_df = complaints_df_raw.copy()

In [27]:
word_similarity=complaints_df['Consumer complaint narrative'].str.split(' ').map(Counter)
word_similarity_ratio = []
complaints_df.shape 

(194410, 3)

In [28]:
for wu in word_similarity:
    word_similarity_ratio.append(np.sum([x[1] for x in wu.items()])/np.float(len(wu)))
    
complaints_df['narrative_similarity_ratio'] = word_similarity_ratio
complaints_df['narrative_similarity_ratio'].describe()

count    194410.000000
mean          2.114613
std           1.131822
min           1.000000
25%           1.636364
50%           1.990741
75%           2.432749
max         240.619048
Name: narrative_similarity_ratio, dtype: float64

In [29]:
# thin out some entries that contain too much duplicated lines within
complaints_df = complaints_df[complaints_df['narrative_similarity_ratio'] <= 1.7]
complaints_df.reset_index(drop=True,inplace=True)
complaints_df.shape

(57150, 4)

In [38]:
list(complaints_df['Consumer complaint narrative'])[0:4]

['computer base system pre approve me for a credit limit then decline my application',
 'i have a dept on my credit report which is not mine  i have spoke with the company a while ago and its still being reported  i do n t know what else to do but file a complaint ',
 'upon getting my credit report  i noted an address of     il on there with a possible loan store credit card and possibly more obtained attempted and possibly utilities use connection using my name  this address is was my ex husband and his new wife     and    they have tried this before in getting cell phones  i would like more information on what who they used my info for and what i can do about it please  these were noted on my transunion credit report and i am unable to get my   right away because it shows accounts i do not recognize on there ',
 'i never consented to be a co signer  account does not belong to me  i am a victim of fraud ']

# Get Key Verbs And Nouns

In [32]:
# find most common verbs and measure coverage 
import spacy
# Load English tokenizer, tagger, parser, NER and word vectors
#nlp = spacy.load('en')

# just load what we need to avoid taxing memory
nlp = spacy.load('en', parser=False, entity=False)


In [33]:
# create one big blob of text to process things a bit faster
blob_complaints = ''.join(list(complaints_df['Consumer complaint narrative']))

# Max text of length of 1000000
n = 900000
blog_chunks = [blob_complaints[i:i+n] for i in range(0, len(blob_complaints), n)]
len(blog_chunks)

20

In [34]:
just_verbs = []
just_nouns = []
counter_=len(blog_chunks)
for sentence in blog_chunks:
    counter_ -= 1
    if (counter_ % 10 == 0): print(counter_)
    print(counter_)
    doc = nlp(sentence.decode('utf-8'))
    temp_verb = []
    temp_noun = []
    for token in doc: 
        if (token.pos_ == u'VERB'): 
            temp_verb.append(token.text)
        if (token.pos_ == u'NOUN'):
            temp_noun.append(token.text)
            

    just_verbs.append(' '.join(temp_verb).encode('utf-8'))
    just_nouns.append(' '.join(temp_noun).encode('utf-8'))
    
    

19
18
17
16
15
14
13
12
11
10
10
9
8
7
6
5
4
3
2
1
0
0


In [35]:
just_verbs[0].split()[0:10]

['approve',
 'decline',
 'have',
 'is',
 'have',
 'spoke',
 'being',
 'reported',
 'do',
 'know']

In [36]:
just_nouns[0].split()[0:10]

['computer',
 'base',
 'system',
 'pre',
 'credit',
 'limit',
 'applicationi',
 'dept',
 'credit',
 'report']

In [37]:
print('count just_verbs: %i' % len(just_verbs))
print('count just_nouns: %i' % len(just_nouns))
    

count just_verbs: 20
count just_nouns: 20


In [39]:
# pickle both objects so you don't have to re-run spacy 
import pickle
pickle_file = "verbs_nouns.p"

overwrite_old_pickle = True
if overwrite_old_pickle:
    with open(pickle_file, "wb") as f:
        pickle.dump([just_verbs, just_nouns], f)
    
# read in saved pickle
with open(pickle_file, "rb") as f:
    backup_pos = pickle.load(f)

## Sorting Out Verbs

In [40]:
all_verbs = backup_pos[0]
len(all_verbs)

# append all verbs together so we can run frequency counts
verbs = []
for verb_set in all_verbs:
    verbs.append(verb_set.split())
    #verbs = [verb for verb in verb_set[0].split()]

len(verbs)
verbs_master = [val for sublist in verbs for val in sublist]
len(verbs_master)

762105

In [41]:
# what is your upper and lower cut offs?
from collections import Counter
verbs_df = pd.DataFrame(Counter([verb for verb in verbs_master]).most_common(), columns = ['verb', 'count'])
verbs_df.head(20)

Unnamed: 0,verb,count
0,have,49226
1,is,38867
2,was,36476
3,has,16444
4,are,16046
5,been,15056
6,be,13915
7,do,12502
8,had,10626
9,am,10541


In [43]:
len(verbs_df[verbs_df['count'] > 1000])
verbs_df = verbs_df[verbs_df['count'] > 1000]
len(verbs_df)

130

## Sorting Out Nouns

In [44]:
all_nouns = backup_pos[1]

# append all verbs together so we can run frequency counts
nouns = []
for noun_set in all_nouns:
    nouns.append(noun_set.split())

nouns_master = [val for sublist in nouns for val in sublist]
len(nouns_master)

755822

In [45]:
# what is your upper and lower cut offs?
from collections import Counter
nouns_df = pd.DataFrame(Counter([noun for noun in nouns_master]).most_common(), columns = ['noun', 'count'])
nouns_df.head()

Unnamed: 0,noun,count
0,credit,40323
1,account,23173
2,report,17342
3,debt,16744
4,information,14105


In [46]:
len(nouns_df[nouns_df['count'] > 1000])
nouns_df = nouns_df[nouns_df['count'] > 1000]
len(nouns_df)

148

## Binarize DataFrame With Official Verb & Noun List

In [47]:
# create new data frame with key verbs and nouns as features
key_words = list(nouns_df['noun']) + list(verbs_df['verb'])
row_bools = []
counter_ = len(complaints_df['Consumer complaint narrative'])
for sentence in complaints_df['Consumer complaint narrative']:
    counter_ -= 1
    if (counter_ % 10000 == 0): print(counter_)
    row_bool = []
    words = sentence.split()
    for kw in key_words:
        row_bool.append(kw in words)
    row_bools.append(row_bool)
    
print('length:', len(row_bools))
row_bools = pd.DataFrame(row_bools, columns=key_words)    
row_bools = row_bools.astype(int)
row_bools.shape

    

50000
40000
30000
20000
10000
0
('length:', 57150)


(57150, 278)

In [48]:
row_bools.head()

Unnamed: 0,credit,account,report,debt,information,company,loan,payment,bank,collection,...,show,tell,use,notified,receiving,feel,sending,shows,re,work
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


# Cluster of popular sentences

In [49]:
from sklearn.cluster import KMeans

TOTAL_CLUSTERS = 50

# Number of clusters
kmeans = KMeans(n_clusters=TOTAL_CLUSTERS)
# Fitting the input data
kmeans = kmeans.fit(row_bools)
# Getting the cluster labels
labels = kmeans.predict(row_bools)

# add cluster back to data frame 
row_bools['cluster'] = labels

row_bools['cluster'].value_counts().head()

36    2978
21    2224
45    2160
6     2010
8     1846
Name: cluster, dtype: int64

In [50]:
row_bools['cluster'].value_counts().head()

36    2978
21    2224
45    2160
6     2010
8     1846
Name: cluster, dtype: int64

In [51]:

# add cluster number back to orginal corpus
complaints_df['Cluster'] = labels
# import sys
# reload(sys)
# sys.setdefaultencoding('utf8')
import itertools
from collections import Counter
import nltk
from nltk.util import ngrams

unique_complaints_2grams = []
unique_complaints_3grams = []
unique_complaints_4grams = []
unique_complaints_5grams = []
unique_complaints_6grams = []
# loop through each cluster
for cluster_to_search in range(min(row_bools['cluster']), max(row_bools['cluster'])+1):
    # cluster-level research
    print('Cluster: %i' % cluster_to_search)
    df_tmp = complaints_df[complaints_df['Cluster']==cluster_to_search].copy()
    print('data cluster shape: %s' % len(df_tmp))
    
    bigrams = []
    trigrams = []
    fourgrams = []
    fivegrams = []
    sixgrams = []
    
    for index, row in df_tmp.iterrows(): 
        token = nltk.word_tokenize(row['Consumer complaint narrative'].decode('utf-8'))
        bigrams.append([' '.join(pair) for pair in list(ngrams(token,2)) if len(set(pair))==2])
        trigrams.append([' '.join(pair) for pair in list(ngrams(token,3)) if len(set(pair))==3])
        fourgrams.append([' '.join(pair) for pair in list(ngrams(token,4)) if len(set(pair))==4])
        fivegrams.append([' '.join(pair) for pair in list(ngrams(token,5)) if len(set(pair))==5])
        sixgrams.append([' '.join(pair) for pair in list(ngrams(token,6)) if len(set(pair))==6])
        
    bigrams = [val for sublist in bigrams for val in sublist]
    trigrams = [val for sublist in trigrams for val in sublist]
    fourgrams = [val for sublist in fourgrams for val in sublist]
    fivegrams = [val for sublist in fivegrams for val in sublist]
    sixgrams = [val for sublist in sixgrams for val in sublist]
    
    # find top x most popular grams per size
    # 2 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in bigrams]).most_common(50), columns=['bigrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_2grams.append(freqx)
    # 3 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in trigrams]).most_common(50), columns=['trigrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_3grams.append(freqx)
    # 4 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in fourgrams]).most_common(50), columns=['fourgrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_4grams.append(freqx)
    # 5 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in fivegrams]).most_common(50), columns=['fivegrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_5grams.append(freqx)
    # 6 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in sixgrams]).most_common(50), columns=['sixgrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_6grams.append(freqx)
 
  

Cluster: 0
data cluster shape: 850
Cluster: 1
data cluster shape: 605
Cluster: 2
data cluster shape: 99
Cluster: 3
data cluster shape: 858
Cluster: 4
data cluster shape: 775
Cluster: 5
data cluster shape: 1031
Cluster: 6
data cluster shape: 2010
Cluster: 7
data cluster shape: 1714
Cluster: 8
data cluster shape: 1846
Cluster: 9
data cluster shape: 1132
Cluster: 10
data cluster shape: 1122
Cluster: 11
data cluster shape: 775
Cluster: 12
data cluster shape: 1131
Cluster: 13
data cluster shape: 528
Cluster: 14
data cluster shape: 1034
Cluster: 15
data cluster shape: 1224
Cluster: 16
data cluster shape: 1641
Cluster: 17
data cluster shape: 1022
Cluster: 18
data cluster shape: 1089
Cluster: 19
data cluster shape: 712
Cluster: 20
data cluster shape: 671
Cluster: 21
data cluster shape: 2224
Cluster: 22
data cluster shape: 243
Cluster: 23
data cluster shape: 1232
Cluster: 24
data cluster shape: 1627
Cluster: 25
data cluster shape: 973
Cluster: 26
data cluster shape: 1145
Cluster: 27
data cluste

In [52]:
    df = pd.concat(unique_complaints_4grams)
    # freqx = pd.DataFrame(Counter([noun for noun in fourgrams]).most_common(50), columns=['fourgrams','frequency'])
    df = df.drop_duplicates(subset=['fourgrams'], keep=False)
    df.head()

Unnamed: 0,fourgrams,frequency,Cluster
20,please help me i,14,0
36,would be greatly appreciated,10,0
38,have a foreclosure sale,10,0
42,a loan modification with,9,0
46,and now i am,9,0


In [53]:
# find top x most popular grams per size
see_grams = 6


if see_grams==2:
    df = pd.concat(unique_complaints_2grams)
    df = df.drop_duplicates(subset=['bigrams'], keep=False)
elif see_grams==3:
    df = pd.concat(unique_complaints_3grams)
    df = df.drop_duplicates(subset=['trigrams'], keep=False)
elif see_grams==4:
    df = pd.concat(unique_complaints_4grams)
    df = df.drop_duplicates(subset=['fourgrams'], keep=False)
elif see_grams==5:
    df = pd.concat(unique_complaints_5grams)
    df = df.drop_duplicates(subset=['fivegrams'], keep=False)
elif see_grams==6:
    df = pd.concat(unique_complaints_6grams)
    df = df.drop_duplicates(subset=['sixgrams'], keep=False)
 
df = df.sort_values('Cluster')
df[df['frequency'] > 10]  



Unnamed: 0,sixgrams,frequency,Cluster
28,protocols in handling consumer information they,53,1
36,injured parties they are responsible for,52,1
35,by signing injured parties they are,52,1
34,signing injured parties they are responsible,52,1
33,risk of identity theft fraud due,52,1
29,are further attempting to capitalize on,53,1
31,situation by signing injured parties they,52,1
30,put myself and millions of others,52,1
37,equifaxsecurity2017 com they have put myself,51,1
32,in handling consumer information they are,52,1


## Tie It Back To Complaint

In [54]:
# tie it back to look into a couple of actual complaints
keywords = "attempting to collect a debt from"
 
for index, row in complaints_df.iterrows():
    txt = row['Consumer complaint narrative'] 
    if (keywords in txt):
        print(txt)
        print('------')
    
 

stellar recovery inc is attempting to collect a debt from me that i do n t owe them  i never established a contract with this company and they have reported negative items onto my  and  credit reports which is in violation of the fcra 
------
i received a message from global credit and collection company who said it was attempting to collect a debt i owe  i called the number back who said it was a non working number for   this same company left a message on  other phone numbers that have never been associated with any account i have  it was disclosed on answering machines voicemail that they were attempting to collect a debt from me 
------
trident asset management is attempting to collect a debt from me that i do n t owe them  i never established a contract with this company and they have reported negative items onto my  and  credit reports which is in violation of the fcra 
------
this company is attempting to collect a debt from me and has reported this debt to the credit bureaus  t