In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which data files are present in the input directory
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

This kernel gives a quick demonstration of a **text mining example** on the Consumer Financial complaints data published on the Consumer Financial Protection Bureau (CFPB) website.

This kernel utilizes the text attribute - Consumer complaint narrative - to answer one simple question: **what are the top 10 key complaint words that the top 3 credit bureau agencies (Equifax, Experian, and TransUnion) received?**

Extracting those keywords might help financial agencies - credit bureaus in this case, and especially their compliance departments - to better target any potential risk or issue and eventually control the risk.

**The kernel can be broken into three parts:**

1. count the word and get the words' frequency
2. calculate the ratio (between word frequency of individual agency and word frequency of the entire complaint lists) and get the words with the highest ratio value
3. improve the ratio by excluding common words related to the companies' names

There are 555,957 complaints (records) and 18 features (variables), but in this kernel I will only be using the text attribute - Consumer complaint narrative. After excluding the rows with a missing consumer complaint narrative, we are left with 66,806 complaints (records).

Let's get started!

In [None]:
#Load library
import pandas as pd
import numpy as np
import re
from collections import Counter

In [None]:
# For display purposes
class display(object):
    """Render several named objects side by side in a notebook cell.

    Every positional argument is a *string* holding a Python expression
    (normally just a variable name such as ``'EQU_df'``). The expression is
    evaluated when the instance is rendered, and the string itself is used
    as the caption of its panel.

    NOTE(review): the expressions are resolved with ``eval``. That is fine
    for hard-coded variable names typed into a cell, but never pass text
    that comes from a file, a user or the network.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Keep the expression strings; they are evaluated lazily on render.
        self.args = args

    @staticmethod
    def _resolve(expression):
        """Evaluate one expression string against this module's globals."""
        return eval(expression, globals())

    @staticmethod
    def _as_html(obj):
        """Return ``obj``'s HTML repr, or an escaped ``<pre>`` of ``repr(obj)``.

        Objects such as lists or Counters have no ``_repr_html_``; without
        the fallback, rendering them would raise AttributeError.
        """
        import html

        rich_repr = getattr(obj, '_repr_html_', None)
        rendered = rich_repr() if callable(rich_repr) else None
        if rendered is None:
            # Either no rich repr exists or it declined to produce one.
            return '<pre>{0}</pre>'.format(html.escape(repr(obj)))
        return rendered

    def _repr_html_(self):
        """Build one floating ``<div>`` per expression for notebook display."""
        return '\n'.join(
            self.template.format(expr, self._as_html(self._resolve(expr)))
            for expr in self.args
        )

    def __repr__(self):
        """Plain-text fallback: each expression followed by its ``repr``."""
        return '\n\n'.join(
            expr + '\n' + repr(self._resolve(expr))
            for expr in self.args
        )

In [None]:
# Read the full CFPB complaints table from the input directory
complaints_path = '../input/consumer_complaints.csv'
df = pd.read_csv(complaints_path)

In [None]:
# Take a look at the first five observations in the dataset
df.head(n=5)

In [None]:
# Report the (rows, columns) dimensions of the complaints table
n_rows_cols = df.shape
print('Complain data set shape: ', n_rows_cols)

Complain data set shape:  (555957, 18)
There are 555,957 complaints (records) and 18 features (variables).

After we get a sense of the dataset, let's dive into the text attribute - consumer complaint narrative.

First, take a look at some complaint narrative examples:

In [None]:
# First few complaints that actually carry a narrative text
df['consumer_complaint_narrative'].dropna().head()

 **Step 1: count the word frequency**

In [None]:
# Rows that carry a complaint narrative; computed once and reused below
has_narrative = df['consumer_complaint_narrative'].notnull()

# Subset the data by company (narrative rows only)
EQU = df[has_narrative & (df['company'] == 'Equifax')]
EXP = df[has_narrative & (df['company'] == 'Experian')]
TRU = df[has_narrative & (df['company'] == 'TransUnion Intermediate Holdings, Inc.')]
TOTAL = df[has_narrative]

# Take a look at how many complaints relate to each company
for subset, label in (
    (EQU, 'complaints related to Equifax'),
    (EXP, 'complaints related to Experian'),
    (TRU, 'complaints related to TransUnion'),
    (TOTAL, 'complaints in Total'),
):
    print(len(subset), label)

In [None]:
# Pull each subset's narratives out into a plain Python list
NARRATIVE = 'consumer_complaint_narrative'
EQU_lt, EXP_lt, TRU_lt, TOTAL_lt = (
    subset[NARRATIVE].tolist() for subset in (EQU, EXP, TRU, TOTAL)
)

# Create empty Counter objects for the counting in the next step
EQU_counts, EXP_counts, TRU_counts, TOTAL_counts = (
    Counter(), Counter(), Counter(), Counter()
)

In [None]:
# Loop over all the words in the complaints and add up the counts
def count_word(complaints, word_counts):
    """Tally every word of every complaint into ``word_counts``.

    Parameters
    ----------
    complaints : iterable of str
        The complaint narratives to scan.
    word_counts : collections.Counter
        Updated in place (any mapping that defaults missing keys to 0
        works). Nothing is returned.

    A word is a maximal run of word characters as split by the regular
    expression; counting is case-sensitive.
    """
    for complaint in complaints:
        for word in re.split(r'\W+', complaint):
            # Splitting yields '' when the text starts or ends with
            # punctuation; that empty token is not a word, so skip it.
            if word:
                word_counts[word] += 1

In [None]:
# Start from empty counters so that re-running this cell does not pile the
# same complaints on top of the previous tallies
EQU_counts = Counter()
EXP_counts = Counter()
TRU_counts = Counter()
TOTAL_counts = Counter()

# Count the words in each company's complaint list
count_word(EQU_lt, EQU_counts)
count_word(EXP_lt, EXP_counts)
count_word(TRU_lt, TRU_counts)
count_word(TOTAL_lt, TOTAL_counts)

# Extract the 10 most common words used in each company's complaints;
# most_common returns (word, count) pairs, most frequent first
EQU_counts_10 = EQU_counts.most_common(10)
EXP_counts_10 = EXP_counts.most_common(10)
TRU_counts_10 = TRU_counts.most_common(10)
TOTAL_counts_10 = TOTAL_counts.most_common(10)


# Convert to single-column dataframes for display
EQU_df = pd.DataFrame({'most 10 common (EQU)':EQU_counts_10})
EXP_df = pd.DataFrame({'most 10 common (EXP)':EXP_counts_10})
TRU_df = pd.DataFrame({'most  10 common (TRU)':TRU_counts_10})
Total_df = pd.DataFrame({'most 10 common (Total)':TOTAL_counts_10})

display('EQU_df', 'EXP_df', 'TRU_df', 'Total_df')

As the side-by-side comparison above shows for the top 10 common words used in the different companies' complaints and in the entire complaint list, common words like "the" rank at the top for Equifax, Experian, TransUnion, and the entire complaint list.

Instead of finding the common words in the Equifax, Experian or TransUnion complaints, what we really want are the words that show up far more often in one company's complaints than in the total complaint list. In other words, which complaint key words are uniquely concentrated on this company?

**Step2: Calculate the frequent ratio**

To accomplish this, we'll need to calculate the word usage ratio between an individual company and the entire list.

Use "the" as an example,

$$FreqRatio = \frac{Count_{EQU}['the']}{(Count_{TOTAL}['the'] + 1)}$$
Note: the "+ 1 " here is added in case the TOTAL_counts for some words is zero.

By dividing the company-specific count of a word by the total count of the same word, we can let the company's unique complaint key words stand out, and suppress the importance of common words like "the".

Let's calculate the ratio!

In [None]:
def calculate_ratio(word_counts, ratios, total_counts=None):
    """Fill ``ratios`` with each word's share of the overall usage.

    For every word in ``word_counts`` this stores
    ``word_counts[word] / (total_counts[word] + 1)`` in ``ratios``.

    Parameters
    ----------
    word_counts : mapping of str to int
        Word frequencies for one company.
    ratios : mutable mapping
        Updated in place with one float per word; nothing is returned.
    total_counts : mapping of str to int, optional
        Word frequencies over the whole complaint list. When omitted, the
        notebook-level ``TOTAL_counts`` is used, as before.
    """
    if total_counts is None:
        # Backward-compatible default: the global tally over all complaints.
        total_counts = TOTAL_counts
    # Snapshot the items so the loop is unaffected if the two mappings alias.
    for word, count in list(word_counts.items()):
        # "+ 1" keeps the denominator non-zero for words missing from the total.
        ratios[word] = count / float(total_counts.get(word, 0) + 1)

In [None]:
# Fresh Counter objects that will hold each company's word ratios
EQU_ratios, EXP_ratios, TRU_ratios = Counter(), Counter(), Counter()

# Fill each ratio Counter from the matching company's word counts
calculate_ratio(EQU_counts, EQU_ratios)
calculate_ratio(EXP_counts, EXP_ratios)
calculate_ratio(TRU_counts, TRU_ratios)

# The 10 words with the highest ratio per company, as (word, ratio) pairs
EQU_top_ratios = EQU_ratios.most_common(10)
EXP_top_ratios = EXP_ratios.most_common(10)
TRU_top_ratios = TRU_ratios.most_common(10)

# Wrap them in single-column dataframes for the side-by-side view
EQU_df = pd.DataFrame({'most_common (EQU)': EQU_top_ratios})
EXP_df = pd.DataFrame({'most_common (EXP)': EXP_top_ratios})
TRU_df = pd.DataFrame({'most_common (TRU)': TRU_top_ratios})

display('EQU_df', 'EXP_df', 'TRU_df')

As the side-by-side comparison above shows, this time we start to see some differentiation. Unfortunately, the biggest differentiation in the complaints is the company name, which is again not what we are really interested in; we want to find the real financial service issues in those complaints!

But we are already very close to the answer we are looking for; we just need one more step - leave out the words related to company names.

To accomplish this, we'll skip the counting when a word is a company-related name.

**Step 3: improve the ratio**

One simple way is to say that any time we see 'Equifax', for example, we skip counting the word, so that this word automatically has a zero frequency and does not show up in our most common list.

**However, as we can see in the result above, customers misspelled a lot when they wrote their complaints. So excluding just 'Equifax' is not going to get us what we want.**

**Two ways of dealing with it:**
1. manually summarize the misspelled pattern
2. using the library FuzzyWuzzy to implement a fuzzy matching

The example below demonstrates how fuzzywuzzy works. Basically, it calculates a distance (called the Levenshtein distance) to measure the difference between two sequences - in our case, two words - and turns it into a similarity score: the higher the score, the more similar the two words are.

In [None]:
# Illustrate how fuzzywuzzy.process scores a word against a list of choices
from fuzzywuzzy import process

match = ['equifax']
misspelled1, misspelled2 = 'Exquifaax', 'Exclude'

# A misspelling of the company name ...
fuzzy_score1 = process.extract(misspelled1, match)
print(fuzzy_score1)

# ... versus an unrelated word that merely starts with the same letter
fuzzy_score2 = process.extract(misspelled2, match)
print(fuzzy_score2)

Back to our example: we certainly want to exclude the words with a high fuzzy score, which are the misspelled company names.

**The pros and cons for the two approach I mentioned above:**

* manual summarizing is fast, but can't exclude the fuzzy words
* fuzzy matching accurately excludes those fuzzy words, but the code can take a very long time to run, since it has to compute the fuzzy score for every word in the complaint list.

Therefore, a mixed approach that combines the first and second options can be more reasonable and leads to a faster and more accurate result. The mixed approach is: if a word in the Equifax complaints contains a certain string like "eq", that word is far more likely to be a misspelled 'Equifax'.

**Therefore:**

* if a condition like containing “eq” is met, the fuzzy matching function will be called.
* if the condition is not met, simply count the word without any other processing.
* when the fuzzy matching function is called, if the fuzzy_score is at least 80, skip the counting, since the word is probably a misspelled 'Equifax', which we don't care about that much

For the implementation, everything else stays the same as shown previously; we only revise the count_word function, since it needs to follow our new logic.

In [None]:
# Loop over all the words in the complaints and add up the counts,
# leaving out words that look like a (possibly misspelled) company name
def count_word_new(word_lt, word_cnt, c_name_int, c_name, threshold=80):
    """Count words like ``count_word`` but skip company-name look-alikes.

    Parameters
    ----------
    word_lt : iterable of str
        The complaint narratives to scan.
    word_cnt : collections.Counter
        Updated in place; nothing is returned.
    c_name_int : str
        Lower-case fragment (e.g. ``"eq"``). Only words whose lower-cased
        form contains it are sent to the fuzzy matcher.
    c_name : list of str
        Company name(s) handed to ``process.extract`` as the choices.
    threshold : int, optional
        A pre-filtered word whose best fuzzy score is at least this value
        is treated as the company name and not counted. Defaults to 80.
    """
    # word -> bool verdict, so each distinct word is fuzzy-scored only once
    # no matter how often it occurs across the complaints.
    looks_like_name = {}
    for text in word_lt:
        # filter(None, ...) drops the '' tokens produced at text boundaries.
        for word in filter(None, re.split(r'\W+', text)):
            if c_name_int in word.lower():
                if word not in looks_like_name:
                    # [0][1] is the score of the best-matching choice.
                    best_score = process.extract(word, c_name)[0][1]
                    looks_like_name[word] = best_score >= threshold
                if looks_like_name[word]:
                    continue
            word_cnt[word] += 1

In [None]:
# Empty Counters for the name-filtered word counts ...
EQU_counts2, EXP_counts2, TRU_counts2 = Counter(), Counter(), Counter()

# ... and for the ratios derived from them
EQU_ratios2, EXP_ratios2, TRU_ratios2 = Counter(), Counter(), Counter()

# Count with count_word_new, giving each company its own pre-filter
# fragment and the name to fuzzy-match against
count_word_new(EQU_lt, EQU_counts2, "eq", ["Equifax"])
count_word_new(EXP_lt, EXP_counts2, "expe", ["Experian"])
count_word_new(TRU_lt, TRU_counts2, "tran", ["TransUnion"])

# Turn the filtered counts into ratios
calculate_ratio(EQU_counts2, EQU_ratios2)
calculate_ratio(EXP_counts2, EXP_ratios2)
calculate_ratio(TRU_counts2, TRU_ratios2)

# The 10 highest-ratio words per company, wrapped for side-by-side display
EQU_top_ratios2 = EQU_ratios2.most_common(10)
EXP_top_ratios2 = EXP_ratios2.most_common(10)
TRU_top_ratios2 = TRU_ratios2.most_common(10)

EQU_df = pd.DataFrame({'most_common (EQU)': EQU_top_ratios2})
EXP_df = pd.DataFrame({'most_common (EXP)': EXP_top_ratios2})
TRU_df = pd.DataFrame({'most_common (TRU)': TRU_top_ratios2})

display('EQU_df', 'EXP_df', 'TRU_df')

The misspelled company names decreased a lot, although not all of them are gone!

**Based on this complaint list, we can summarize**

* Equifax's complaints are mostly concentrated in TrustedID, Intruders, segmentation, re alleges, cyber attack and 2013correct.
* Experian's complaints are mostly concentrated in Geographical, Credit Works, free credit report, Inquiry, Delinquency.
* TransUnion's complaints are mostly concentrated in 3rd party info, Libellant, LLCConsumer, Inquiry, Financing.

That's it. 😃

We answered our question - what are the top 10 key complaint words that the top 3 credit bureau agencies (Equifax, Experian, and TransUnion) received?

Hopefully, this kernel demonstrates the power of text mining, even in a simple form, in helping compliance departments in the financial industry gain insights into potential risks based on customer text data.

Appendix link for the dataset: https://www.consumerfinance.gov/data-research/consumer-complaints/

Special thanks for the display code from: https://jakevdp.github.io/PythonDataScienceHandbook