In [1]:
#importing packages
from collections import Counter, defaultdict
import math
import json
import numpy as np
import os
import pandas as pd
import re
import pydotplus
from sklearn import tree, preprocessing
from sklearn_pandas import DataFrameMapper, cross_val_score
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import csr_matrix
from IPython.display import Image  
#import urllib.request
#import zipfile


In [2]:
# Read the input
file = "reformatted1.csv"
d = pd.read_csv(file, na_values = "null")
# the consumer dataset is now a Pandas DataFrame
# Only interested in data with consumer complaints
d = d[d['Consumer complaint narrative'].notnull()]

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
print(d['Consumer complaint narrative'])

205       Capitol One 360 keeps changing what I owe on m...
4148      I requested all XXXX reports. I got through th...
4727      I received a forberance on my loans last sprin...
4805      As of Tuesday; XXXX XXXX; 2017; TransUnion is ...
5616      We sold our XXXX home on XX/XX/2016 our lender...
13895     Re : STELLAR RECOVERY; INC ( Collection compan...
14898     There has been a collection added to my report...
15385     I was a victim of identity theft and noticed a...
15853     I built my home in XXXX and had a Countrywide ...
56784     Received Capital One charge card offer XXXX. A...
56841     I do n't know how they got my cell number. I t...
56891     I 'm a longtime member of Charter One Bank/RBS...
56901     After looking at my credit report; I saw a col...
56905     I received a call from a XXXX XXXX from XXXX @...
56917     Was not contacted 4 years later about some pri...
56918     Collection Consultants is reporting a collecti...
56921     I had my purse stolen in 2007.

In [4]:
print(d.columns.values)
print(d.head())

['sid' 'id' 'position' 'created_at' 'created_meta' 'updated_at'
 'updated_meta' 'meta' 'Date received' 'Product' 'Sub-product' 'Issue'
 'Sub-issue' 'Consumer complaint narrative' 'Company public response'
 'Company' 'State' 'ZIP code' 'Tags' 'Consumer consent provided?'
 'Submitted via' 'Date sent to company' 'Company response to consumer'
 'Timely response?' 'Consumer disputed?' 'Complaint ID']
         sid                                    id  position  created_at  \
205   950855  ABB1397D-4223-455D-8B0E-84A3A96A9F5D    950855  1487697354   
4148  951134  F88FDCA7-8A77-4D7B-861D-5F70AE14D65E    951134  1487697356   
4727  951135  B6FB6CBA-54CB-4BB7-BA46-2AA71CD68962    951135  1487697356   
4805  951136  EDD8A582-379F-486F-9D91-DE9DAEBF979C    951136  1487697356   
5616  951137  513A2B17-783C-4138-BE94-5AEEE55597C9    951137  1487697356   

      created_meta  updated_at  updated_meta  meta        Date received  \
205         912605  1487697354        912605   NaN  2016-12-02T23:07:

In [5]:
def tokenize_string(my_string):
    """ DONE. You should use this in your tokenize function.
    """
    #return re.findall('[\w\-]+', my_string.lower())
    #\W -> Matches any non-alphanumeric character; 
    #this is equivalent to the class [^a-zA-Z0-9_]. 
    
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    
    no_numbers = my_string.lower().translate({ord(ch): None for ch in '0123456789'})
    word_list = re.sub('\W+', ' ', no_numbers).split()
    stop = set(stopwords.words('english'))
    output = []
    
    for word in word_list:
        if word != "xxxx" and word not in stop:
            word = stemmer.stem(word)
            if len(word) > 2:
                output.append(word)
            
    word_list = output
    
    # preprocessing ->
   
    # remove too low and too high frequency words -> we can't calculate here

    return(word_list)

In [6]:
def tokenize(data):
    """
    Append a new column to the movies DataFrame with header 'tokens'.
    This will contain a list of strings, one per token, extracted
    from the 'genre' field of each movie. Use the tokenize_string method above.
    Note: you may modify the movies parameter directly; no need to make
    a new copy.
    Params:
      movies...The movies DataFrame
    Returns:
      The movies DataFrame, augmented to include a new column called 'tokens'.
    >>> movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
    >>> movies = tokenize(movies)
    >>> movies['tokens'].tolist()
    [['horror', 'romance'], ['sci-fi']]
    """
    ###TODO
    
    # step 1 -> do tokenize_string for each row in movies['genres']

    all_words = []
    for row in data['Consumer complaint narrative']:
        #genre_list = re.sub(r'[||)|(]', r' ',row.lower()).split()
        word_list = tokenize_string(row)
        #print(word_list)
        #print(len(genre_list))
        all_words.append(word_list)
    
    # step 2 -> add column tokens in movies
    array = np.array(all_words)
    
    #print(array[:5])
    #print('#list = ',len(array))
    
    new_data = d.assign(tokens = array)
    
    #print(new_movies.head(5))
    return(new_data)  

In [7]:
def featurize(data):
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term, as
    defined in class:
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i
    Params:
      movies...The movies DataFrame
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int. Make sure the vocab is sorted alphabetically as in a2 (e.g., {'aardvark': 0, 'boy': 1, ...})
   
    """
    ###TODO
    #print(movies[:5]) 
    
    #step 1 -> build a vocab and df(term)
    vocab = {}
    vocab_list = []
    df = {}
    
    for tokenization in data['tokens']:
        tokens = list(set(tokenization))
        for term in tokens:
            if term not in vocab.keys():
                vocab.setdefault(term,-1)
             
            if term not in df.keys(): 
                df.setdefault(term,1)
            else :
                df[term] += 1
             
             
    #print('vocab = ', vocab)
    
    vocab_list = sorted(vocab.keys(), key = lambda x:x)
    #print('vocab_list = ', vocab_list)
    
    for i,term in enumerate(vocab_list):
         vocab[term] = i
            
    #        
         
    #print('Sorted vocab = ', sorted(vocab.items()))
    #print('df = ',sorted(df.items(), key=lambda x:x[0]))
    
    # step 2 -> Build a csr_matrix for each row of movies['tokens']
   
    #print('N = ',N)
    
    #[comedy, comedy, comedy, horror]  -> max_k tf(k, d) = 3 
    #[action, comedy,thriller] -> tf(action, d) =1
    # df(i) ->
    #num_features is the total number of unique features across all documents.
    
    N = len(data)
    
    csr_array =[]
    
    for row1 in data['tokens']:
        csr_row = []
        csr_col = []
        csr_data = []
        max_k = 0
       
        max_k = Counter(row1).most_common()[:1][0][1]
        row = list(set(row1))

        #print('removed duplicates =',row)
        for term in row:       
            csr_row.append(0)
            csr_col.append(vocab[term])
            #tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
            tf = Counter(row1)[term]
            #max_k = max_k.most_common()[:1][0][1]
         
            #print('term = %s ---> tf = %d ---> max_k = %d'%(term,tf,max_k))
            tfidf = (tf / max_k) * math.log10(N/df[term])
            csr_data.append(tfidf)
           
         
        #print('csr_row = ',csr_row) 
        #print('csr_col = ',csr_col)
        #print('csr_data=',csr_data)
        X = csr_matrix((csr_data, (csr_row, csr_col)), shape=(1, len(vocab)), dtype=np.float64)
       
        #print('X ->\n',X.toarray())
        csr_array.append(X)
    

    # step 3 -> add column features to movies 
    #print('size of csr_array = ',len(csr_array)) 
    #print('CSR = ',csr_array[:2])  
    new_data = data.assign(features = csr_array)
    #print(new_movies.head(2))
     
    return(new_data,vocab)   
    
    pass

In [8]:
data = tokenize(d)

In [9]:
print(data.head(20)['tokens'])
print(data.tail(20)['tokens'])

205      [capitol, one, keep, chang, owe, loan, also, c...
4148     [request, report, got, process, fine, got, rep...
4727     [receiv, forber, loan, last, spring, saw, rece...
4805     [tuesday, transunion, still, know, illeg, repo...
5616     [sold, home, lender, vanderbilt, mortgag, toda...
13895    [stellar, recoveri, inc, collect, compani, ref...
14898    [collect, report, bureaus, belong, collect, co...
15385    [victim, ident, theft, notic, collect, credit,...
15853    [built, home, countrywid, home, loan, husband,...
56784    [receiv, capit, one, charg, card, offer, appli...
56841    [know, got, cell, number, told, would, deal, o...
56891    [longtim, member, charter, one, bank, rbs, cit...
56901    [look, credit, report, saw, collect, account, ...
56905    [receiv, call, ext, state, owe, want, howev, w...
56917    [contact, year, later, privat, loan, suppos, t...
56918    [collect, consult, report, collect, account, c...
56921    [purs, stolen, never, found, person, respons, .

In [10]:
#create new features
class_vector = pd.factorize(data['Company response to consumer'])
data['class_vector'] = pd.Series(class_vector[0])
data['combined_product'] = data.apply(lambda x: "%s-%s" % (x['Product'], x['Sub-product']), axis = 1)
data['combined_issue'] = data.apply(lambda x: "%s-%s" % (x['Issue'], x['Sub-issue']), axis = 1)
print(data["class_vector"].head())
print(data["combined_product"].head())
print(data["combined_issue"].head())


205     1.0
4148    1.0
4727    0.0
4805    2.0
5616    0.0
Name: class_vector, dtype: float64
205                      Consumer Loan-Vehicle loan
4148                           Credit reporting-nan
4727    Student loan-Federal student loan servicing
4805                           Credit reporting-nan
5616           Mortgage-Conventional fixed mortgage
Name: combined_product, dtype: object
205                      Taking out the loan or lease-nan
4148    Unable to get credit report/credit score-Probl...
4727    Dealing with my lender or servicer-Having prob...
4805    Incorrect information on credit report-Account...
5616         Loan servicing; payments; escrow account-nan
Name: combined_issue, dtype: object


In [13]:
decision_tree_data = data.ix[:, ['combined_product', 'combined_issue']]
clf = tree.DecisionTreeClassifier(max_depth=3)
product_enc = preprocessing.OneHotEncoder()
issue_enc = preprocessing.OneHotEncoder()
x = DataFrameMapper([
        ('combined_product', product_enc), 
        ('combined_issue',issue_enc)
    ])
x_1 = x.fit_transform(decision_tree_data)
x_1

ValueError: could not convert string to float: 'Student loan-Non-federal student loan'

In [12]:

clf = clf.fit( x_1, y = data['Company response to consumer'].values)

ValueError: could not convert string to float: 'Student loan-Non-federal student loan'

In [None]:
dot_data = tree.export_graphviz(clf, out_file=None,   
                         class_names=data['Company response to consumer'].values,  
                         filled=True, rounded=True,  
                         special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
data, vocab = featurize(data)

In [None]:
print('vocab:')
print(sorted(vocab.items())[:500])

In [None]:
print(len(vocab))