In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.13.1


Using TensorFlow backend.


In [7]:
df = pd.read_csv('Data/Consumer_Complaints.csv', encoding='latin-1')
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,05/24/2019,Debt collection,Medical debt,Attempts to collect debt not owed,Debt was paid,,,"ONLINE Information Services, Inc.",TN,38128.0,,,Web,05/24/2019,Closed with explanation,Yes,,3252932
1,05/24/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,,AMERICAN EXPRESS COMPANY,,,,,Web,05/24/2019,Closed with non-monetary relief,Yes,,3253201
2,05/24/2019,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,,,"Diversified Consultants, Inc.",NC,28347.0,,,Web,05/24/2019,In progress,Yes,,3252699
3,05/24/2019,Debt collection,Mortgage debt,Attempts to collect debt not owed,Debt was already discharged in bankruptcy and ...,,,"Designed Receivable Solultions, Inc.",CA,92376.0,,,Web,05/24/2019,Closed with explanation,Yes,,3252624
4,05/24/2019,Credit card or prepaid card,Store credit card,Closing your account,Can't close your account,,,"TIDEWATER FINANCE COMPANY, INC.",PA,15207.0,,,Web,05/24/2019,In progress,Yes,,3252406


In [8]:
col = ['Consumer complaint narrative','Product']
df = df[col]
df = df[pd.notnull(df['Consumer complaint narrative'])]
df.head()

Unnamed: 0,Consumer complaint narrative,Product
38665,I have reached out to All 3 credit bureaus and...,"Credit reporting, credit repair services, or o..."
39143,I am a victim of identity theft. My personal i...,"Credit reporting, credit repair services, or o..."
39177,I am a victim of identity theft. My personal i...,"Credit reporting, credit repair services, or o..."
39276,On all credit agencies the same accounts are n...,"Credit reporting, credit repair services, or o..."
39978,I have a student loan thats discharge and its ...,"Credit reporting, credit repair services, or o..."


In [9]:
df.isnull().sum()

Consumer complaint narrative    0
Product                         0
dtype: int64

In [7]:
df['Product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    92633
Debt collection                                                                 86841
Mortgage                                                                        53050
Credit reporting                                                                31588
Student loan                                                                    21839
Credit card or prepaid card                                                     21429
Credit card                                                                     18838
Bank account or service                                                         14885
Checking or savings account                                                     12898
Consumer Loan                                                                    9474
Vehicle loan or lease                                                            5762
Money transfer, virtual currency, or money service    

In [10]:
# Split data into train and test
train_size = int(len(df) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df) - train_size))

Train size: 307325
Test size: 76832


In [9]:
train_narrative = df['Consumer complaint narrative'][:train_size]
train_product = df['Product'][:train_size]

test_narrative = df['Consumer complaint narrative'][train_size:]
test_product = df['Product'][train_size:]

In [11]:
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [13]:
tokenize.fit_on_texts(train_narrative) # only fit on train
x_train = tokenize.texts_to_matrix(train_narrative)
x_test = tokenize.texts_to_matrix(test_narrative)

In [14]:
# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)

In [15]:
# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [16]:
# Inspect the dimensions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (307325, 1000)
x_test shape: (76832, 1000)
y_train shape: (307325, 18)
y_test shape: (76832, 18)


In [17]:
batch_size = 32
epochs = 5

In [18]:
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [19]:
history = model.fit(x_train, y_train,
                   batch_size=batch_size,
                   epochs=epochs,
                   verbose=1,
                   validation_split=0.1)

Train on 276592 samples, validate on 30733 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                      batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 1.3846809557704616
Test accuracy: 0.5345949604331528


In [22]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 

for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_narrative.iloc[i][:50], "...")
    print('Actual label:' + test_product.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

To Whom It May Concern, I just received verbal con ...
Actual label:Credit card
Predicted label: Credit reporting, credit repair services, or other personal consumer reports

My complaint has to do with Bank of America 's pol ...
Actual label:Bank account or service
Predicted label: Checking or savings account

This item has been disputed previously. The compla ...
Actual label:Credit reporting
Predicted label: Credit reporting, credit repair services, or other personal consumer reports

Less than minimum payments were being made to a cl ...
Actual label:Debt collection
Predicted label: Debt collection

M & T BANK purchased my mortgage loan for my home  ...
Actual label:Bank account or service
Predicted label: Mortgage

I submitted a loan modification package to my lend ...
Actual label:Mortgage
Predicted label: Mortgage

ShellPoint XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX ...
Actual label:Mortgage
Predicted label: Mortgage

I have been sued by Portfolio Recovery Associates. ...
Actual 