In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['database.sqlite', 'consumer_complaints.csv']


In [2]:
# keras model
# for text classification, with multiple classes (multi-class)
# but single label
# character-level tokenization
# with fixed length input
# based on convolutional layers

# using customer complaints dataset
# we classify a narrative text about an issue into a product category
# https://www.kaggle.com/cfpb/us-consumer-finance-complaints

# See also
# https://www.kaggle.com/kadhambari/multi-class-text-classification
# https://www.kaggle.com/anucool007/multi-class-text-classification-bag-of-words

In [3]:
import keras

Using TensorFlow backend.


In [4]:
# utility functions for later

def dict_to_csv(d, path):
    df = pd.DataFrame.from_dict(d, orient='index')
    df.to_csv(path)

In [5]:
# load dataset
df = pd.read_csv('../input/consumer_complaints.csv', usecols=('product', 'consumer_complaint_narrative'))

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
print(len(df))
df.head()[:5]

555957


Unnamed: 0,product,consumer_complaint_narrative
0,Mortgage,
1,Mortgage,
2,Credit reporting,
3,Student loan,
4,Debt collection,


In [7]:
# remove nan's
df = df.dropna() # drop row if have nan in any column
print(len(df))
df.head()[:5]

66806


Unnamed: 0,product,consumer_complaint_narrative
190126,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...
190135,Consumer Loan,Due to inconsistencies in the amount owed that...
190155,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
190207,Mortgage,I have an open and current mortgage with Chase...
190208,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...


In [8]:
# encode product

In [9]:
# this turns each string into a number (most popular are lowest)
product_encoding = pd.factorize(df['product'])
print(product_encoding)

(array([0, 1, 2, ..., 7, 2, 2]), Index(['Debt collection', 'Consumer Loan', 'Mortgage', 'Credit card',
       'Credit reporting', 'Student loan', 'Bank account or service',
       'Payday loan', 'Money transfers', 'Other financial service',
       'Prepaid card'],
      dtype='object'))


In [10]:
labels, index = product_encoding
print(labels) # encoding for each product in the dataset
print(index) # index -> string map

[0 1 2 ... 7 2 2]
Index(['Debt collection', 'Consumer Loan', 'Mortgage', 'Credit card',
       'Credit reporting', 'Student loan', 'Bank account or service',
       'Payday loan', 'Money transfers', 'Other financial service',
       'Prepaid card'],
      dtype='object')


In [11]:
# build label <-> index maps to use later
product_to_id = {name: i for i, name in enumerate(index)}
id_to_product = {i: name for i, name in enumerate(index)}
print(product_to_id)
print(id_to_product)
print(len(index)) # number of classes

{'Debt collection': 0, 'Consumer Loan': 1, 'Mortgage': 2, 'Credit card': 3, 'Credit reporting': 4, 'Student loan': 5, 'Bank account or service': 6, 'Payday loan': 7, 'Money transfers': 8, 'Other financial service': 9, 'Prepaid card': 10}
{0: 'Debt collection', 1: 'Consumer Loan', 2: 'Mortgage', 3: 'Credit card', 4: 'Credit reporting', 5: 'Student loan', 6: 'Bank account or service', 7: 'Payday loan', 8: 'Money transfers', 9: 'Other financial service', 10: 'Prepaid card'}
11


In [12]:
dict_to_csv(product_to_id, 'labels_index.csv')

In [13]:
# note that the classes are imbalanced
for product in index:
    print(product, len(df.loc[df['product'] == product]))

Debt collection 17552
Consumer Loan 3678
Mortgage 14919
Credit card 7929
Credit reporting 12526
Student loan 2128
Bank account or service 5711
Payday loan 726
Money transfers 666
Other financial service 110
Prepaid card 861


In [14]:
# one-hot encode
y = keras.utils.to_categorical(labels)
print(len(y))
print(y[0], labels[0])

66806
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 0


In [15]:
# encode input with character level tokenization and embeddings

from keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=None, # don't limit number of characters
                lower=False, # don't lower
                char_level=True, # character-level tokenization
                oov_token='<OOV>', # token for unknown characters
                                   # FIXME: multi-character token but should be single char?
               )

In [16]:
texts = df['consumer_complaint_narrative'].values
print(texts[:2])

['XXXX has claimed I owe them {$27.00} for XXXX years despite the PROOF of PAYMENT I sent them : canceled check and their ownPAID INVOICE for {$27.00}! \nThey continue to insist I owe them and collection agencies are after me. \nHow can I stop this harassment for a bill I already paid four years ago? \n'
 'Due to inconsistencies in the amount owed that I was told by M & T Bank and the amount that was reported to the credit reporting agencies, I was advised to write a good will letter in order to address the issue and request the negative entry be removed from my credit report all together. I had a vehicle that was stolen and it was declared a total loss by insurance company. The insurance company and the GAP insurancw companypaid the outstanding balance of the loan, but I was told by M & T Bank that there was still a balance due on the loan. In good faith, without having received any proof as to why there was still a balance, I made a partial payment towards the remaining debt. I then 

In [17]:
tok.fit_on_texts(texts)

In [18]:
x = tok.texts_to_sequences(texts)
print(x[:2])

[[10, 10, 10, 10, 2, 12, 5, 11, 2, 15, 14, 5, 8, 16, 3, 13, 2, 26, 2, 6, 21, 3, 2, 4, 12, 3, 16, 2, 54, 51, 53, 74, 23, 29, 29, 55, 2, 20, 6, 9, 2, 10, 10, 10, 10, 2, 18, 3, 5, 9, 11, 2, 13, 3, 11, 19, 8, 4, 3, 2, 4, 12, 3, 2, 46, 40, 38, 38, 44, 2, 6, 20, 2, 46, 31, 64, 42, 33, 39, 30, 2, 26, 2, 11, 3, 7, 4, 2, 4, 12, 3, 16, 2, 71, 2, 15, 5, 7, 15, 3, 14, 3, 13, 2, 15, 12, 3, 15, 28, 2, 5, 7, 13, 2, 4, 12, 3, 8, 9, 2, 6, 21, 7, 46, 31, 26, 45, 2, 26, 39, 69, 38, 26, 35, 33, 2, 20, 6, 9, 2, 54, 51, 53, 74, 23, 29, 29, 55, 66, 2, 32, 30, 12, 3, 18, 2, 15, 6, 7, 4, 8, 7, 17, 3, 2, 4, 6, 2, 8, 7, 11, 8, 11, 4, 2, 26, 2, 6, 21, 3, 2, 4, 12, 3, 16, 2, 5, 7, 13, 2, 15, 6, 14, 14, 3, 15, 4, 8, 6, 7, 2, 5, 22, 3, 7, 15, 8, 3, 11, 2, 5, 9, 3, 2, 5, 20, 4, 3, 9, 2, 16, 3, 23, 2, 32, 48, 6, 21, 2, 15, 5, 7, 2, 26, 2, 11, 4, 6, 19, 2, 4, 12, 8, 11, 2, 12, 5, 9, 5, 11, 11, 16, 3, 7, 4, 2, 20, 6, 9, 2, 5, 2, 24, 8, 14, 14, 2, 26, 2, 5, 14, 9, 3, 5, 13, 18, 2, 19, 5, 8, 13, 2, 20, 6, 17, 9, 2, 18, 3,

In [19]:
df = pd.DataFrame(x)
df.to_csv('x_data.csv')

In [20]:
y = np.array(y)
np.savetxt('y_data.csv', y, fmt="%d", delimiter=",")

In [21]:
# create word index to use later

# word -> index map
word_index = tok.word_index
word_index['<PAD>'] = 0 # set unused index to padding token
print(word_index['<PAD>'], word_index['<OOV>'], word_index[' '])

# index -> word map
reversed_word_index = {v:k for k, v in word_index.items()}
print(reversed_word_index[0], reversed_word_index[1], reversed_word_index[2])

0 1 2
<PAD> <OOV>  


In [22]:
dict_to_csv(word_index, 'word_index.csv')

In [23]:
def vectorized_to_tokens(sample):
    return [reversed_word_index.get(num, '<OOV>') for num in sample]
    
def tokens_to_string(tokens):
    return ''.join(tokens)

In [24]:
print(len(x), len(y))

66806 66806


In [25]:
# pad to fixed length

In [26]:
# find a good length to pad to
lengths = [len(sample) for sample in x]
print(len(lengths))
print(lengths[0])

66806
299


In [27]:
p = np.percentile(lengths, 95)
print(p)

3193.0


In [28]:
maxlen = int(p)
print(maxlen)

3193


In [29]:
from keras.preprocessing.sequence import pad_sequences

x = pad_sequences(x,
                  padding='post',
                  truncating='post',
                  value=word_index['<PAD>'],
                  maxlen=maxlen,
                )
print(len(x))
print(len(x[0]))

66806
3193


In [30]:
# split train and test data

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

print(len(x_train), len(x_test))
print(len(y_train), len(y_test))

33403 33403
33403 33403


In [31]:
print(tokens_to_string(vectorized_to_tokens(x_test[0])))
print(id_to_product[np.argmax(y_test[0])])

I have been battling with portfolio recovery and Foster, Garbus & Garbus for over a year regarding a debt that is not mine. I continue receiving letters from Foster, Garbus XXXX Garbus regarding same debt although I 've submitted documents to Foster, Garbus & Garbus proving that I DO NOT OWE said debt. These guys went as far as having my XXXX XXXX account frozen last year and I thought the issue was resolved once I submitted my documents. 
<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><

In [32]:
# define model

from keras.models import Sequential
from keras.layers import (
    Embedding,
    Conv1D,
    Dense,
    MaxPooling1D,
    AveragePooling1D,
    Flatten,
    GlobalAveragePooling1D,
    Dropout,
)

model = Sequential([
    Embedding(len(word_index), 8, input_length=maxlen),
    Conv1D(128, 15, activation='relu'),
    Dropout(0.2),
    MaxPooling1D(2),
    Conv1D(128, 10, activation='relu'),
    Dropout(0.2),
    AveragePooling1D(2),
    Conv1D(128, 5, activation='relu'),
    Dropout(0.2),
    MaxPooling1D(2),
    GlobalAveragePooling1D(),
    Dense(32, activation='relu'),
    Dense(11, activation='softmax'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3193, 8)           816       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 3179, 128)         15488     
_________________________________________________________________
dropout_1 (Dropout)          (None, 3179, 128)         0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1589, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1580, 128)         163968    
_________________________________________________________________
dropout_2 (Dropout)          (None, 1580, 128)         0         
_________________________________________________________________
average_pooling1d_1 (Average (None, 790, 128)          0         
__________

In [33]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [34]:
# train

epochs = 47
batch_size = 512

history = model.fit(x, 
                    y, 
                    epochs=epochs, 
                    batch_size=batch_size,
                    verbose=2, 
                    validation_split=0.3,
                   )

# baseline: 1/11 ~= 0.1 accuracy with random guessing

Train on 46764 samples, validate on 20042 samples
Epoch 1/47
 - 32s - loss: 1.8898 - acc: 0.3192 - val_loss: 1.8835 - val_acc: 0.3178
Epoch 2/47
 - 28s - loss: 1.7228 - acc: 0.3783 - val_loss: 1.7772 - val_acc: 0.3734
Epoch 3/47
 - 28s - loss: 1.6462 - acc: 0.4060 - val_loss: 1.7345 - val_acc: 0.3787
Epoch 4/47
 - 28s - loss: 1.6082 - acc: 0.4186 - val_loss: 1.7285 - val_acc: 0.3829
Epoch 5/47
 - 28s - loss: 1.5499 - acc: 0.4397 - val_loss: 1.6215 - val_acc: 0.4213
Epoch 6/47
 - 28s - loss: 1.4896 - acc: 0.4604 - val_loss: 1.5544 - val_acc: 0.4566
Epoch 7/47
 - 28s - loss: 1.4121 - acc: 0.4882 - val_loss: 1.5276 - val_acc: 0.4759
Epoch 8/47
 - 28s - loss: 1.3569 - acc: 0.5093 - val_loss: 1.4624 - val_acc: 0.5010
Epoch 9/47
 - 28s - loss: 1.3058 - acc: 0.5326 - val_loss: 1.4180 - val_acc: 0.5188
Epoch 10/47
 - 28s - loss: 1.2667 - acc: 0.5527 - val_loss: 1.3598 - val_acc: 0.5362
Epoch 11/47
 - 28s - loss: 1.2349 - acc: 0.5682 - val_loss: 1.3686 - val_acc: 0.5476
Epoch 12/47
 - 28s - los

In [35]:
# test
print(model.evaluate(x_test, y_test))

[0.6686020720609237, 0.7996886507253484]


In [36]:
model.save('keras_text_model_multiclass.h5')