In [15]:
import pandas as pd
import numpy as np

import fasttext
from gensim.utils import simple_preprocess
import csv
import fasttext
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from gensim.models import KeyedVectors

In [2]:
data = pd.read_csv('../complaints-2021-11-16_03_13.csv')
data = data[['Consumer complaint narrative','Issue']]
data = data[~(data['Consumer complaint narrative'].isnull())]
data = data[~(data['Consumer complaint narrative']=='')]
data['Consumer complaint narrative'] = data['Consumer complaint narrative'].apply(lambda x:" ".join(simple_preprocess(x)))
print(data.shape)
data.head()

(24449, 2)


Unnamed: 0,Consumer complaint narrative,Issue
0,opened citi double cash card the beginning of ...,Getting a credit card
1,have paid off two citi cards on xx xx one for ...,Closing your account
3,this issue was identity theft resolved with ci...,Attempts to collect debt not owed
5,on xx xx xxxx received letter from citibank da...,Problem with a purchase shown on your statement
6,small business checking and savings corporate ...,Managing an account


In [3]:
data['Consumer complaint narrative'].to_csv('Embedding_data.txt', 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

In [4]:
model = fasttext.train_unsupervised('Embedding_data.txt', "cbow")

In [5]:
# Extracting Word Vectors from the Trained Fasttext Model

dimension = model.get_dimension()
words_list = model.get_words()

kv = KeyedVectors(vector_size= dimension)

vectors = [] 
for word in words_list: 
    vectors.append(model.get_word_vector(word))  

kv.add(words_list, vectors) 

kv.save_word2vec_format('Fasttext_trained_vectors.bin',binary=True)

In [6]:
train_data = pd.read_csv('train_dataset.csv',index_col=0)
print(train_data.shape,end="\n\n")
print(train_data['Issue'].value_counts())

train_data['Issue']= train_data['Issue'].apply(lambda x: "_".join(x.split()))
train_data['Consumer complaint narrative'] = train_data['Consumer complaint narrative'].apply(lambda x:" ".join(simple_preprocess(x)))
# Prefixing each row of the category column with '__label__'
train_data['Issue'] = train_data['Issue'].apply(lambda x: '__label__' + x)

train_data.head()

(3184, 2)

Problem with a purchase shown on your statement    1993
Incorrect information on your report               1191
Name: Issue, dtype: int64


Unnamed: 0,Consumer complaint narrative,Issue
554,on xx xx two charges were made at vendor calle...,__label__Problem_with_a_purchase_shown_on_your...
11901,citibank south dakota charge off frs file numb...,__label__Incorrect_information_on_your_report
17377,made purchaseson xxxx that was opened and xxxx...,__label__Problem_with_a_purchase_shown_on_your...
8681,on xxxx xx xx filed dispute with citibank conc...,__label__Problem_with_a_purchase_shown_on_your...
5644,on xx xx xxxx purchased xxxx xxxx xxxx concert...,__label__Problem_with_a_purchase_shown_on_your...


In [7]:
test_data = pd.read_csv('test_dataset.csv',index_col=0)
print(test_data.shape,end="\n\n")
print(test_data['Issue'].value_counts())

test_data['Issue']= test_data['Issue'].apply(lambda x: "_".join(x.split()))
test_data['Consumer complaint narrative'] = test_data['Consumer complaint narrative'].apply(lambda x:" ".join(simple_preprocess(x)))
# Prefixing each row of the category column with '__label__'
test_data['Issue'] = test_data['Issue'].apply(lambda x: '__label__' + x)

test_data.head()

(1364, 2)

Problem with a purchase shown on your statement    854
Incorrect information on your report               510
Name: Issue, dtype: int64


Unnamed: 0,Consumer complaint narrative,Issue
8201,hello cfpb can you please forward the folllowi...,__label__Incorrect_information_on_your_report
49162,the response from citi cards has cleared up lo...,__label__Problem_with_a_purchase_shown_on_your...
8206,have made repeated attempts with macys credit ...,__label__Problem_with_a_purchase_shown_on_your...
8226,paid through citibank credit card for two pair...,__label__Problem_with_a_purchase_shown_on_your...
57391,at best buys request am submitting documentati...,__label__Incorrect_information_on_your_report


In [8]:
# Saving the CSV file as a text file to train/test the classifier
train_data[['Consumer complaint narrative', 'Issue']].to_csv('train.txt', 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

test_data[['Consumer complaint narrative', 'Issue']].to_csv('test.txt', 
                                     index = False, 
                                     sep = ' ',
                                     header = None, 
                                     quoting = csv.QUOTE_NONE, 
                                     quotechar = "", 
                                     escapechar = " ")

In [9]:
# Training the fastText classifier with the saved pretrained vectors 
model = fasttext.train_supervised('train.txt',pretrainedVectors='Fasttext_trained_vectors.bin',wordNgrams = 2)

In [10]:
# Evaluating performance on the entire test file
model.test('test.txt')                      

(1364, 0.8797653958944281, 0.8797653958944281)

In [11]:
predictions = model.predict(test_data['Consumer complaint narrative'].values.tolist())

In [12]:
predictions = np.array(predictions[0]).flatten()
target = test_data['Issue'].to_numpy()

In [13]:
print(classification_report(target,predictions))

                                                          precision    recall  f1-score   support

           __label__Incorrect_information_on_your_report       0.92      0.75      0.82       510
__label__Problem_with_a_purchase_shown_on_your_statement       0.86      0.96      0.91       854

                                                accuracy                           0.88      1364
                                               macro avg       0.89      0.85      0.87      1364
                                            weighted avg       0.88      0.88      0.88      1364



In [14]:
accuracy_score(target,predictions)

0.8797653958944281

In [16]:
TP,FN,FP,TN = confusion_matrix(target,predictions).flatten()

print(TN,FP,FN,TP)

819 35 129 381


In [17]:
FPR = FP/(TN+FP)

FPR

0.040983606557377046