In [1]:
import pandas as pd
import numpy as np

from gensim.utils import simple_preprocess
import csv
import fasttext
from sklearn.metrics import classification_report,accuracy_score

In [2]:
# Load the training split; the first CSV column is used as the row index.
train_data = pd.read_csv('train_dataset_multiclass.csv', index_col=0)

# Size of the split and the class balance of the raw (untransformed) labels.
print(train_data.shape, end="\n\n")
print(train_data['Issue'].value_counts())

# A label has to be a single whitespace-free token for fastText: join the words of the
# issue with underscores and put the '__label__' prefix in front of the result.
train_data['Issue'] = train_data['Issue'].map(
    lambda issue: '__label__' + "_".join(issue.split())
)

# Tokenise every narrative with gensim's simple_preprocess and glue the tokens back
# together with single spaces.
train_data['Consumer complaint narrative'] = train_data['Consumer complaint narrative'].map(
    lambda narrative: " ".join(simple_preprocess(narrative))
)

train_data.head()

(5908, 2)

Problem with a purchase shown on your statement    1993
Incorrect information on your report               1191
Managing an account                                 935
Fees or interest                                    904
Other features, terms, or problems                  885
Name: Issue, dtype: int64


Unnamed: 0,Consumer complaint narrative,Issue
57519,the citibank credit card company is ridiculous...,__label__Problem_with_a_purchase_shown_on_your...
2790,hi went to local bar on the evening of xx xx x...,__label__Problem_with_a_purchase_shown_on_your...
54277,transferred xxxx thankyou points citibank to x...,"__label__Other_features,_terms,_or_problems"
587,was in xxxx xxxx and used my citibank credit c...,__label__Problem_with_a_purchase_shown_on_your...
12420,clicked promotion on citibank website that sta...,__label__Managing_an_account


In [3]:
# Load the test split; the first CSV column is used as the row index.
test_data = pd.read_csv('test_dataset_multiclass.csv', index_col=0)

# Size of the split and the class balance of the raw (untransformed) labels.
print(test_data.shape, end="\n\n")
print(test_data['Issue'].value_counts())

# A label has to be a single whitespace-free token for fastText: join the words of the
# issue with underscores and put the '__label__' prefix in front of the result.
test_data['Issue'] = test_data['Issue'].map(
    lambda issue: '__label__' + "_".join(issue.split())
)

# Tokenise every narrative with gensim's simple_preprocess and glue the tokens back
# together with single spaces.
test_data['Consumer complaint narrative'] = test_data['Consumer complaint narrative'].map(
    lambda narrative: " ".join(simple_preprocess(narrative))
)

test_data.head()

(2532, 2)

Problem with a purchase shown on your statement    854
Incorrect information on your report               510
Managing an account                                401
Fees or interest                                   388
Other features, terms, or problems                 379
Name: Issue, dtype: int64


Unnamed: 0,Consumer complaint narrative,Issue
17357,back in we noticed charge on our macy credit c...,__label__Fees_or_interest
6524,on xx xx xxxx purchased ticket xxxx xxxx for x...,__label__Problem_with_a_purchase_shown_on_your...
23689,hope all is well the reason why im writing is ...,__label__Fees_or_interest
6960,citibank had requested some xxxx documents ear...,__label__Managing_an_account
7204,on xx xx authorized payment of amount to pay o...,__label__Managing_an_account


In [4]:
# Write each split to a plain-text file for fastText: one example per line, the
# narrative first and its label last, separated by a space.
# Quoting is switched off (QUOTE_NONE); the escape character is itself a space, so
# anything the csv writer has to escape is simply preceded by an extra space.
fasttext_csv_options = dict(
    index=False,
    sep=' ',
    header=None,
    quoting=csv.QUOTE_NONE,
    quotechar="",
    escapechar=" ",
)
export_columns = ['Consumer complaint narrative', 'Issue']

for split_frame, split_path in (
    (train_data, 'train_multiclass.txt'),
    (test_data, 'test_multiclass.txt'),
):
    split_frame[export_columns].to_csv(split_path, **fasttext_csv_options)

In [5]:
# Fit a supervised fastText classifier on the exported training file.
# wordNgrams=2 is the only setting overridden; everything else stays at the library default.
# NOTE(review): the evaluation cells further down record ~0.48 accuracy with one class never
# predicted, so the default training settings look underfit here - worth tuning, TODO confirm.
model = fasttext.train_supervised(input='train_multiclass.txt', wordNgrams=2)

In [6]:
# Score the trained model against the whole exported test file and show the result.
# NOTE(review): per the fastText docs the tuple should be (examples, precision@1, recall@1) - confirm.
test_file_metrics = model.test('test_multiclass.txt')
test_file_metrics

(2532, 0.4830173775671406, 0.4830173775671406)

In [7]:
# Gather the cleaned test narratives in a plain Python list and classify them in one call.
test_narratives = test_data['Consumer complaint narrative'].values.tolist()
predictions = model.predict(test_narratives)

In [8]:
# model.predict hands back a (labels, probabilities) pair; only the labels are needed here.
# The isinstance guard keeps this cell idempotent: once `predictions` has been replaced by the
# flat label array, re-running the cell must not index into it again (predictions[0] would then
# be a single label string and the array would collapse to one element, breaking the metrics).
if isinstance(predictions, tuple):
    predictions = np.array(predictions[0]).flatten()

# Ground-truth labels in their '__label__...' form, in the same row order the narratives
# were passed to model.predict.
target = test_data['Issue'].to_numpy()

In [9]:
# Per-class precision / recall / F1 on the test split.
# A class that is never predicted has an undefined precision (0/0). zero_division=0 reports it
# as 0 explicitly instead of relying on the implicit fallback, which emits an
# UndefinedMetricWarning for every affected average; the reported numbers are unchanged.
print(classification_report(target, predictions, zero_division=0))

                                                          precision    recall  f1-score   support

                               __label__Fees_or_interest       0.56      0.10      0.17       388
           __label__Incorrect_information_on_your_report       0.47      0.67      0.55       510
                            __label__Managing_an_account       0.84      0.10      0.19       401
             __label__Other_features,_terms,_or_problems       0.00      0.00      0.00       379
__label__Problem_with_a_purchase_shown_on_your_statement       0.47      0.93      0.63       854

                                                accuracy                           0.48      2532
                                               macro avg       0.47      0.36      0.31      2532
                                            weighted avg       0.47      0.48      0.38      2532



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Overall fraction of test narratives whose predicted label equals the true label.
test_accuracy = accuracy_score(y_true=target, y_pred=predictions)
test_accuracy

0.4830173775671406