In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw complaints export, then keep only the rows whose narrative is
# present (not NaN) and is not the empty string.
data = pd.read_csv('../complaints-2021-11-16_03_13.csv')
narrative = data['Consumer complaint narrative']
has_narrative = narrative.notna() & narrative.ne('')
data = data.loc[has_narrative]
print(data.shape)

(24449, 18)


In [3]:
# Keep only the label column ('Issue') and the free-text narrative;
# every other column is dropped from `data` in place.
keep_columns = {'Issue', 'Consumer complaint narrative'}
data.drop(columns=[col for col in data.columns if col not in keep_columns], inplace=True)

In [4]:
# Preview the first five rows of the two remaining columns.
data.head(n=5)

Unnamed: 0,Issue,Consumer complaint narrative
0,Getting a credit card,I opened a citi double cash card the beginning...
1,Closing your account,1. I have paid off two Citi cards on XX/XX/202...
3,Attempts to collect debt not owed,this issue was identity theft resolved with ci...
5,Problem with a purchase shown on your statement,On XX/XX/XXXX I received a letter from Citiban...
6,Managing an account,Small business ( checking and savings ) ( corp...


In [5]:
# Number of distinct issue labels (NaN is not counted).
data['Issue'].nunique(dropna=True)

130

In [6]:
# Complaints per issue label, most frequent first.
data.loc[:, 'Issue'].value_counts()

Problem with a purchase shown on your statement    2847
Incorrect information on your report               1701
Managing an account                                1336
Fees or interest                                   1292
Other features, terms, or problems                 1264
                                                   ... 
Can't contact lender                                  2
Shopping for a line of credit                         1
Can't stop charges to bank account                    1
Account terms and changes                             1
Problem adding money                                  1
Name: Issue, Length: 130, dtype: int64

In [11]:
# The five most frequent issue labels become the target classes.
# value_counts() orders labels by descending frequency, so the first five
# index entries are the five largest classes.
issue_counts = data['Issue'].value_counts()
classes = issue_counts.index[:5].tolist()
classes

['Problem with a purchase shown on your statement',
 'Incorrect information on your report',
 'Managing an account',
 'Fees or interest',
 'Other features, terms, or problems']

In [12]:
# Restrict the dataset to the selected target classes.
filtered_data = data[data['Issue'].isin(classes)]
print(filtered_data.shape)

# Put the text column first and the label second.
filtered_data = filtered_data[['Consumer complaint narrative', 'Issue']]

# Shuffle all rows. A fixed random_state makes the shuffle identical on every
# run, so the notebook reproduces under Restart & Run All.
filtered_data = filtered_data.sample(frac=1, random_state=42)
print(filtered_data.shape)
filtered_data.head()

(8440, 2)
(8440, 2)


Unnamed: 0,Consumer complaint narrative,Issue
28041,"My Credit card, BEST BUY VISA. payments are du...",Fees or interest
11355,There was a fraudulent charge of {$160.00} to ...,Problem with a purchase shown on your statement
14446,THERE IS AN UNKNOWN 30 DAY LATE REPORTING FALS...,Incorrect information on your report
10674,"On XX/XX/XXXX, i accepted an offer from CitiBa...","Other features, terms, or problems"
16258,My father is over XXXX years old. He has a jo...,Managing an account


In [18]:
# Stratified 70/30 split of narratives (X) and issue labels (y).
# stratify keeps the class proportions the same in both partitions;
# a fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    filtered_data['Consumer complaint narrative'],
    filtered_data['Issue'],
    test_size=0.3,
    stratify=filtered_data['Issue'],
    random_state=42,
)

print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(5908,) (5908,)
(2532,) (2532,)


In [21]:
# Stratified 70/30 split of the whole frame (text + label together).
# A fixed random_state makes the split, and therefore anything derived from
# train_data / test_data, reproducible across runs.
train_data, test_data = train_test_split(
    filtered_data,
    test_size=0.3,
    stratify=filtered_data['Issue'],
    random_state=42,
)

print(train_data.shape)
print(test_data.shape)

(5908, 2)
(2532, 2)


In [22]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,Consumer complaint narrative,Issue
57519,The Citibank Credit Card company is a ridiculo...,Problem with a purchase shown on your statement
2790,Hi I went to a local bar on the evening of XX/...,Problem with a purchase shown on your statement
54277,I transferred XXXX ThankYou Points ( Citibank ...,"Other features, terms, or problems"
587,I was in XXXX XXXX and used my CITIBANK credit...,Problem with a purchase shown on your statement
12420,I clicked a promotion on citibank 's website t...,Managing an account


In [23]:
# Preview the first five test rows.
test_data.head(n=5)

Unnamed: 0,Consumer complaint narrative,Issue
17357,"Back in 2015, we noticed a charge on our Macy ...",Fees or interest
6524,"- On XX/XX/XXXX, I purchased 1 ticket XXXX {$3...",Problem with a purchase shown on your statement
23689,I hope all is well ... \nThe reason why Im wri...,Fees or interest
6960,Citibank had requested some XXXX documents ear...,Managing an account
7204,On XX/XX/2021 I authorized a payment of amount...,Managing an account


In [24]:
# Class distribution in the training partition.
train_data.loc[:, 'Issue'].value_counts()

Problem with a purchase shown on your statement    1993
Incorrect information on your report               1191
Managing an account                                 935
Fees or interest                                    904
Other features, terms, or problems                  885
Name: Issue, dtype: int64

In [25]:
# Class distribution in the test partition.
test_data.loc[:, 'Issue'].value_counts()

Problem with a purchase shown on your statement    854
Incorrect information on your report               510
Managing an account                                401
Fees or interest                                   388
Other features, terms, or problems                 379
Name: Issue, dtype: int64

In [26]:
# Persist the training partition. index=True (the pandas default) writes the
# row labels as a leading, unnamed column in the CSV.
train_data.to_csv('train_dataset_multiclass.csv', index=True)

In [27]:
# Persist the test partition. index=True (the pandas default) writes the
# row labels as a leading, unnamed column in the CSV.
test_data.to_csv('test_dataset_multiclass.csv', index=True)