In [1]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(),os.pardir,'src','data'))
from load_preprocess_data import load_raw_complaints_data
import altair as alt
import pandas as pd

# Raw complaints CSV lives under ../data/raw relative to the working directory.
raw_data_dir = os.path.join(os.pardir, "data", "raw")
data_path = os.path.join(raw_data_dir, "complaints.csv")

# Read the raw file through the project's loader.
complaints_df = load_raw_complaints_data(data_path)

In [2]:
# Print the frame's index range, column names and dtypes.
complaints_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3084450 entries, 0 to 3084449
Data columns (total 18 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   date_received                 datetime64[ns]
 1   product                       object        
 2   sub_product                   object        
 3   issue                         object        
 4   sub_issue                     object        
 5   consumer_complaint_narrative  object        
 6   company_public_response       object        
 7   company                       object        
 8   state                         object        
 9   zip_code                      float64       
 10  tags                          object        
 11  consumer_consent_provided     object        
 12  submitted_via                 object        
 13  date_sent_to_company          datetime64[ns]
 14  company_response_to_consumer  object        
 15  timely_response               ob

In [3]:
# Per-column summary of complaints_df: the number of non-null values and the
# number of distinct values in each column.
# Both aggregates are Series indexed by column name, so the summary is built in
# one step from them instead of growing an empty frame and relying on the
# integer column label produced by `reset_index()[0]` to line rows up.
valid_counts = complaints_df.count()
unique_counts = complaints_df.nunique()
unique_df = (
    pd.DataFrame({'valid_count': valid_counts, 'unique_count': unique_counts})
    .rename_axis('columns')  # name the index so reset_index() yields a 'columns' column
    .reset_index()
)

In [4]:
# Show the per-column valid/unique counts collected in unique_df.
unique_df

Unnamed: 0,columns,valid_count,unique_count
0,date_received,3084450,4003
1,product,3084450,18
2,sub_product,2849156,76
3,issue,3084450,165
4,sub_issue,2401899,221
5,consumer_complaint_narrative,1107253,967541
6,company_public_response,1339720,11
7,company,3084450,6565
8,state,3044459,63
9,zip_code,3043948,34429


- The target of interest, `consumer_disputed`, has only 768443 valid values, so we trim the data frame by removing the rows whose dispute response is null.
- We can drop non-useful features like `zip_code` and `complaint_id`.
- It seems that we can process `consumer_complaint_narrative` using NLP and encode the other useful features with `OneHotEncoder` (applying binary encoding if necessary), since most of these features have relatively few unique values.

In [5]:
# alt.Chart(unique_df.melt(id_vars=['columns'])).mark_bar().encode(
#     x=alt.X('variable:O',title = ''),
#     y=alt.Y('value:Q',title = 'count'),
#     color='variable:N',
#     column='columns:N'
# )

In [6]:
# Keep only the rows that have a value for the target column.
# `dropna(subset=...)` selects the same rows as the former
# `query('not consumer_disputed.isnull()')` and keeps the original index, but
# does not depend on evaluating a method call inside a query string.
# NOTE: this rebinds `complaints_df`; earlier cells that summarise the frame
# will see the filtered data if they are re-run after this cell.
complaints_df = complaints_df.dropna(subset=['consumer_disputed'])
complaints_df.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zip_code,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed,complaint_id
115,2016-10-21,Credit card,,Other,,On XX/XX/2016 a deposit in the amount of {$150...,,AMERICAN EXPRESS COMPANY,NY,14580.0,,Consent provided,Web,2016-10-25,Closed with monetary relief,Yes,Yes,2172679
2301,2015-12-10,Mortgage,Conventional adjustable mortgage (ARM),"Loan modification,collection,foreclosure",,Please stop the Default / foreclosure process ...,,NATIONSTAR MORTGAGE,CA,92142.0,,Consent provided,Web,2015-12-10,Closed with explanation,Yes,Yes,1691917
3935,2016-03-19,Consumer Loan,Vehicle loan,Problems when you are unable to pay,,I 've been having several issues with SNAAC wh...,Company has responded to the consumer and the ...,Security National Automotive Acceptance,GA,31315.0,Servicemember,Consent provided,Web,2016-03-25,Closed with explanation,Yes,No,1840418
4188,2015-05-07,Mortgage,Conventional adjustable mortgage (ARM),"Loan servicing, payments, escrow account",,1. Mortgage sold by XXXX XXXX XXXX to the XXXX...,,NATIONSTAR MORTGAGE,CA,90028.0,Older American,Consent provided,Web,2015-05-07,Closed with explanation,Yes,Yes,1364576
4538,2016-09-27,Bank account or service,Checking account,Problems caused by my funds being low,,I have a checking and savings account with Wel...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,NJ,8098.0,,Consent provided,Web,2016-09-27,Closed with explanation,No,No,2133127


In [7]:
# Tally the rows per `consumer_disputed` value, then flatten the tally into a
# two-column frame for plotting.
dispute_tally = complaints_df.value_counts(subset='consumer_disputed')
target = dispute_tally.to_frame().reset_index()
target.columns = ['consumer_disputed', 'count']

In [8]:
# Bar chart of the row count for each `consumer_disputed` value; the bars are
# coloured by the same field.
dispute_bars = alt.Chart(target).mark_bar()
dispute_bars.encode(
    x=alt.X('consumer_disputed:O', title='Consumer Disputed'),
    y=alt.Y('count:Q', title='Count'),
    color='consumer_disputed:O',
)

- The target classes are imbalanced, which we should take into account when training the model later.