In [1]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(),os.pardir,'data'))
from load_preprocess_data import load_raw_complaints_data
import altair as alt
import pandas as pd
import warnings
warnings.simplefilter("ignore", UserWarning)
data_path = os.path.join(os.pardir, os.pardir ,"data", "raw", "complaints.csv")

complaints_df = load_raw_complaints_data(data_path)

In [2]:
complaints_df = complaints_df.query('not consumer_disputed.isnull()').dropna()
complaints_df['consumer_disputed'].replace(['Yes','No'],[1,0], inplace = True)

In [3]:
complaints_df.head()
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(complaints_df,test_size=0.2, random_state=123)

In [4]:
train_df.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zip_code,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed,complaint_id
1986109,2015-10-30,Student loan,Non-federal student loan,Can't repay my loan,Can't temporarily postpone payments,I am co signer on daughters loan and daughter ...,Company chooses not to provide a public response,WELLS FARGO & COMPANY,TX,76054.0,Older American,Consent provided,Web,2015-10-30,Closed with explanation,Yes,0,1633369
1371860,2016-12-29,Debt collection,I do not know,Disclosure verification of debt,Not given enough info to verify debt,i received a letter from Alpha recovery group ...,Company has responded to the consumer and the ...,Alpha Recovery Corp,IL,62225.0,Servicemember,Consent provided,Web,2016-12-29,Closed with non-monetary relief,Yes,0,2269234
873578,2015-07-06,Debt collection,I do not know,Taking/threatening an illegal action,Threatened to sue on too old debt,My wife and I have been getting harassing phon...,Company chooses not to provide a public response,WELLS FARGO & COMPANY,WA,98028.0,Servicemember,Consent provided,Web,2015-07-06,Closed with explanation,Yes,0,1452843
2009864,2016-02-18,Debt collection,Credit card,Improper contact or sharing of info,Contacted employer after asked not to,I asked XXXX XXXX XXXX XXXX XXXX to not call m...,Company chooses not to provide a public response,"BANK OF AMERICA, NATIONAL ASSOCIATION",MN,55419.0,Servicemember,Consent provided,Web,2016-02-18,Closed with explanation,Yes,0,1794540
1451008,2017-04-15,Debt collection,Medical,Cont'd attempts collect debt not owed,Debt is not mine,I am a XXXX veteran who contracted XXXX while ...,Company believes it acted appropriately as aut...,"GOLD KEY CREDIT, INC.",CA,95370.0,"Older American, Servicemember",Consent provided,Web,2017-04-15,Closed with non-monetary relief,Yes,0,2435574


In [5]:
unique_df = pd.DataFrame()
unique_df['columns'] = complaints_df.columns
unique_df['valid_count'] = complaints_df.count(axis=0).reset_index()[0]
unique_df['unique_count'] = complaints_df.nunique().reset_index()[0]
unique_df

Unnamed: 0,columns,valid_count,unique_count
0,date_received,3120,728
1,product,3120,2
2,sub_product,3120,10
3,issue,3120,9
4,sub_issue,3120,37
5,consumer_complaint_narrative,3120,3102
6,company_public_response,3120,10
7,company,3120,627
8,state,3120,57
9,zip_code,3120,2322


In [6]:
X_train, y_train = train_df.drop(columns= ['consumer_disputed']), train_df['consumer_disputed']
X_test, y_test = test_df.drop(columns= ['consumer_disputed']), train_df['consumer_disputed']

In [7]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate

categorical_features = ['product',
                        'sub_product',
                        'issue',
                        'sub_issue',
                        'company_public_response', 
                        'company',
                        'state',
                        'consumer_consent_provided',
                        'consumer_consent_provided',
                        'submitted_via',
                        'company_response_to_consumer',
                        'timely_response']
drop_features = ['date_received',
                 'zip_code',
                 'tags',
                 'date_sent_to_company',
                 'complaint_id']

text_feature = 'consumer_complaint_narrative'


preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown = 'ignore',
                   sparse=False,
                   drop='if_binary'), categorical_features),
    (CountVectorizer(stop_words='english', max_features = 1000), text_feature),
    ('drop', drop_features))
scoring_metrics = ['accuracy','recall','precision','f1']

cross_val_results = {}


In [8]:

from sklearn.dummy import DummyClassifier
pipe_dc = make_pipeline(preprocessor, DummyClassifier())
pipe_dc.fit(X_train, y_train)
cross_val_results['dummy'] = pd.DataFrame(cross_validate(
    pipe_dc, X_train, y_train,scoring=scoring_metrics)).agg(['mean', 'std']).round(3).T

In [9]:
cross_val_results['dummy']

Unnamed: 0,mean,std
fit_time,0.239,0.031
score_time,0.065,0.005
test_accuracy,0.787,0.001
test_recall,0.0,0.0
test_precision,0.0,0.0
test_f1,0.0,0.0


In [10]:
from sklearn.linear_model import LogisticRegression
pipe_lr = make_pipeline(preprocessor, LogisticRegression(max_iter=1000, class_weight='balanced'))
cross_val_results['logreg'] = pd.DataFrame(cross_validate(
    pipe_lr, X_train, y_train, n_jobs=-1, scoring=scoring_metrics)).agg(['mean', 'std']).round(3).T
cross_val_results['logreg']



Unnamed: 0,mean,std
fit_time,5.326,0.048
score_time,0.087,0.02
test_accuracy,0.638,0.024
test_recall,0.318,0.054
test_precision,0.238,0.024
test_f1,0.271,0.033


In [11]:
from sklearn.naive_bayes import BernoulliNB
pipe_lr = make_pipeline(preprocessor, BernoulliNB(alpha = 0.17))
cross_val_results['bayes'] = pd.DataFrame(cross_validate(
    pipe_lr, X_train, y_train, n_jobs=-1, scoring=scoring_metrics)).agg(['mean', 'std']).round(3).T
cross_val_results['bayes']



Unnamed: 0,mean,std
fit_time,0.29,0.047
score_time,0.07,0.008
test_accuracy,0.716,0.034
test_recall,0.286,0.051
test_precision,0.323,0.068
test_f1,0.3,0.046


In [12]:
from sklearn.svm import SVC
pipe_svc = make_pipeline(preprocessor, SVC(class_weight='balanced'))
cross_val_results['svc'] = pd.DataFrame(cross_validate(
    pipe_svc, X_train, y_train, n_jobs=-1, scoring=scoring_metrics)).agg(['mean', 'std']).round(3).T
cross_val_results['svc']



Unnamed: 0,mean,std
fit_time,9.144,0.152
score_time,3.957,0.21
test_accuracy,0.716,0.019
test_recall,0.22,0.061
test_precision,0.283,0.048
test_f1,0.246,0.053


In [13]:
from sklearn.ensemble import RandomForestClassifier
pipe_svc = make_pipeline(preprocessor, RandomForestClassifier(class_weight='balanced'))
cross_val_results['random forest'] = pd.DataFrame(cross_validate(
    pipe_svc, X_train, y_train, n_jobs=-1, scoring=scoring_metrics)).agg(['mean', 'std']).round(3).T
cross_val_results['random forest']



Unnamed: 0,mean,std
fit_time,2.608,0.041
score_time,0.125,0.008
test_accuracy,0.784,0.007
test_recall,0.015,0.013
test_precision,0.483,0.41
test_f1,0.029,0.024


In [14]:
pd.concat(cross_val_results, axis=1)

Unnamed: 0_level_0,dummy,dummy,logreg,logreg,bayes,bayes,svc,svc,random forest,random forest
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
fit_time,0.239,0.031,5.326,0.048,0.29,0.047,9.144,0.152,2.608,0.041
score_time,0.065,0.005,0.087,0.02,0.07,0.008,3.957,0.21,0.125,0.008
test_accuracy,0.787,0.001,0.638,0.024,0.716,0.034,0.716,0.019,0.784,0.007
test_recall,0.0,0.0,0.318,0.054,0.286,0.051,0.22,0.061,0.015,0.013
test_precision,0.0,0.0,0.238,0.024,0.323,0.068,0.283,0.048,0.483,0.41
test_f1,0.0,0.0,0.271,0.033,0.3,0.046,0.246,0.053,0.029,0.024
