In [1]:
# Standard library helpers used below to build paths and extend the import path.
import os
import sys
# Put the `data` directory that sits next to the current working directory on
# the import path. This must happen before the `load_preprocess_data` import.
sys.path.append(os.path.join(os.getcwd(),os.pardir,'data'))
from load_preprocess_data import load_raw_complaints_data
import altair as alt
import pandas as pd
import warnings
# Suppress every UserWarning raised from here on.
warnings.simplefilter("ignore", UserWarning)
# Relative path to the raw complaints CSV, two directory levels above the
# working directory.
data_path = os.path.join(os.pardir, os.pardir ,"data", "raw", "complaints.csv")

# Load the raw complaints through the project helper.
# NOTE(review): the helper is defined outside this notebook; later cells use
# the result as a pandas DataFrame (.query, .drop, .columns) — confirm.
complaints_df = load_raw_complaints_data(data_path)

In [2]:
# Columns that are removed before modelling.
drop_features = ['date_received',
                 'zip_code',
                 'tags',
                 'date_sent_to_company',
                 'complaint_id']

# Build the cleaned frame in one explicit chain instead of mutating a column
# in place: `Series.replace(..., inplace=True)` on a column pulled out of a
# filtered frame is a chained assignment, which silently does nothing under
# pandas copy-on-write.
complaints_df = (
    complaints_df
    # Keep only rows where the dispute outcome is known.
    .query('not consumer_disputed.isnull()')
    # Encode the target: 'Yes' -> 1, 'No' -> 0 (other values are left as-is).
    .assign(consumer_disputed=lambda df: df['consumer_disputed'].replace({'Yes': 1, 'No': 0}))
    .drop(columns=drop_features)
    # Drop every row that still has a missing value in any remaining column.
    .dropna()
)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned complaints as a test set; the fixed seed keeps
# the split identical across kernel restarts.
# (The previous leading `complaints_df.head()` was removed: it was not the last
# expression of the cell, so its result was computed and discarded.)
train_df, test_df = train_test_split(complaints_df, test_size=0.2, random_state=123)

In [4]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,consumer_consent_provided,submitted_via,company_response_to_consumer,timely_response,consumer_disputed
2226480,Student loan,Non-federal student loan,Dealing with my lender or servicer,Received bad information about my loan,WF XXXX continues to report that ( XXXX ) acco...,Company chooses not to provide a public response,WELLS FARGO & COMPANY,CA,Consent provided,Web,Closed with explanation,Yes,0
1134000,Debt collection,"Other (i.e. phone, health club, etc.)",Communication tactics,Used obscene/profane/abusive language,I did join the gym. I took out a membership. I...,Company chooses not to provide a public response,Thrive National Corporation,MS,Consent provided,Web,Closed with explanation,Yes,0
2087336,Debt collection,Medical,Cont'd attempts collect debt not owed,Debt is not mine,"I had XXXX XX/XX/XXXX and the XXXX, XXXX, XXXX...",Company believes it acted appropriately as aut...,Monterey Financial Services LLC,ID,Consent provided,Web,Closed with explanation,Yes,1
1951383,Debt collection,"Other (i.e. phone, health club, etc.)",Cont'd attempts collect debt not owed,Debt is not mine,XXXX XXXX tells me I owe {$620.00} over 8 bill...,Company believes it acted appropriately as aut...,"Amsher Collection Services, Inc.",NC,Consent provided,Web,Closed with explanation,Yes,0
2220703,Debt collection,I do not know,Disclosure verification of debt,Not given enough info to verify debt,Have been turned over to collection on a debt ...,Company believes it acted appropriately as aut...,CONRAD CREDIT CORPORATION,CO,Consent provided,Web,Closed with explanation,Yes,0


In [5]:
# Per-column summary of the cleaned data: the column name, how many non-null
# values it holds, and how many distinct values it takes.
non_null_per_column = complaints_df.count(axis=0)
distinct_per_column = complaints_df.nunique()
unique_df = pd.DataFrame({
    'columns': complaints_df.columns,
    'valid_count': non_null_per_column.to_numpy(),
    'unique_count': distinct_per_column.to_numpy(),
})
unique_df

Unnamed: 0,columns,valid_count,unique_count
0,product,19432,2
1,sub_product,19432,10
2,issue,19432,9
3,sub_issue,19432,37
4,consumer_complaint_narrative,19432,19146
5,company_public_response,19432,10
6,company,19432,1425
7,state,19432,57
8,consumer_consent_provided,19432,1
9,submitted_via,19432,1


In [6]:
# Count complaints per value of the target and turn the result into a
# two-column frame (label, count) for plotting.
target = (
    complaints_df
    .value_counts('consumer_disputed')
    .reset_index(name='count')
)

# Bar chart of the class balance, one bar per target value.
dispute_axis = alt.X('consumer_disputed:O', title='Consumer Disputed')
count_axis = alt.Y('count:Q', title='Count')
alt.Chart(target).mark_bar().encode(
    x=dispute_axis,
    y=count_axis,
    color='consumer_disputed:O',
)

  for col_name, dtype in df.dtypes.iteritems():


In [7]:
# Separate the feature columns from the `consumer_disputed` target for each split.
X_train, y_train = train_df.drop(columns=['consumer_disputed']), train_df['consumer_disputed']
# The test labels must come from `test_df`; taking them from `train_df` paired
# the test features with the wrong (and differently sized) label vector.
X_test, y_test = test_df.drop(columns=['consumer_disputed']), test_df['consumer_disputed']

In [10]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate

# Columns excluded from modelling. The unique-count summary earlier in the
# notebook shows a single distinct value for each of them in the cleaned data.
drop_features = ['consumer_consent_provided', 'submitted_via']

# Columns that are one-hot encoded. The dropped columns are deliberately NOT
# listed here: a column named in two transformers is still processed by each
# of them, so listing them made the 'drop' entry ineffective (and
# 'consumer_consent_provided' was even listed twice).
categorical_features = ['product',
                        'sub_product',
                        'issue',
                        'sub_issue',
                        'company_public_response',
                        'company',
                        'state',
                        'company_response_to_consumer',
                        'timely_response']

# Free-text column; passed as a plain string so the vectorizer receives a
# 1-D sequence of documents.
text_feature = 'consumer_complaint_narrative'

preprocessor = make_column_transformer(
    # Categories unseen during fit are encoded as all zeros; two-level
    # features collapse to a single indicator column.
    (OneHotEncoder(handle_unknown='ignore',
                   drop='if_binary'), categorical_features),
    # Bag-of-words over the narrative, limited to 1000 terms, English stop
    # words removed.
    (CountVectorizer(stop_words='english', max_features=1000), text_feature),
    ('drop', drop_features))

# Metrics reported by every cross-validation run below.
scoring_metrics = ['accuracy', 'recall', 'precision', 'f1']

# Collects one summary frame per model, keyed by model name.
cross_val_results = {}


In [11]:

from sklearn.dummy import DummyClassifier
# Baseline: the shared preprocessor followed by a DummyClassifier with its
# default settings.
pipe_dc = make_pipeline(preprocessor, DummyClassifier())
pipe_dc.fit(X_train, y_train)

# Cross-validate on the training split and keep the mean of every reported
# column (timings and scores), rounded to three decimals, one row per column.
dummy_cv = cross_validate(pipe_dc, X_train, y_train, scoring=scoring_metrics)
dummy_summary = pd.DataFrame(dummy_cv).agg(['mean']).round(3)
cross_val_results['dummy'] = dummy_summary.T

In [12]:
# Display the cross-validation summary stored for the dummy baseline.
cross_val_results['dummy']

Unnamed: 0,mean
fit_time,1.357
score_time,0.347
test_accuracy,0.779
test_recall,0.0
test_precision,0.0
test_f1,0.0


In [13]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with balanced class weights on top of the shared
# preprocessor.
logreg_model = LogisticRegression(max_iter=1000, class_weight='balanced')
pipe_lr = make_pipeline(preprocessor, logreg_model)

# Cross-validate using all available cores, then store the per-column means
# (rounded to three decimals) under the 'logreg' key.
logreg_cv = cross_validate(
    pipe_lr, X_train, y_train, n_jobs=-1, scoring=scoring_metrics)
cross_val_results['logreg'] = pd.DataFrame(logreg_cv).agg(['mean']).round(3).T
cross_val_results['logreg']

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes (alpha=0.1) on top of the shared preprocessor.
# The pipeline gets its own name: it was previously assigned to `pipe_lr`,
# which silently replaced the logistic-regression pipeline defined earlier.
pipe_nb = make_pipeline(preprocessor, BernoulliNB(alpha = 0.1))

# Cross-validate on the training split and store the per-column means
# (rounded to three decimals) under the 'bayes' key.
cross_val_results['bayes'] = pd.DataFrame(cross_validate(
    pipe_nb, X_train, y_train, n_jobs=-1, scoring=scoring_metrics)).agg(['mean']).round(3).T
cross_val_results['bayes']

In [None]:
from sklearn.svm import SVC
# Support vector classifier with balanced class weights on top of the shared
# preprocessor.
svc_model = SVC(class_weight='balanced')
pipe_svc = make_pipeline(preprocessor, svc_model)

# Cross-validate using all available cores, then store the per-column means
# (rounded to three decimals) under the 'svc' key.
svc_cv = cross_validate(
    pipe_svc, X_train, y_train, n_jobs=-1, scoring=scoring_metrics)
cross_val_results['svc'] = pd.DataFrame(svc_cv).agg(['mean']).round(3).T
cross_val_results['svc']

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with balanced class weights on top of the shared preprocessor.
# - The pipeline gets its own name: it was previously assigned to `pipe_svc`,
#   which silently replaced the SVC pipeline defined earlier.
# - `random_state` is fixed (same seed as the train/test split) so the scores
#   are reproducible across runs.
pipe_rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(class_weight='balanced', random_state=123))

# Cross-validate on the training split and store the per-column means
# (rounded to three decimals) under the 'random forest' key.
cross_val_results['random forest'] = pd.DataFrame(cross_validate(
    pipe_rf, X_train, y_train, n_jobs=-1, scoring=scoring_metrics)).agg(['mean']).round(3).T
cross_val_results['random forest']

In [None]:
# Place the per-model summaries side by side; the dict keys (model names)
# become the outer level of the resulting column index.
res = pd.concat(cross_val_results, axis='columns')

In [None]:
# Keep only the outer column level (the model names). `get_level_values(0)`
# gives the same labels as dropping the inner level, but also works when the
# columns are already flat, so re-running this cell no longer raises.
res.columns = res.columns.get_level_values(0)

Unnamed: 0,dummy,dummy.1,logreg,logreg.1,bayes,bayes.1,svc,svc.1,random forest,random forest.1
fit_time,1.165,0.087,6.687,0.081,2.63,0.042,100.239,1.03,24.474,0.18
score_time,0.291,0.03,0.583,0.022,0.601,0.16,21.818,0.183,0.716,0.017
test_accuracy,0.779,0.0,0.634,0.011,0.696,0.006,0.668,0.006,0.784,0.002
test_recall,0.0,0.0,0.514,0.025,0.356,0.013,0.466,0.013,0.053,0.009
test_precision,0.0,0.0,0.305,0.012,0.328,0.011,0.325,0.008,0.632,0.048
test_f1,0.0,0.0,0.383,0.014,0.342,0.011,0.383,0.009,0.098,0.016


In [None]:
# pd.read_csv('res.csv', index_col=[0], header=[0,1])

# Reshape the (metric x model) table into long form for plotting.
# A bare `res.melt()` discards the index (the metric names) and, with flat
# model-name columns, yields only two columns, so assigning three column
# names raised a length-mismatch error. The metric names are therefore moved
# into a 'stat' column first and kept as the identifier while melting.
source = (
    res
    .rename_axis(index='stat')
    .reset_index()
    .melt(id_vars='stat', var_name='model', value_name='score')
    .loc[:, ['model', 'stat', 'score']]
)

In [None]:
# Display the long-form results table that feeds the chart below.
source

In [None]:

# Bar chart of the scores per model, coloured by `stat` and split into one
# panel per `stat` value.
model_axis = alt.X('model:O')
score_axis = alt.Y('score:Q')
alt.Chart(source).mark_bar().encode(
    x=model_axis,
    y=score_axis,
    color=alt.Color('stat:N'),
    column=alt.Column('stat:N'),
)