### Import Relevant Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
import seaborn as sns
sns.set()

### Generate Master DF

In [3]:
master_df = pd.read_csv('Consumer_Complaints.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
master_df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,10/01/2019,"Payday loan, title loan, or personal loan",Installment loan,Struggling to pay your loan,,,Company believes it acted appropriately as aut...,"Atlas Credit Company, Inc.",TX,75703,,Consent not provided,Web,10/01/2019,Closed with explanation,Yes,,3391722
1,10/01/2019,Debt collection,Other debt,False statements or representation,Attempted to collect wrong amount,,Company has responded to the consumer and the ...,"ProCollect, Inc",TX,79936,Servicemember,Consent not provided,Web,10/01/2019,Closed with explanation,Yes,,3391649
2,10/01/2019,Debt collection,Auto debt,Written notification about debt,Notification didn't disclose it was an attempt...,,,NAVY FEDERAL CREDIT UNION,CA,91915,,,Web,10/01/2019,In progress,Yes,,3391379
3,10/01/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,,,OKLAHOMA STUDENT LOAN AUTHORITY,IN,47130,,,Web,10/01/2019,Closed with explanation,Yes,,3391378
4,10/01/2019,Debt collection,Medical debt,Attempts to collect debt not owed,Debt is not yours,,Company disputes the facts presented in the co...,"Eastern Account Systems of Connecticut, Inc.",CT,6401,,Other,Web,10/01/2019,Closed with explanation,Yes,,3391434


### Preprocessing: dropping missing narratives, removing 'X's, and taking an even sample of 1,000(/10,000) narratives with timely and un-timely responses

In [5]:
proc_df = master_df.dropna(subset=['Consumer complaint narrative'])

In [6]:
#proc_df['Rapid response'] = (proc_df['Timely response?'] == 'Yes') & (proc_df['Consumer disputed?'] != 'Yes')

In [7]:
#proc_df['Rapid response'].value_counts()

In [29]:
new_df = proc_df[proc_df['Timely response?']=='Yes'].sample(2000)

new_df2 = proc_df[proc_df['Timely response?']=='No'].sample(2000)

df = new_df.append(new_df2)

In [30]:
first_clean = [re.sub('X', '', nar) for nar in df['Consumer complaint narrative']]

second_clean = [re.sub("\'", ' ', nar) for nar in first_clean]

df['Cleaned narratives'] = [nar for nar in second_clean]

In [31]:
from nltk import word_tokenize, pos_tag

def nouns_adjs(text):
    
    # So far it seems like the best POS tags are: Nouns, Adjectives, Adverbs, Superlative Adjs
    
    """Noun 'NN', Adjective 'JJ', Adverb 'RB', Superlative Adverb 'RBS', Superlative Adjective 'JJS', 
    Personal Pronoun 'PRP', Personal Pronoun Possessive 'PRP$' """
    
    noun_adj = lambda pos: (pos[:2] == 'NN' or pos[:2] == 'JJ' or pos[:2] == 'RB' or pos[:2] == 'RBS'
                            or pos[:2] == 'JJS')
    tokenized = word_tokenize(text)
    nouns_adjs = [word for (word, pos) in pos_tag(tokenized) if noun_adj(pos)]
    output = ' '.join(nouns_adjs)
    
    return output

In [32]:
df['Cleaned narratives'] = [nouns_adjs(nar) for nar in df['Cleaned narratives']]

In [33]:
# from textblob import TextBlob

# df['Subjectivity'] = [TextBlob(blob).subjectivity for blob in df['Cleaned narratives'] ]

# df['Polarity'] = [TextBlob(blob).polarity for blob in df['Cleaned narratives']]

### Train_test_split the data

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Cleaned narratives'], 
                                                       df['Timely response?'], random_state=42)

### Tfidf Vectorize

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

#vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), min_df=5)

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), min_df=3)

X_train = vectorizer.fit_transform(X_train)

X_test = vectorizer.transform(X_test)

# X_train['Cleaned narratives'] = vectorizer.fit_transform(X_train['Cleaned narratives'])

# X_test['Cleaned narratives'] = vectorizer.transform(X_test['Cleaned narratives'])

In [54]:
#vectorizer.stop_words_

In [55]:
# from sklearn.decomposition import TruncatedSVD

# n_components = 200

# svd = TruncatedSVD(n_components)
    
# X_train = svd.fit_transform(X_train)

# X_test = svd.transform(X_test)
    
# explained_variance_1 = round(100*sum(svd.explained_variance_ratio_), 3)

# print('Retained info after SVD: ' + str(explained_variance_1) + '%')

### Support Vector Classifier

In [57]:
from sklearn.svm import SVC

svc_pipe = make_pipeline(SVC(kernel='linear'))

param_grid = {'svc__C': [0.4, 0.55, 0.7]}

cv_input = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#svc = SVC(kernel='linear', C=1)

grid = GridSearchCV(svc_pipe, param_grid, cv=cv_input).fit(X_train, y_train)

#svc.fit(X_train, y_train)

print(f'Best param: {grid.best_params_}')

print(f'Train score: {grid.score(X_train, y_train)}')

print(f'Test score: {grid.score(X_test, y_test)}')

Best param: {'svc__C': 0.55}
Train score: 0.8623333333333333
Test score: 0.706


### Random Forest Classifier

In [40]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=200)

rf_clf.fit(X_train, y_train)

print(f'Train score: {rf_clf.score(X_train, y_train)}')

print(f'Test score: {rf_clf.score(X_test, y_test)}')

Train score: 0.9976666666666667
Test score: 0.685


### Gradient Boosting Classifier

In [42]:
from sklearn.ensemble import GradientBoostingClassifier

pipe = make_pipeline(GradientBoostingClassifier(n_estimators=100))

param_grid = {'gradientboostingclassifier__learning_rate': [0.025, 0.05, 0.075]}

cv_input = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, cv=cv_input).fit(X_train, y_train)

print(f'Best params: {grid.best_params_}')

print(f'Train score: {grid.score(X_train, y_train)}')

print(f'Test score: {grid.score(X_test, y_test)}')

Best params: {'gradientboostingclassifier__learning_rate': 0.075}
Train score: 0.781
Test score: 0.671


### Bernoulli Naive Bayes Classifier

In [48]:
from sklearn.naive_bayes import BernoulliNB

pipe = make_pipeline(BernoulliNB())

param_grid = {'bernoullinb__alpha': [0.6, 0.65, 0.7, 0.75, 0.8, 0.9, 1]}

cv_input = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, cv=cv_input).fit(X_train, y_train)

#bernoulli.fit(X_train, y_train)

print(f'Best params: {grid.best_params_}')

print(f'Train score: {grid.score(X_train, y_train)}')

print(f'Test score: {grid.score(X_test, y_test)}')

Best params: {'bernoullinb__alpha': 0.6}
Train score: 0.867
Test score: 0.695


### AdaBoost Classifier

In [44]:
from sklearn.ensemble import AdaBoostClassifier

pipe = make_pipeline(AdaBoostClassifier(n_estimators=200))

param_grid = {'adaboostclassifier__learning_rate': [0.1, 0.3, 0.5]}

cv_input = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, cv=cv_input).fit(X_train, y_train)

print(f'Best params: {grid.best_params_}')

print(f'Train score: {grid.score(X_train, y_train)}')

print(f'Test score: {grid.score(X_test, y_test)}')

Best params: {'adaboostclassifier__learning_rate': 0.1}
Train score: 0.7453333333333333
Test score: 0.653


### XGB Classifier

In [45]:
from xgboost import XGBClassifier

pipe = make_pipeline(XGBClassifier())

param_grid = {'xgbclassifier__eta': [0.05, 0.075, 0.1]}

cv_input = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, cv=cv_input).fit(X_train, y_train)

print(f'Best params: {grid.best_params_}')

print(f'Train score: {grid.score(X_train, y_train)}')

print(f'Test score: {grid.score(X_test, y_test)}')

Best params: {'xgbclassifier__eta': 0.05}
Train score: 0.7736666666666666
Test score: 0.665


In [595]:
import pickle

filename = 'XGBClassifier'

outfile = open(filename, 'wb')

In [596]:
pickle.dump(grid, outfile)

outfile.close()