In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

# NLP Module
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
import string
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Classification model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,ConfusionMatrixDisplay,\
                            precision_score,recall_score,f1_score,roc_auc_score, roc_curve
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Preprocessing models
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders.binary import BinaryEncoder
from imblearn.combine import SMOTETomek

# Hyperparameters tuning models
from hyperopt import tpe,hp,Trials,space_eval
from hyperopt.fmin import fmin
from hyperopt.pyll import scope

In [108]:
df = pd.read_csv('Complaints.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.shape

(768358, 18)

In [109]:
missing = df.isnull().sum().div(df.shape[0]).mul(100).to_frame().sort_values(by=0, ascending=False)
missing

Unnamed: 0,0
Tags,85.881191
Consumer complaint narrative,78.651358
Company public response,74.528931
Consumer consent provided?,61.23721
Sub-issue,59.265342
Sub-product,30.605525
State,0.735334
ZIP code,0.50198
Date sent to company,0.0
Consumer disputed?,0.0


In [110]:
df.columns
drop_columns = ['Tags','Consumer complaint narrative','Company public response',
                'Consumer consent provided?','Sub-issue','Sub-product','Complaint ID','ZIP code']
df.drop(drop_columns, axis=1,inplace=True)

## Feature extraction

In [111]:
df[['Date received','Date sent to company']].head()

Unnamed: 0,Date received,Date sent to company
0,2015-01-04,2015-01-04
1,2013-09-04,2013-09-03
2,2014-06-10,2014-06-10
3,2014-01-08,2014-01-08
4,2014-09-11,2014-09-18


In [112]:
df['days_to_forward_complaint']=pd.to_datetime(df['Date sent to company'])-pd.to_datetime(df['Date received'])
df['days_to_forward_complaint'] = df['days_to_forward_complaint'].dt.days
df.head(3)

Unnamed: 0,Date received,Product,Issue,Company,State,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,days_to_forward_complaint
0,2015-01-04,Credit reporting,Incorrect information on credit report,Experian Information Solutions Inc.,TX,Web,2015-01-04,Closed with explanation,Yes,No,0
1,2013-09-04,Credit card,Credit determination,"CITIBANK, N.A.",AZ,Web,2013-09-03,Closed with explanation,Yes,No,-1
2,2014-06-10,Debt collection,Disclosure verification of debt,SYNCHRONY FINANCIAL,CO,Web,2014-06-10,Closed with explanation,Yes,No,0


In [113]:
df.drop(['Date received','Date sent to company'], axis=1, inplace=True)

#### For models to reduce computation time we can use sample of the data for the model

In [114]:
df1 = df.groupby('Consumer disputed?').sample(n=50000)
df1.reset_index(inplace=True)
df1.shape

(100000, 10)

## Text processing

##### For Vectorization
* TFIDF
* CountVectorizer
* NLTK/Scipy Library
* Pretrained Glove

##### Steps for text processing
* Remove punctuation
* Remove Stop Words
* Lower Casing
* Tokenization
* Stemming/Lemmatization


##### Note
* `Issue` column has text which has to be preprocessed.
* The text need to be transformed into vectors as the algorithm will be able to make predictions. In this case, it will be used the Term Frequency-Inverse Document Frequency (TFIDF) weight to evaluate how import a word is to a document in a collection of documents.
* After removing the punctuation and lower casing the words, the importance of the word is determined in terms of the frequency.

In [115]:
# list of stopword which will be remmoved
stopwords_list = stopwords.words('english')+list(string.punctuation)

In [116]:
def process_text(issue):
    """
    Tokenizes the input text, removes stopwords and non-alphabetic words.

    Args:
        issue (str): The input text to be processed.

    Returns:
        list: A list of processed tokens.

    """

    # Tokenize the input text
    tokens = nltk.word_tokenize(issue)

    # Remove stopwords and convert tokens to lowercase
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]

    # Remove non-alphabetic words
    stopwords_removed = [word for word in stopwords_removed if word.isalpha()]

    return stopwords_removed

def concat_strings(words_list):
    """
    Concatenates a list of words into a single string.

    Args:
        words_list (list): The list of words to be concatenated.

    Returns:
        str: The concatenated string.

    """

    concat_words = ''
    for word in words_list:
        concat_words += word + ' '
    return concat_words.strip()

lemmatizer = WordNetLemmatizer()

def lemmatizer_concat(words_list):
    """
    Lemmatizes each word in the given list and concatenates them into a single string.

    Args:
        words_list (list): The list of words to be lemmatized and concatenated.

    Returns:
        str: The lemmatized and concatenated string.

    """

    # Filter out any NaN values in the list
    list_of_words = [i for i in words_list if i is not np.nan]

    lemmatized_list = []
    for idx, word in enumerate(words_list):
        # Lemmatize each word
        lemmatized_list.append(lemmatizer.lemmatize(word))

    return concat_strings(lemmatized_list)

In [117]:
df1.columns

Index(['index', 'Product', 'Issue', 'Company', 'State', 'Submitted via',
       'Company response to consumer', 'Timely response?',
       'Consumer disputed?', 'days_to_forward_complaint'],
      dtype='object')

In [118]:
# Prepare data with text processing
nltk.download('punkt')
nltk.download('wordnet')

for i in range(len(df1)):
    
    # Iterate through all the rows and extract each 'Issue'
    text = process_text(df1['Issue'].loc[i])
    final_texts = lemmatizer_concat(text)
    
    # Change the 'Issue' column into the processed text
    df1['Issue'].loc[i] = final_texts
    if i % 5000 == 0:
        print(f'Processed Row Number {i}')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sheip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sheip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Issue'].loc[i] = final_texts


Processed Row Number 0
Processed Row Number 5000
Processed Row Number 10000
Processed Row Number 15000
Processed Row Number 20000
Processed Row Number 25000
Processed Row Number 30000
Processed Row Number 35000
Processed Row Number 40000
Processed Row Number 45000
Processed Row Number 50000
Processed Row Number 55000
Processed Row Number 60000
Processed Row Number 65000
Processed Row Number 70000
Processed Row Number 75000
Processed Row Number 80000
Processed Row Number 85000
Processed Row Number 90000
Processed Row Number 95000


In [119]:
tfidv = TfidfVectorizer(max_features=None, strip_accents='unicode',
                        analyzer='word', ngram_range=(1,2))

df_vect = tfidv.fit_transform(df1['Issue'])
print(df_vect)
feature_names = tfidv.get_feature_names_out()
feature_names

  (0, 304)	0.4691923911482545
  (0, 117)	0.4691923911482545
  (0, 101)	0.3471647111090267
  (0, 303)	0.4691923911482545
  (0, 115)	0.468030088377164
  (1, 103)	0.33796712527389555
  (1, 63)	0.33796712527389555
  (1, 28)	0.33796712527389555
  (1, 73)	0.33796712527389555
  (1, 208)	0.33796712527389555
  (1, 62)	0.33796712527389555
  (1, 27)	0.33796712527389555
  (1, 72)	0.33796712527389555
  (1, 101)	0.29364226172809677
  (2, 66)	0.39023157336624575
  (2, 191)	0.39023157336624575
  (2, 178)	0.39023157336624575
  (2, 134)	0.39023157336624575
  (2, 64)	0.387975331940506
  (2, 190)	0.39023157336624575
  (2, 175)	0.2967688284611039
  (3, 86)	0.3801513733499381
  (3, 157)	0.38830373786893463
  (3, 154)	0.38830373786893463
  (3, 240)	0.3801513733499381
  :	:
  (99996, 76)	0.38406267051213633
  (99996, 150)	0.38406267051213633
  (99996, 155)	0.3825992506669758
  (99996, 264)	0.38406267051213633
  (99996, 74)	0.37723459435170925
  (99996, 149)	0.3482747976666052
  (99997, 61)	0.3941010718331192


array(['account', 'account opening', 'account term', 'acct',
       'acct credited', 'acct wrong', 'action', 'adding', 'adding money',
       'advance', 'advance fee', 'advertising', 'advertising marketing',
       'amount', 'amount charged', 'amt', 'application',
       'application originator', 'application processing', 'applied',
       'applied receive', 'apply', 'apr', 'apr interest', 'arbitration',
       'atm', 'atm card', 'attempt', 'attempt collect', 'available',
       'available promised', 'balance', 'balance transfer', 'bank',
       'bank account', 'bank acct', 'bankruptcy', 'billing',
       'billing dispute', 'billing statement', 'broker', 'ca',
       'ca contact', 'ca repay', 'ca stop', 'card', 'card protection',
       'cash', 'cash advance', 'caused', 'caused fund', 'change',
       'charge', 'charge bank', 'charged', 'charged bank', 'charged fee',
       'charged received', 'check', 'closing', 'closing account',
       'closing management', 'collect', 'collect debt'

### Data Processing

Concat old data withvectorized data from Issue column

In [120]:
df1 = pd.concat([df1, pd.DataFrame(df_vect.toarray())], axis=1)

In [121]:
df1.drop(['Issue','index'], axis=1, inplace=True)

In [122]:
df1

Unnamed: 0,Product,Company,State,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,days_to_forward_complaint,0,1,...,301,302,303,304,305,306,307,308,309,310
0,Debt collection,"Optimum Outcomes, Inc.",TX,Web,Closed with explanation,Yes,No,3,0.000000,0.000000,...,0.0,0.0,0.469192,0.469192,0.0,0.0,0.0,0.0,0.0,0.0
1,Debt collection,"CAINE & WEINER COMPANY, INC.",KY,Web,Closed with explanation,Yes,No,0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,Mortgage,WELLS FARGO & COMPANY,CA,Phone,Closed with explanation,Yes,No,6,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,Credit reporting,"EQUIFAX, INC.",NY,Web,Closed with explanation,Yes,No,0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,Money transfers,"WESTERN UNION COMPANY, THE",WA,Phone,Closed with explanation,Yes,No,3,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Credit reporting,"EQUIFAX, INC.",VA,Web,Closed with explanation,Yes,Yes,0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99996,Debt collection,Five Lakes Agency,IL,Web,Closed with explanation,Yes,Yes,15,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99997,Bank account or service,CAPITAL ONE FINANCIAL CORPORATION,DE,Web,Closed with explanation,Yes,Yes,2,0.272141,0.394101,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99998,Mortgage,PHH Mortgage Services Corporation,NY,Web,Closed with non-monetary relief,Yes,Yes,1,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 319 entries, Product to 310
dtypes: float64(311), int64(1), object(7)
memory usage: 243.4+ MB


In [134]:
from sklearn.model_selection import train_test_split
X = df1.drop(['Consumer disputed?'],axis=1)
y = df1['Consumer disputed?']

In [135]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 318 entries, Product to 310
dtypes: float64(311), int64(1), object(6)
memory usage: 242.6+ MB


In [136]:
X.shape, y.shape

((100000, 318), (100000,))

In [137]:
X.columns

Index([                     'Product',                      'Company',
                              'State',                'Submitted via',
       'Company response to consumer',             'Timely response?',
          'days_to_forward_complaint',                              0,
                                    1,                              2,
       ...
                                  301,                            302,
                                  303,                            304,
                                  305,                            306,
                                  307,                            308,
                                  309,                            310],
      dtype='object', length=318)

Intialize feature for transformation

In [46]:
X.to_csv('test_X.csv')

In [141]:
X.columns[317]
X.columns = X.columns.astype(str)

In [140]:
X.columns[7:]
# X.columns[7:319] = X.columns[7:319].astype(str)
for i in X.columns[7:]:
    X.columns[i] = str(X.columns[i])


TypeError: Index does not support mutable operations

In [27]:
X['Company'].dtype

dtype('O')

-----------------------------

In [76]:
x_columnms = ['Product','Company','State','Submitted via',
            'Company response to consumer','Timely response?',
            'days_to_forward_complaint' ]

X_col = ['Product','Company','State','Submitted via',
            'Company response to consumer','Timely response?']

for i in x_columnms:
    print(i,X[i].dtype)
    
X_test = X[X_col]
X_test

Product object
Company object
State object
Submitted via object
Company response to consumer object
Timely response? object
days_to_forward_complaint int64


Unnamed: 0,Product,Company,State,Submitted via,Company response to consumer,Timely response?
0,Payday loan,"Mobiloans, LLC",FL,Web,Untimely response,No
1,Mortgage,M&T BANK CORPORATION,DE,Web,Closed with explanation,Yes
2,Credit reporting,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",MD,Referral,Closed with non-monetary relief,Yes
3,Credit reporting,"EQUIFAX, INC.",IL,Web,Closed with non-monetary relief,Yes
4,Credit reporting,Experian Information Solutions Inc.,AL,Web,Closed with explanation,Yes
...,...,...,...,...,...,...
99995,Mortgage,NATIONSTAR MORTGAGE LLC,FL,Web,Closed with monetary relief,No
99996,Debt collection,U.S. BANCORP,KY,Referral,Closed with explanation,Yes
99997,Credit reporting,"EQUIFAX, INC.",TX,Web,Closed with explanation,Yes
99998,Mortgage,Ocwen Financial Corporation,IL,Web,Closed with explanation,Yes


In [78]:
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(X_test[['Timely response?']])
pd.DataFrame(encoded_data.toarray())


Unnamed: 0,0,1
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
99995,1.0,0.0
99996,0.0,1.0
99997,0.0,1.0
99998,0.0,1.0


In [86]:
from category_encoders import BinaryEncoder

encoder = BinaryEncoder()

df_encoded = encoder.fit_transform(X_test[['Product','Timely response?','Submitted via','State','Company response to consumer','Company']])
pd.DataFrame(df_encoded).astype('float64').dtypes

Product_0                         float64
Product_1                         float64
Product_2                         float64
Product_3                         float64
Timely response?_0                float64
Timely response?_1                float64
Submitted via_0                   float64
Submitted via_1                   float64
Submitted via_2                   float64
State_0                           float64
State_1                           float64
State_2                           float64
State_3                           float64
State_4                           float64
State_5                           float64
Company response to consumer_0    float64
Company response to consumer_1    float64
Company response to consumer_2    float64
Company_0                         float64
Company_1                         float64
Company_2                         float64
Company_3                         float64
Company_4                         float64
Company_5                         

------------------------

In [142]:
binary_features = ['Product','Timely response?','Submitted via','State','Company response to consumer','Company']
# onehot_features = ['Timely response?']
# numerical_features =[str(x) for x in range(311)]

Create column tansformer for transformation

In [143]:
binary_encoder_pipeline = Pipeline(steps=[
    ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
    ('BinaryEncoder', BinaryEncoder())                               
])

preprocessor = ColumnTransformer(
    [
        ('binary_encoder_pipeline', binary_encoder_pipeline, binary_features)
    ]
,remainder='passthrough'
)

In [144]:
X = preprocessor.fit_transform(X)

In [147]:
X_df = pd.DataFrame(X)
X_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,332,333,334,335,336,337,338,339,340,341
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.469192,0.469192,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99997,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99998,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
np.array(X).dtype

dtype('float64')

In [150]:
X_df.columns

RangeIndex(start=0, stop=342, step=1)

In [151]:
y = np.where(y.values == 'Yes', 0, 1)

In [152]:
y

array([1, 1, 1, ..., 0, 0, 0])

## Handling Imbalanced Dataset

* Synthetic Minority oversampling Technique or SMOTE is another technique to oversample the minority class, duplicate the minority dataset.

* SMOTE is one of the famous oversampling techniques and is very effective in handling class imbalance. Combine SMOTE to some undersampling technique(ENN, Tomek) to increase the effectiveness of handling the minority class.

In [153]:
smt = SMOTETomek(random_state=42, sampling_strategy='minority', n_jobs=-1)
X_res, y_res = smt.fit_resample(X_df,y)



In [158]:
X_res.shape, y_res.shape

((92870, 342), (92870,))

## Model Selection


In [169]:
def evaluate_clf(true, prediction):
    acc = accuracy_score(true, prediction)
    f1 = f1_score(true, prediction)
    precision = precision_score(true, prediction)
    recall = recall_score(true, prediction)
    roc_auc = roc_auc_score(true, prediction)
    return (acc, f1, precision, recall, roc_auc)

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [170]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Logistic Regression': LogisticRegression(),
    'K-Neighbor classification': KNeighborsClassifier(),
    'XGBClassifier': XGBClassifier(),
    'CatBoost Classifier': CatBoostClassifier(verbose=False),
    'AdaBoost Classifier': AdaBoostClassifier()
}

In [173]:
def evaluation_models(X, y, models):
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size=0.2, 
                                                        random_state=42)
    
    models_list = []
    accuracy_list = []
    auc = []
    
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        model_train_accuracy, model_train_f1, model_train_precision,\
        model_train_recall, model_train_rocauc_score = evaluate_clf(y_train, y_train_pred)
        
        model_test_accuracy, model_test_f1, model_test_precision,\
        model_test_recall, model_test_rocauc_score = evaluate_clf(y_test, y_test_pred)
        
        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])
        
        print('Model Evalution on Training Data')
        print('- Accuracy Score:', model_train_accuracy)
        print('- F1 Score Score:', model_train_f1)
        print('- Precision Score:', model_train_precision)
        print('- Recall Score:', model_train_recall)
        print('- ROC AUC Score:', model_train_rocauc_score)
        print('------------------------------------------------------------------------------------------------')
        print('Model Evalution on Testing Data')
        print('- Accuracy Score:', model_test_accuracy)
        print('- F1 Score Score:', model_test_f1)
        print('- Precision Score:', model_test_precision)
        print('- Recall Score:', model_test_recall)
        print('- ROC AUC Score:', model_test_rocauc_score)
        
        auc.append(model_test_rocauc_score)
        
        print('='*35)
        print()
        
    report = pd.DataFrame(list(zip(models_list, accuracy_list)),
                          columns=['Model Name', 'Accuracy']).sort_values(by=['Accuracy'],ascending=False)
    
    return report

In [None]:
report = evaluation_models(X=X_res, y=y_res, models = models)