In [71]:
import os
import re
import pandas as pd
import string
from dotenv import find_dotenv, load_dotenv

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [12]:
load_dotenv()

True

In [41]:
RANDOM_STATE: int = 42
DIR_DATA_RAW: str = os.getenv("DIR_DATA_RAW")

In [42]:
df = pd.read_csv(f"../{DIR_DATA_RAW}/consumer_complaints.csv")
df.head()

  df = pd.read_csv(f"../{DIR_DATA_RAW}/consumer_complaints.csv")


Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,U.S. Bancorp,CA,95993,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,Wells Fargo & Company,CA,91104,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,,,Wells Fargo & Company,NY,11764,,,Postal mail,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,,,"Navient Solutions, Inc.",MD,21402,,,Email,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,,,Resurgent Capital Services L.P.,GA,30106,,,Web,08/30/2013,Closed with explanation,Yes,Yes,511067


In [43]:
# Removing Unwanted Data
df = df[['product', 'consumer_complaint_narrative']]

In [44]:
df.isnull().sum()

product                              0
consumer_complaint_narrative    489151
dtype: int64

In [45]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [46]:
X = df[['consumer_complaint_narrative']].copy()
y = df['product'].copy()

In [149]:
y.value_counts()

Debt collection            17552
Mortgage                   14919
Credit reporting           12526
Credit card                 7929
Bank account or service     5711
Consumer Loan               3678
Student loan                2128
Prepaid card                 861
Payday loan                  726
Money transfers              666
Other financial service      110
Name: product, dtype: int64

In [None]:
count_vectorizer = CountVectorizer(stop_words='english')

In [132]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, remove_stopwords=True, lower_case=True, remove_punctuations=True, remove_digits=True, remove_extraspaces=True,
                 stemming=False, lemmatization=False):
        self.remove_stopwords = remove_stopwords
        self.lower_case = lower_case
        self.remove_punctuations = remove_punctuations
        self.remove_digits = remove_digits
        self.remove_extraspaces = remove_extraspaces
        self.stemming = stemming
        self.lemmatization = lemmatization
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_ = X.iloc[:, 0].copy()
        if self.remove_stopwords:
            stop_words = set(stopwords.words('english'))
            X_ = X_.apply(lambda x: " ".join([item for item in x.split() if item not in stop_words]))
            
        if self.lower_case:
            X_ = X_.str.lower()
            
        if self.remove_punctuations:
            X_ = X_.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , x))
            
        if self.remove_digits:
            X_ = X_.str.replace('\d+', '', regex=True)
            
        if self.remove_extraspaces:
            X_ = X_.apply(lambda x: re.sub(' +', ' ', x))
                        
        if self.stemming:
            stemmer = PorterStemmer()
            X_ = X_.apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))
                        
        if self.lemmatization:
            lemmatizer = WordNetLemmatizer()
            X_ = X_.apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))
        
        return X_

In [140]:
text_preprocessor = TextPreprocessor(lemmatization=True)

In [145]:
text_preprocessor.fit_transform(X)

190126    xxxx claimed i owe xxxx year despite proof pay...
190135    due inconsistency amount owed i told m t bank ...
190155    in xxxxxxxx wage i earned job decreased almost...
190207    i open current mortgage chase bank xxxx chase ...
190208    xxxx submitted xxxxxxxx at time i submitted co...
                                ...                        
553084    xxxx xxxx reporting incorrectly payment time v...
553085        reflecting incorrect payment status have time
553086    i paying month direct debit withdrawal checkin...
553090    i recently became aware amerisave mortgage cor...
553096    bank america demonstrated ongoing level incomp...
Name: consumer_complaint_narrative, Length: 66806, dtype: object

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)