In [1]:
import pandas as pd
import numpy as np
import nltk
from IPython.display import display
from decimal import *
from nltk.sentiment import SentimentIntensityAnalyzer

### Useful column cleaning helper functions

In [2]:
### Define function
def trim_head_tail_space(df_column):
    clean_df_column = df_column.str.strip()
    print("row changes in column " + str(df_column.name) +": " ,sum(df_column!=clean_df_column))
    return clean_df_column

In [3]:
### Define function
def remove_consecutive_spaces(df_column):
    clean_df_column = df_column.replace('\s\s+', ' ', regex=True)
    print("row changes in column " + str(df_column.name) +": " ,sum(df_column!=clean_df_column))
    return clean_df_column

In [4]:
### Define function
def remove_special_characters(df_column,bad_characters_list):
    clean_df_column = df_column
    for bad_char in bad_characters_list:
        clean_df_column = clean_df_column.str.replace(bad_char,' ')
        print("row changes in column " + str(df_column.name) + " after removing character " + str(bad_char) + ": " ,sum(df_column!=clean_df_column))
    clean_df_column = clean_df_column.str.title()
    return clean_df_column

### Import data and perform initial exploratory data analysis

In [5]:
emails_raw = pd.read_csv("./emails_data/consumer_complaints.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
emails_raw.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,U.S. Bancorp,CA,95993,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,Wells Fargo & Company,CA,91104,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,,,Wells Fargo & Company,NY,11764,,,Postal mail,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,,,"Navient Solutions, Inc.",MD,21402,,,Email,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,,,Resurgent Capital Services L.P.,GA,30106,,,Web,08/30/2013,Closed with explanation,Yes,Yes,511067


In [7]:
emails_raw.describe

<bound method NDFrame.describe of        date_received           product                  sub_product  \
0         08/30/2013          Mortgage               Other mortgage   
1         08/30/2013          Mortgage               Other mortgage   
2         08/30/2013  Credit reporting                          NaN   
3         08/30/2013      Student loan     Non-federal student loan   
4         08/30/2013   Debt collection                  Credit card   
...              ...               ...                          ...   
555952    07/01/2014          Mortgage               Other mortgage   
555953    07/01/2014          Mortgage               Other mortgage   
555954    07/10/2012          Mortgage  Conventional fixed mortgage   
555955    04/14/2015   Debt collection                I do not know   
555956    08/14/2014   Debt collection                I do not know   

                                           issue  \
0       Loan modification,collection,foreclosure   
1       L

In [8]:
print('Summary of missing data per percent per column')
print('-' * 30)
for col in emails_raw.columns:
    percentage_missing = np.mean(emails_raw[col].isnull())
    print('{} - {}%'.format(col, round(percentage_missing*100)))

Summary of missing data per percent per column
------------------------------
date_received - 0%
product - 0%
sub_product - 28%
issue - 0%
sub_issue - 62%
consumer_complaint_narrative - 88%
company_public_response - 85%
company - 0%
state - 1%
zipcode - 1%
tags - 86%
consumer_consent_provided - 78%
submitted_via - 0%
date_sent_to_company - 0%
company_response_to_consumer - 0%
timely_response - 0%
consumer_disputed? - 0%
complaint_id - 0%


### Sentiment assignment for initial email data.

Instead of manually assigning Sentiment Label for each email, we are
going to use The "vader_lexicon" package form the nltk package to assign each
email in our dataset a label of either neg, neu and pos for negative, neutral and positive 

In [9]:
nltk.download([
     "vader_lexicon",
 ])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\AD\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [10]:
sentiment = SentimentIntensityAnalyzer()

### Perform some text data cleaning

In [11]:
trim_head_tail_space(emails_raw['consumer_complaint_narrative'])

row changes in column consumer_complaint_narrative:  555522


0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
555952    NaN
555953    NaN
555954    NaN
555955    NaN
555956    NaN
Name: consumer_complaint_narrative, Length: 555957, dtype: object

In [12]:
bad_characters_list=['\n', '*']
remove_special_characters(emails_raw['consumer_complaint_narrative'],bad_characters_list)

row changes in column consumer_complaint_narrative after removing character 
:  555676
row changes in column consumer_complaint_narrative after removing character *:  555679


  """


0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
555952    NaN
555953    NaN
555954    NaN
555955    NaN
555956    NaN
Name: consumer_complaint_narrative, Length: 555957, dtype: object

In [13]:
remove_consecutive_spaces(emails_raw['consumer_complaint_narrative'])

row changes in column consumer_complaint_narrative:  555774


0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
555952    NaN
555953    NaN
555954    NaN
555955    NaN
555956    NaN
Name: consumer_complaint_narrative, Length: 555957, dtype: object

In [14]:
filtered_emails = emails_raw[emails_raw['consumer_complaint_narrative'].notnull()]

In [15]:
filtered_emails.head(3)

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
190126,03/19/2015,Debt collection,"Other (i.e. phone, health club, etc.)",Cont'd attempts collect debt not owed,Debt was paid,XXXX has claimed I owe them {$27.00} for XXXX ...,,"Diversified Consultants, Inc.",NY,121XX,Older American,Consent provided,Web,03/19/2015,Closed with explanation,Yes,No,1290516
190135,03/19/2015,Consumer Loan,Vehicle loan,Managing the loan or lease,,Due to inconsistencies in the amount owed that...,,M&T Bank Corporation,VA,221XX,Servicemember,Consent provided,Web,03/19/2015,Closed with explanation,Yes,No,1290492
190155,03/19/2015,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,In XX/XX/XXXX my wages that I earned at my job...,,Wells Fargo & Company,CA,946XX,,Consent provided,Web,03/19/2015,Closed with explanation,Yes,Yes,1290524


In [16]:
filtered_emails.shape

(66806, 18)

### Create new data frame to hold annotated values, assign those values

In [18]:
sentiment_df = pd.DataFrame(columns = ["sentiment","text"])

In [19]:
for text in filtered_emails['consumer_complaint_narrative']:
#for text in filtered_emails['consumer_complaint_narrative'][:10]:  
    sent = 'neu'
    m_stats = [0,0]
    
    #tweak these values
    neg_min = Decimal('.1')
    pos_min = Decimal('.15')
    
    negative = Decimal(str(sentiment.polarity_scores(text)['neg']))
    positive = Decimal(str(sentiment.polarity_scores(text)['pos']))
    
    if neg_min < negative:
        sent = 'neg'   
    elif pos_min < positive:
        sent = 'pos'  
    else:      
        sent = 'neu'   

    m_stats[1] = text
    m_stats[0] = sent
    
    sentiment_df = sentiment_df.append(pd.Series(m_stats, index=['sentiment','text']), ignore_index=True)
    
print("Preprocessing complete")   

Preprocessing complete


In [20]:
sentiment_df.head(10)

Unnamed: 0,sentiment,text
0,neg,XXXX has claimed I owe them {$27.00} for XXXX ...
1,neu,Due to inconsistencies in the amount owed that...
2,neu,In XX/XX/XXXX my wages that I earned at my job...
3,neu,I have an open and current mortgage with Chase...
4,neg,XXXX was submitted XX/XX/XXXX. At the time I s...
5,neu,Experian is reporting my OPEN and CURRENT Mort...
6,neg,This complaint is against Wells Fargo Bank for...
7,neu,I spoke to XXXX of green tree representatives ...
8,neu,i opened XXXX Bank of America credit cards 15-...
9,neu,I applied for a loan with XXXX XXXX and had pu...


In [21]:
#Test
row = 0
text = sentiment_df['text'].values[row]
msentiment= sentiment.polarity_scores(sentiment_df['text'].values[row])
print(f'{msentiment} \n {text}')

{'neg': 0.126, 'neu': 0.874, 'pos': 0.0, 'compound': -0.7661} 
 XXXX has claimed I owe them {$27.00} for XXXX years despite the PROOF of PAYMENT I sent them : canceled check and their ownPAID INVOICE for {$27.00}! 
They continue to insist I owe them and collection agencies are after me. 
How can I stop this harassment for a bill I already paid four years ago? 



### Export

In [22]:
sentiment_df.to_csv('new_emails.csv', encoding='utf-8',index = None, header='true')