In [1]:
import pandas as pd

# Load cleaned data
df = pd.read_csv("../data/amex_cleaned_complaints.csv")

# Confirm data is loaded
print("Rows:", len(df))
print("Sample complaint:\n", df['consumer_complaint_narrative'].iloc[0])


Rows: 5409
Sample complaint:
 I am writing to file a formal complaint with the Consumer Financial Protection Bureau ( CFPB ) regarding my XXXX XXXX American Express card account, issued by American Express. I am deeply dissatisfied with the recent changes to the card 's benefits, which have rendered the card significantly less valuable to me as a consumer. 

I have been a loyal customer have held the XXXX XXXX American Express card for 1 year and have been a Skymiles member since XXXX. I initially signed up for this card because of the enticing benefits it offered, including [ paid XXXX XXXX XXXX access, priority boarding, and boosts when spending {$25000.00} and {$50000.00}. ]. These benefits were a major factor in my decision to become a cardholder and to continue paying the annual fee. 

However, I was recently informed that effective XX/XX/XXXX, several of the benefits that initially attracted me to this card will no longer be available. The specific changes that have been announce

In [2]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)            # remove URLs
    text = re.sub(r'[^a-z\s]', '', text)                  # remove numbers/punctuation
    text = re.sub(r'\s+', ' ', text).strip()              # remove extra whitespace
    return text

df['clean_text'] = df['consumer_complaint_narrative'].apply(clean_text)
print("Text cleaned.")


Text cleaned.


In [3]:
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Only run once
nltk.download('vader_lexicon')

analyzer = SentimentIntensityAnalyzer()

# Apply VADER
df['vader_sentiment'] = df['clean_text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Sentiment label
df['sentiment_label'] = df['vader_sentiment'].apply(
    lambda x: 'positive' if x > 0.2 else 'negative' if x < -0.2 else 'neutral'
)

print("Sentiment analysis complete.")
print("Sentiment counts:\n", df['sentiment_label'].value_counts())


[nltk_data] Error loading vader_lexicon: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


Sentiment analysis complete.
Sentiment counts:
 sentiment_label
positive    2834
negative    2062
neutral      513
Name: count, dtype: int64


In [4]:
df.to_csv("../data/amex_complaints_sentiment.csv", index=False)
print("Saved sentiment-scored file to: ../data/amex_complaints_sentiment.csv")


Saved sentiment-scored file to: ../data/amex_complaints_sentiment.csv
