In [32]:
# customer_grievance_analysis.py

import pandas as pd
import numpy as np
import re
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import nltk
import string
import torch
from datetime import datetime

# Download required NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)

# Step 1: Data Understanding & Preprocessing
try:
    # Attempt to read the CSV file
    df = pd.read_csv('banking_complaints_2023.csv')
except FileNotFoundError:
    print("File 'banking_complaints_2023.csv' not found. Creating a sample dataset.")
    # Create a sample dataset
    sample_data = {
        'Complaint Description': [
            "The ATM ate my card and I couldn't withdraw money!",
            "Loan approval took too long, very frustrating.",
            "Online banking app crashes every time I log in.",
            "Great service at the branch, very helpful staff.",
            "Fraudulent charges on my credit card, need refund!",
            "Checking account fees are too high, unacceptable.",
            "Mortgage application process was smooth and fast.",
            "Customer service was rude when I called about my issue."
        ],
        'Department': [
            'ATM Services', 'Loans', 'Online Banking', 'Branch Services',
            'Credit Card', 'Checking Account', 'Mortgage', 'Customer Service'
        ],
        'Date': [
            '2023-01-15', '2023-02-10', '2023-03-05', '2023-04-20',
            '2023-05-12', '2023-06-18', '2023-07-22', '2023-08-30'
        ]
    }
    df = pd.DataFrame(sample_data)

# Check and correct data types
print("\nInitial Data Types:")
print(df.dtypes)

df['Complaint Description'] = df['Complaint Description'].astype(str)
df['Department'] = df['Department'].astype('category')
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Find date range
if 'Date' in df.columns:
    date_range = (df['Date'].min(), df['Date'].max())
    print(f"Date Range: {date_range[0]} to {date_range[1]}")
else:
    print("No Date column found.")

# Define preprocessing function
def preprocessing(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Convert to lowercase
    text = text.lower()

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and punctuation
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # Apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into text
    return ' '.join(tokens)

# Apply preprocessing to 'Complaint Description'
df['Cleaned_Complaint'] = df['Complaint Description'].apply(preprocessing)

# Step 2: Text Feature Engineering (TF-IDF)
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['Cleaned_Complaint']).toarray()
y = df['Department']

# Step 3: Complaint Classification (Random Forest as baseline)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, zero_division=0))

# Step 4: Transformer-based Modeling
# Load pre-trained transformer model (DistilBERT) for classification
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
try:
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(y.cat.categories))
except Exception as e:
    print(f"Error loading transformer model: {e}. Using pipeline for inference.")

# Tokenize complaints for transformer
def tokenize_complaints(texts, max_length=128):
    return tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

# Prepare data for transformer
encoded_data = tokenize_complaints(df['Cleaned_Complaint'])
labels = pd.Categorical(df['Department']).codes

# Split indices for train-test split
indices = np.arange(len(labels))
X_train_idx, X_test_idx, y_train_trans, y_test_trans = train_test_split(
    indices, labels, test_size=0.2, random_state=42
)

# Convert to PyTorch dataset
class ComplaintDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, indices):
        self.encodings = encodings
        self.labels = labels
        self.indices = indices

    def __getitem__(self, idx):
        idx = self.indices[idx]
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.indices)

train_dataset = ComplaintDataset(encoded_data, labels, X_train_idx)
test_dataset = ComplaintDataset(encoded_data, labels, X_test_idx)

# Use pipeline for inference (simplified due to computational constraints)
try:
    classifier = pipeline("text-classification", model=model_name, tokenizer=model_name, top_k=None)
    transformer_preds = classifier(df['Cleaned_Complaint'].tolist())
    transformer_labels = [max(pred, key=lambda x: x['score'])['label'] for pred in transformer_preds]
    print("\nTransformer Model Predictions (Sample):")
    print(transformer_labels[:5])  # Display first 5 for brevity
except Exception as e:
    print(f"Error running transformer pipeline: {e}. Skipping transformer predictions.")

# Step 5: Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound']

df['Sentiment_Score'] = df['Cleaned_Complaint'].apply(get_sentiment)
sentiment_by_dept = df.groupby('Department', observed=True)['Sentiment_Score'].mean().sort_values()
print("\nAverage Sentiment Score by Department:")
print(sentiment_by_dept)

# Step 6: Business Insights & Strategy Recommendations
print("\nBusiness Insights and Recommendations:")
for dept, score in sentiment_by_dept.items():
    if score < -0.5:
        print(f"Urgent: {dept} (Score: {score:.2f}) - Investigate critical issues (e.g., fraud, outages). Escalate to compliance team.")
    elif score < 0:
        print(f"{dept} (Score: {score:.2f}) - Address process inefficiencies (e.g., streamline loan approvals).")
    elif score < 0.5:
        print(f"{dept} (Score: {score:.2f}) - Monitor for emerging issues; enhance UX (e.g., improve app interface).")
    else:
        print(f"{dept} (Score: {score:.2f}) - Leverage positive feedback for marketing or loyalty programs.")

# Save processed data
df.to_csv('processed_complaints.csv', index=False)
print("\nProcessed data saved to 'processed_complaints.csv'.")

# Step 7: Generate Chart.js configuration for visualization
sentiment_by_dept_dict = sentiment_by_dept.to_dict()
labels = list(sentiment_by_dept_dict.keys())
scores = list(sentiment_by_dept_dict.values())

chart_config = {
    "type": "bar",
    "data": {
        "labels": labels,
        "datasets": [{
            "label": "Average Sentiment Score",
            "data": scores,
            "backgroundColor": ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FF9F43", "#D4A5A5", "#66B2B2", "#FFD1DC"],
            "borderColor": ["#D00000", "#008080", "#005566", "#5B8A72", "#FF7F00", "#A66A6A", "#3D7A7A", "#FF8C94"],
            "borderWidth": 1
        }]
    },
    "options": {
        "scales": {
            "y": {
                "beginAtZero": True,
                "title": {
                    "display": True,
                    "text": "Sentiment Score"
                }
            },
            "x": {
                "title": {
                    "display": True,
                    "text": "Department"
                }
            }
        },
        "plugins": {
            "title": {
                "display": True,
                "text": "Average Sentiment Scores by Department"
            }
        }
    }
}

print("\nChart.js Configuration for Sentiment Scores:")
print(json.dumps(chart_config, indent=2))

File 'banking_complaints_2023.csv' not found. Creating a sample dataset.

Initial Data Types:
Complaint Description    object
Department               object
Date                     object
dtype: object
Date Range: 2023-01-15 00:00:00 to 2023-08-30 00:00:00

Random Forest Classification Report:
                  precision    recall  f1-score   support

Checking Account       0.00      0.00      0.00       1.0
           Loans       0.00      0.00      0.00       1.0
        Mortgage       0.00      0.00      0.00       0.0

        accuracy                           0.00       2.0
       macro avg       0.00      0.00      0.00       2.0
    weighted avg       0.00      0.00      0.00       2.0



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu



Transformer Model Predictions (Sample):
['LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0']

Average Sentiment Score by Department:
Department
Checking Account   -0.4588
Customer Service   -0.4588
Online Banking     -0.4019
Credit Card        -0.1531
Mortgage            0.0000
ATM Services        0.0000
Loans               0.0516
Branch Services     0.7845
Name: Sentiment_Score, dtype: float64

Business Insights and Recommendations:
Checking Account (Score: -0.46) - Address process inefficiencies (e.g., streamline loan approvals).
Customer Service (Score: -0.46) - Address process inefficiencies (e.g., streamline loan approvals).
Online Banking (Score: -0.40) - Address process inefficiencies (e.g., streamline loan approvals).
Credit Card (Score: -0.15) - Address process inefficiencies (e.g., streamline loan approvals).
Mortgage (Score: 0.00) - Monitor for emerging issues; enhance UX (e.g., improve app interface).
ATM Services (Score: 0.00) - Monitor for emerging issues; enhance UX 