In [8]:
!pip install faiss-cpu



In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import os
import numpy as np
import faiss
from datetime import datetime
import json
import spacy

# Initializing Components and Models
print("Initializing components...")
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
summarizer = pipeline("summarization", model = "facebook/bart-large-cnn")
sentiment_analyzer = pipeline("sentiment-analysis")
nlp = spacy.load("en_core_web_sm")

# Load Dataset
print("Loading dataset...")
data_file = "processed_complaints.csv"
if os.path.exists(data_file):
    import pandas as pd
    data = pd.read_csv(data_file)
    data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
    data['processed_narrative'] = data['processed_narrative'].str.lower().str.strip()
else:
    raise FileNotFoundError("Dataset file not found!")

# Load or compute embeddings
embeddings_file = "embeddings.npy"
if os.path.exists(embeddings_file):
    print("Loading precomputed embeddings...")
    embeddings = np.load(embeddings_file)
else:
    print("Computing embeddings...")
    embeddings = np.array(embedder.encode(data['processed_narrative'].tolist()))
    np.save(embeddings_file, embeddings)

# Initialize FAISS index
dimension = embeddings.shape[1]
index = faiss.read_index('faiss_index.bin')
print(f"FAISS index initialized with {index.ntotal} entries.")

# Functions
def retrieve_documents(query, k=3):
    query_embeddings = embedder.encode([query])
    distances, indices = index.search(query_embeddings, k)
    return [
        {"document": data.iloc[i]['narrative'], "distance": dist}
        for i, dist in zip(indices[0], distances[0])
    ]

def summarize_text(text):
    return summarizer(text, max_length=100, min_length=25, do_sample=False)[0]["summary_text"]

def analyze_sentiment(text):
    return sentiment_analyzer(text)[0]

def extract_entities(text):
  doc = nlp(text)
  return [(ent.text, ent.label_) for ent in doc.ents]

def log_session(query, response, sentiment, entities):
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "response": response,
        "sentiment": sentiment,
        "entities": entities
    }
    log_file = "chatbot_logs.json"

    if os.path.exists(log_file):
        with open(log_file, 'r') as file:
            logs = json.load(file)
    else:
        logs = []
    logs.append(log_entry)

    with open(log_file, 'w') as file:
        json.dump(logs, file, indent = 4)

def chatbot():
    print("Welcome to the Customer Support Chatbot!")
    print("Type 'exit' to quit.")
    while True:
        query = input("\nYour Query: ")
        if query.lower() == "exit":
            print("Thank you for using the chatbot!")
            break

        retrieved_docs = retrieve_documents(query)
        if not retrieved_docs:
            print("No relevant documents found.")
            continue

        top_document = retrieved_docs[0]['document']
        summary = summarize_text(top_document)

        sentiment = analyze_sentiment(top_document)
        entities = extract_entities(top_document)

        print("\n--- Chatbot Response ---")
        print(f"Response: {summary}")
        print(f"Sentiment: {sentiment['label']} (Score: {sentiment['score']:.2f})")
        print("Entities:", entities)
        print("\n--- Additional Info ---")
        for doc in retrieved_docs:
            print(f"Retrieved Doc: {doc['document']}\nDistance: {doc['distance']:.4f}\n")

        log_session(query, summary, sentiment, entities)

if __name__ == "__main__":
    chatbot()


Initializing components...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Loading dataset...
Loading precomputed embeddings...
FAISS index initialized with 50000 entries.
Welcome to the Customer Support Chatbot!
Type 'exit' to quit.

Your Query: Why was my credit card charged twice?


Your max_length is set to 100, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)



--- Chatbot Response ---
Response: Two credit card stolen last night. One one debit charge done last night charge amount of $1,000. Two credit cards stolen. One debit card stolen.
Sentiment: NEGATIVE (Score: 0.98)
Entities: [('two', 'CARDINAL'), ('last night', 'TIME'), ('last night', 'TIME')]

--- Additional Info ---
Retrieved Doc: two credit card stolen last night one one debit charge done last night charge amount
Distance: 0.6169

Retrieved Doc: credit card company double charged product purchased card purchase made charge duplicated disputed duplicate charge company continues demand pay product twice contacted company made purchase claim payed yet
Distance: 0.6345

Retrieved Doc: cancelled card charged
Distance: 0.7137


Your Query: Explain why my refund is delayed.


Your max_length is set to 100, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)



--- Chatbot Response ---
Response:  refund told wait day money refund back saving account waited still receive money called boa said receive money said already refund money back account.
Sentiment: NEGATIVE (Score: 0.99)
Entities: [('day', 'DATE'), ('boa', 'PERSON')]

--- Additional Info ---
Retrieved Doc: refund told wait day money refund back saving account waited still receive money called boa said receive money said already refund money back account
Distance: 0.7159

Retrieved Doc: hey may ask get refund dollar refund back name date date thank
Distance: 0.7567

Retrieved Doc: purchased flight ticket wife throw delayed made miss rest flight covered full refund policy told u file refund application day receive money back never got refund contacted asked u wait time specific date waited filed claim capital one credit card service answer stating didnt contact refund refund policy provide documentation proof policy delayed flight refund application document proved right also showed com

Your max_length is set to 100, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)



--- Chatbot Response ---
Response: A former U.S. Marine was arrested on suspicion of sexually assaulting a woman. The charges were later dropped and the woman was cleared of all charges.
Sentiment: NEGATIVE (Score: 0.99)
Entities: []

--- Additional Info ---
Retrieved Doc: disputed charge
Distance: 0.6347

Retrieved Doc: filed dispute given credit charge others denied without explanation charge charged dollar fee led charged fee
Distance: 0.6794

Retrieved Doc: dispute service concerning charge account made effort discard reservation inadvertently booked mistake call immediately placed explain mistake requested cancellation doubt transaction transmitted az charged reservation never processed service used following dispute charge case merchant dispute amount
Distance: 0.7404


Your Query: How do I resolve an issue with a retail banking account?


Your max_length is set to 100, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)



--- Chatbot Response ---
Response: CNN.com is unable to link bank account account account to account linked to CNN.com. Please email us at jennifer.linning@cnn.com with any photos or video of you.
Sentiment: NEGATIVE (Score: 1.00)
Entities: []

--- Additional Info ---
Retrieved Doc: unable link bank account account
Distance: 0.6631

Retrieved Doc: cant open bank account bank america owe fraud consumer report false ive tried dispute tell nothing want record
Distance: 0.6986

Retrieved Doc: saw checking account transferred primary bank first national bank dont account open done went ahead contacted first national filed complaint directly im currently waiting paper work complete complaint process also second time two month unauthorized payment taken directly account im sure identity compromised
Distance: 0.7097


Your Query: What should I do about unauthorized transactions?


Your max_length is set to 100, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)



--- Chatbot Response ---
Response: Fraudulent unauthorized inquires report. fraudulent unauthorized inquiries report. fraudulent unauthorized inquiries report. Fraudulent unauthorized Inquiries Report. fraudulently unauthorized Inquires Report.
Sentiment: NEGATIVE (Score: 1.00)
Entities: []

--- Additional Info ---
Retrieved Doc: fraudulent unauthorized inquires report
Distance: 0.8396

Retrieved Doc: fraudulent unauthorized inquires report
Distance: 0.8396

Retrieved Doc: fraudulent unauthorized inquires report
Distance: 0.8396


Your Query: Why did my payment history disappear from the credit report?


Your max_length is set to 100, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)



--- Chatbot Response ---
Response: U.S. credit information reported delinquent paid a year ago. Credit information reported delinquency paid year ago: $1.2 billion.
Sentiment: NEGATIVE (Score: 0.99)
Entities: [('year ago', 'DATE')]

--- Additional Info ---
Retrieved Doc: credit information reported delinquent paid year ago
Distance: 0.4527

Retrieved Doc: credit information reported delinquent paid year ago
Distance: 0.4527

Retrieved Doc: credit information reported delinquent paid year ago
Distance: 0.4527


Your Query: Why did Comenity Bank change my credit limit without notifying me?


Your max_length is set to 100, but your input_length is only 81. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)



--- Chatbot Response ---
Response:  credit report changed since got card incorrect asked information wasnt notified agreement didnt information caused utilization change xxxxredit score lower point rep wasnt able answer question bill always paid time month balance reward program lowering credit limit automatically disqualify taking advantage reward discrimination letter email notifying change changing credit limit charged lowered.
Sentiment: NEGATIVE (Score: 1.00)
Entities: [('comenity bank', 'ORG'), ('bill bal', 'PERSON')]

--- Additional Info ---
Retrieved Doc: comenity bank lowered credit limit tried using card said suspended didnt rec bill bal said got credit report changed since got card incorrect asked information wasnt notified agreement doesnt information caused utilization change xxxxredit score lower point rep wasnt able answer question bill always paid time month balance reward program lowering credit limit automatically disqualify taking advantage reward discrimination let

Your max_length is set to 100, but your input_length is only 93. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)



--- Chatbot Response ---
Response: Well fargo credit card service card went make payment credit card saw card balance extremely high four fraudulent charge authorize two back back charge respectively another charge target finally charge vendor say well fargo payment assistance program credit card payment still work.
Sentiment: NEGATIVE (Score: 1.00)
Entities: [('four', 'CARDINAL'), ('two', 'CARDINAL'), ('month', 'DATE')]

--- Additional Info ---
Retrieved Doc: contacted well fargo credit card service card went make payment credit card saw card balance extremely high four fraudulent charge authorize two back back charge respectively another charge target finally charge vendor say well fargo payment assistance program credit card payment prior since still work due covid frugal spending used card month time period called make aware fraudulent charge let know could said help take care fast forward see balance transfer well fargo put fraudulent charge back onto account saying liable never 

Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)



--- Chatbot Response ---
Response: Currently mortgage freedom mortgage requested put forbearance due beginning began requesting taken forbearance called countless time made payment brought account current still removed forbearance. Multiple request told purposely kept forbearance protect point still forbearance continuously told take nothing happening absolutely unacceptable.
Sentiment: NEGATIVE (Score: 1.00)
Entities: []

--- Additional Info ---
Retrieved Doc: currently mortgage freedom mortgage requested put forbearance due beginning began requesting taken forbearance called countless time made payment brought account current still removed forbearance multiple request told purposely kept forbearance protect point still forbearance continuously told take nothing happening absolutely unacceptable
Distance: 0.6317

Retrieved Doc: currently month forbearance plan lender despite reported negatively credit bureau resulting point drop score late mortgage changed yet submitting false inform

Your max_length is set to 100, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)



--- Chatbot Response ---
Response:  capitalone charged back twice disputed charge occurred due to dissatisfaction merchant contacted capitalone dispute amount. merchant requested information provided without giving mr courtesy phone call letter charged back amount twice.
Sentiment: NEGATIVE (Score: 0.98)
Entities: []

--- Additional Info ---
Retrieved Doc: capitalone charged back twice disputed charge occurred due dissatisfaction merchant contacted capitalone dispute amount requested information provided without giving mr courtesy phone call letter charged back amount twice
Distance: 0.8343

Retrieved Doc: cancelled order charge remained bill refund issued contacted capital one credit resolve dispute capital one issued temporary credit credit account investigated dispute capital one concluded merchant credited account problem charge still credit card bill capital one take charge bill even though merchant issued credit date charge went store management gave paper work order order cance

Your max_length is set to 100, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)



--- Chatbot Response ---
Response:  account showing late payment charge is taken from a bank account. The account was used to pay a late payment to a bank. The bank has since taken the account back.
Sentiment: NEGATIVE (Score: 0.97)
Entities: []

--- Additional Info ---
Retrieved Doc: account showing late payment charge
Distance: 0.5619

Retrieved Doc: account showing late payment charge
Distance: 0.5619

Retrieved Doc: account showing late payment charge
Distance: 0.5619


Your Query: exit
Thank you for using the chatbot!
Initializing components...


Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Loading dataset...
Loading precomputed embeddings...
FAISS index initialized with 50000 entries.
Welcome to the Customer Support Chatbot!
Type 'exit' to quit.

Your Query: exit
Thank you for using the chatbot!


In [11]:
import json
from collections import Counter

with open('chatbot_logs.json', 'r') as file:
  logs = json.load(file)

queries = [log['query'] for log in logs]
query_count = Counter(queries)

print("Most common queries:")
for query, count in query_count.most_common(5):
  print(f"{query}: {count} times")

Most common queries:
Why was my credit card charged twice?: 2 times
Explain why my refund is delayed.: 2 times
What is the process to dispute a charge?: 2 times
How do I resolve an issue with a retail banking account?: 2 times
What should I do about unauthorized transactions?: 2 times
Most common queries:
Why was my credit card charged twice?: 2 times
Explain why my refund is delayed.: 2 times
What is the process to dispute a charge?: 2 times
How do I resolve an issue with a retail banking account?: 2 times
What should I do about unauthorized transactions?: 2 times


In [12]:
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
query_embeddings = embedder.encode(queries)

kmeans = KMeans(n_clusters=5, random_state=42).fit(query_embeddings)
clustered_queries = {i: [] for i in range(5)}

for query, label in zip(queries, kmeans.labels_):
  clustered_queries[label].append(query)

for cluster, queries in clustered_queries.items():
  print(f"Cluster {cluster}:")
  for query in queries[:5]:  # Limit to 5 examples per cluster
    print(f"  - {query}")

Cluster 0:
  - Why was my credit card charged twice?
  - Why was my credit card charged twice?
  - Can you help me with fraudulent charges on my Wells Fargo debit card?
  - Why was I charged late fees even though I paid my credit card bill on time?
Cluster 1:
  - How do I resolve an issue with a retail banking account?
  - What should I do about unauthorized transactions?
  - How do I resolve an issue with a retail banking account?
  - What should I do about unauthorized transactions?
Cluster 2:
  - What is the process to dispute a charge?
  - What is the process to dispute a charge?
  - How do I handle a dispute with Capital One about a canceled order?
Cluster 3:
  - Why did my payment history disappear from the credit report?
  - Why did my payment history disappear from the credit report?
  - Why did Comenity Bank change my credit limit without notifying me?
  - What should I do if my mortgage account was marked as forbearance incorrectly?
Cluster 4:
  - Explain why my refund is del

In [13]:
high_negatives = [log for log in logs if log['sentiment']['label'] == 'NEGATIVE' and log['sentiment']['score'] > 0.99]

print("Queries with high negative sentiment:")
for log in high_negatives:
  print(f"Query: {log['query']}")
  print(f"Sentiment Score: {log['sentiment']['score']}")
  print("-" * 30)

Queries with high negative sentiment:
Query: Explain why my refund is delayed.
Sentiment Score: 0.9938639402389526
------------------------------
Query: How do I resolve an issue with a retail banking account?
Sentiment Score: 0.9995288848876953
------------------------------
Query: What should I do about unauthorized transactions?
Sentiment Score: 0.9986843466758728
------------------------------
Query: Why did my payment history disappear from the credit report?
Sentiment Score: 0.9906863570213318
------------------------------
Query: Explain why my refund is delayed.
Sentiment Score: 0.9938639402389526
------------------------------
Query: How do I resolve an issue with a retail banking account?
Sentiment Score: 0.9995288848876953
------------------------------
Query: What should I do about unauthorized transactions?
Sentiment Score: 0.9986843466758728
------------------------------
Query: Why did my payment history disappear from the credit report?
Sentiment Score: 0.99068635702133

In [14]:
def explain_retrieval(query, k=3):
  query_embedding = embedder.encode([query])
  distances, indices = index.search(query_embedding, k)

  explanations = []
  for i, dist in zip(indices[0], distances[0]):
    explanations.append({
      "retrieved_doc": data.iloc[i]['narrative'],
      "distance": dist
    })
  return explanations

explanations = explain_retrieval(query)
for explanation in explanations:
  print(f"Retrieved Document: {explanation['retrieved_doc']}")
  print(f"Distance Score: {explanation['distance']}")
  print("-" * 30)

Retrieved Document: refund told wait day money refund back saving account waited still receive money called boa said receive money said already refund money back account
Distance Score: 0.7158880233764648
------------------------------
Retrieved Document: hey may ask get refund dollar refund back name date date thank
Distance Score: 0.7567122578620911
------------------------------
Retrieved Document: purchased flight ticket wife throw delayed made miss rest flight covered full refund policy told u file refund application day receive money back never got refund contacted asked u wait time specific date waited filed claim capital one credit card service answer stating didnt contact refund refund policy provide documentation proof policy delayed flight refund application document proved right also showed commenting fraud capital one closer case even proved right agent doesnt seems care feel neglected took advantage want right citizen united state low say
Distance Score: 0.827225208282470

In [None]:
# Sample Queries
# Why was my credit card charged twice?
# Explain why my refund is delayed.
# What is the process to dispute a charge?
# How do I resolve an issue with a retail banking account?
# What should I do about unauthorized transactions?
# Why did my payment history disappear from the credit report?
# Why did Comenity Bank change my credit limit without notifying me?
# Can you help me with fraudulent charges on my Wells Fargo debit card?
# What should I do if my mortgage account was marked as forbearance incorrectly?
# How do I handle a dispute with Capital One about a canceled order?
# Why was I charged late fees even though I paid my credit card bill on time?