In [2]:
pip install transformers torch faiss-cpu scikit-learn pandas sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.meta

In [3]:
import pandas as pd

sample_emails = [
    "I am having issues with the login. Please fix ASAP!",
    "Thanks for the help, everything is working now.",
    "Can I get a refund for my last purchase?",
    "My order hasn't arrived yet. What's going on?",
    "The product is great, really happy with it!",
    "I'm unable to reset my password.",
    "App keeps crashing when I try to open it.",
    "Can you update me on the shipping status?",
    "Worst customer service ever.",
    "I want to cancel my subscription immediately."
]

emails_df = pd.DataFrame({'id': range(1, len(sample_emails)+1), 'email': sample_emails})
emails_df.to_csv('emails.csv', index=False)
emails_df.head()


Unnamed: 0,id,email
0,1,I am having issues with the login. Please fix ...
1,2,"Thanks for the help, everything is working now."
2,3,Can I get a refund for my last purchase?
3,4,My order hasn't arrived yet. What's going on?
4,5,"The product is great, really happy with it!"


In [4]:
emails_df = pd.read_csv('emails.csv')
emails = emails_df['email'].tolist()
emails_df.head()

Unnamed: 0,id,email
0,1,I am having issues with the login. Please fix ...
1,2,"Thanks for the help, everything is working now."
2,3,Can I get a refund for my last purchase?
3,4,My order hasn't arrived yet. What's going on?
4,5,"The product is great, really happy with it!"


In [5]:
from transformers import pipeline

# Load the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [6]:
labels = ["Login Issue", "Refund Request", "Order Status", "Complaint", "Praise", "Password Reset", "Technical Issue", "Cancellation"]

In [7]:
def classify_intent(email):
    result = classifier(email, candidate_labels=labels)
    return result['labels'][0]  # return top intent

In [8]:
emails_df['intent'] = emails_df['email'].apply(classify_intent)
emails_df[['email', 'intent']]

Unnamed: 0,email,intent
0,I am having issues with the login. Please fix ...,Login Issue
1,"Thanks for the help, everything is working now.",Praise
2,Can I get a refund for my last purchase?,Refund Request
3,My order hasn't arrived yet. What's going on?,Order Status
4,"The product is great, really happy with it!",Praise
5,I'm unable to reset my password.,Password Reset
6,App keeps crashing when I try to open it.,Cancellation
7,Can you update me on the shipping status?,Order Status
8,Worst customer service ever.,Complaint
9,I want to cancel my subscription immediately.,Cancellation


In [9]:
# Load sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [10]:
def analyze_sentiment(email):
    result = sentiment_analyzer(email)[0]
    return result['label'], round(result['score'], 3)

In [11]:
emails_df[['sentiment', 'sentiment_score']] = emails_df['email'].apply(
    lambda x: pd.Series(analyze_sentiment(x))
)

emails_df[['email', 'intent', 'sentiment', 'sentiment_score']]

Unnamed: 0,email,intent,sentiment,sentiment_score
0,I am having issues with the login. Please fix ...,Login Issue,NEGATIVE,0.998
1,"Thanks for the help, everything is working now.",Praise,POSITIVE,1.0
2,Can I get a refund for my last purchase?,Refund Request,NEGATIVE,0.999
3,My order hasn't arrived yet. What's going on?,Order Status,NEGATIVE,0.998
4,"The product is great, really happy with it!",Praise,POSITIVE,1.0
5,I'm unable to reset my password.,Password Reset,NEGATIVE,0.998
6,App keeps crashing when I try to open it.,Cancellation,NEGATIVE,0.989
7,Can you update me on the shipping status?,Order Status,NEGATIVE,0.997
8,Worst customer service ever.,Complaint,NEGATIVE,1.0
9,I want to cancel my subscription immediately.,Cancellation,NEGATIVE,1.0


In [12]:
# Load summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [13]:
def summarize_email(email):
    # Minimum and maximum lengths control output brevity
    result = summarizer(email, max_length=30, min_length=5, do_sample=False)
    return result[0]['summary_text']

In [14]:
def summarize_email_safe(email):
    if len(email.split()) < 5:
        return email  # Skip summarization if too short
    result = summarizer(email, max_length=30, min_length=5, do_sample=False)
    return result[0]['summary_text']

emails_df['summary'] = emails_df['email'].apply(summarize_email_safe)

Your max_length is set to 30, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 30, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 30, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 30, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_len

In [15]:
from sentence_transformers import SentenceTransformer

# Load a sentence transformer model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
# Get embeddings for each email
email_embeddings = embedder.encode(emails_df['email'].tolist(), convert_to_numpy=True)

In [17]:
import faiss
import numpy as np

# Create FAISS index
dimension = email_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add vectors to index
index.add(email_embeddings)

In [18]:
def find_similar_emails(query, top_k=3):
    query_vector = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_vector, top_k)

    print("\n📩 Incoming Email:")
    print(query)
    print("\n🔍 Top Similar Emails:")

    for i in range(top_k):
        idx = indices[0][i]
        dist = distances[0][i]
        print(f"\n[{i+1}] (Distance: {dist:.4f})")
        print(emails_df.iloc[idx]['email'])
        print("Intent:", emails_df.iloc[idx]['intent'])
        print("Sentiment:", emails_df.iloc[idx]['sentiment'])
        print("Summary:", emails_df.iloc[idx]['summary'])

In [19]:
test_email = "Why is my payment stuck? I need help urgently!"
find_similar_emails(test_email)


📩 Incoming Email:
Why is my payment stuck? I need help urgently!

🔍 Top Similar Emails:

[1] (Distance: 1.3104)
I want to cancel my subscription immediately.
Intent: Cancellation
Sentiment: NEGATIVE
Summary: "I want to cancel my subscription immediately," she said. "I don't want to see any more of these videos."

[2] (Distance: 1.3214)
I'm unable to reset my password.
Intent: Password Reset
Sentiment: NEGATIVE
Summary: 'I'm unable to reset my password. I'm sorry,' she writes.

[3] (Distance: 1.3298)
I am having issues with the login. Please fix ASAP!
Intent: Login Issue
Sentiment: NEGATIVE
Summary: I am having issues with the login. Please fix ASAP!


Recommendation Engine

In [20]:
def recommend_action(intent, sentiment):
    if intent == "Login Issue" or intent == "Password Reset":
        return "Send password reset instructions."
    elif intent == "Refund Request":
        if sentiment == "NEGATIVE":
            return "Prioritize refund and send apology note."
        else:
            return "Send refund process steps."
    elif intent == "Order Status":
        return "Check order status and reply with tracking link."
    elif intent == "Technical Issue":
        return "Forward to tech team with error details."
    elif intent == "Cancellation":
        return "Acknowledge cancellation and stop billing."
    elif intent == "Complaint":
        return "Apologize and escalate to customer care manager."
    elif intent == "Praise":
        return "Send thank-you note or a loyalty reward."
    else:
        return "Route to general support team for review."

In [21]:
emails_df['next_step'] = emails_df.apply(
    lambda row: recommend_action(row['intent'], row['sentiment']),
    axis=1
)

emails_df[['email', 'intent', 'sentiment', 'next_step']]

Unnamed: 0,email,intent,sentiment,next_step
0,I am having issues with the login. Please fix ...,Login Issue,NEGATIVE,Send password reset instructions.
1,"Thanks for the help, everything is working now.",Praise,POSITIVE,Send thank-you note or a loyalty reward.
2,Can I get a refund for my last purchase?,Refund Request,NEGATIVE,Prioritize refund and send apology note.
3,My order hasn't arrived yet. What's going on?,Order Status,NEGATIVE,Check order status and reply with tracking link.
4,"The product is great, really happy with it!",Praise,POSITIVE,Send thank-you note or a loyalty reward.
5,I'm unable to reset my password.,Password Reset,NEGATIVE,Send password reset instructions.
6,App keeps crashing when I try to open it.,Cancellation,NEGATIVE,Acknowledge cancellation and stop billing.
7,Can you update me on the shipping status?,Order Status,NEGATIVE,Check order status and reply with tracking link.
8,Worst customer service ever.,Complaint,NEGATIVE,Apologize and escalate to customer care manager.
9,I want to cancel my subscription immediately.,Cancellation,NEGATIVE,Acknowledge cancellation and stop billing.


In [24]:
def summarize_email_dynamic(email):
    num_words = len(email.split())

    if num_words < 5:
        return email  # Skip summarization if too short

    max_len = min(30, int(num_words * 1.2))
    min_len = max(5, int(num_words * 0.3))

    result = summarizer(email, max_length=max_len, min_length=min_len, do_sample=False)
    return result[0]['summary_text']

In [25]:
def find_similar_emails(query, top_k=3):
    query_vector = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_vector, top_k)

    print("\n📩 Incoming Email:")
    print(query)

    # Classify and analyze
    intent = classify_intent(query)
    sentiment, score = analyze_sentiment(query)
    summary = summarize_email_dynamic(query)
    next_step = recommend_action(intent, sentiment)

    print("\n🔍 AI Insights:")
    print(f"- Intent: {intent}")
    print(f"- Sentiment: {sentiment} ({score})")
    print(f"- Summary: {summary}")
    print(f"- Recommended Next Step: {next_step}")

    print("\n📚 Top Similar Past Emails:")
    for i in range(top_k):
        idx = indices[0][i]
        print(f"\n[{i+1}]")
        print(f"Email: {emails_df.iloc[idx]['email']}")
        print(f"Intent: {emails_df.iloc[idx]['intent']}")
        print(f"Sentiment: {emails_df.iloc[idx]['sentiment']}")
        print(f"Summary: {emails_df.iloc[idx]['summary']}")
        print(f"Next Step: {emails_df.iloc[idx]['next_step']}")

In [26]:
find_similar_emails("I want to cancel my order. It never arrived and I’m frustrated.")


📩 Incoming Email:
I want to cancel my order. It never arrived and I’m frustrated.

🔍 AI Insights:
- Intent: Cancellation
- Sentiment: NEGATIVE (1.0)
- Summary: "I want to cancel my order. It never arrived
- Recommended Next Step: Acknowledge cancellation and stop billing.

📚 Top Similar Past Emails:

[1]
Email: I want to cancel my subscription immediately.
Intent: Cancellation
Sentiment: NEGATIVE
Summary: "I want to cancel my subscription immediately," she said. "I don't want to see any more of these videos."
Next Step: Acknowledge cancellation and stop billing.

[2]
Email: My order hasn't arrived yet. What's going on?
Intent: Order Status
Sentiment: NEGATIVE
Summary: CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Visit CNN.com/Travel each week for a
Next Step: Check order status and reply with tracking link.

[3]
Email: Can I get a refund for my last purchase?
Intent: Refund Request
Sentiment: NEGATIVE
Summary: Can I get a refund for my last purchase? I w

In [27]:
# Select relevant columns to export
emails_df_to_export = emails_df[['email', 'intent', 'sentiment', 'summary', 'next_step']]

# Export to CSV
emails_df_to_export.to_csv('email_insights.csv', index=False)

print("Exported to email_insights.csv")


Exported to email_insights.csv


In [28]:
from google.colab import files
files.download('email_insights.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>