In [1]:
from sklearn.cluster import AgglomerativeClustering

from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Corpus with example sentences
corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "A man is eating pasta.",
    "The girl is carrying a baby.",
    "The baby is carried by the woman",
    "A man is riding a horse.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "Someone in a gorilla costume is playing a set of drums.",
    "A cheetah is running behind its prey.",
    "A cheetah chases prey on across a field.",
]
corpus_embeddings = embedder.encode(corpus)

# Some models don't automatically normalize the embeddings, in which case you should normalize the embeddings:
# corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform agglomerative clustering
clustering_model = AgglomerativeClustering(
    n_clusters=None, distance_threshold=1.5
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i + 1)
    print(cluster)
    print("")

  from .autonotebook import tqdm as notebook_tqdm


Cluster  1
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

Cluster  5
['The girl is carrying a baby.', 'The baby is carried by the woman']

Cluster  2
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

Cluster  3
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']

Cluster  4
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']



  return forward_call(*args, **kwargs)


Testing the sample convo.

In [None]:
corpus = [
    "Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.",
    "David: Yeah, I noticed that too. The leads from the last campaign were decent, but conversions dropped. We’re getting traffic, but it’s not translating into sales.",
    "Sofia: That’s exactly what I saw in the analytics. Our click-through rates are high, especially from the social ads, but a lot of visitors drop off before checkout. I think the landing page might be the bottleneck.",
    "Liam: Before we jump into new campaigns, we need to remember we’ve already used 72% of the marketing budget for the year. If we plan any big push, we’ll need to optimize our spend.",
    "Emma: Good point, Liam. That’s why I wanted us to brainstorm ideas that can give us better ROI without overspending. David, what’s your take on improving conversions?",
    "David: Honestly, I think the sales team needs more qualified leads. Right now, they’re spending too much time chasing prospects who aren’t ready to buy. Maybe we should focus on retargeting past visitors.",
    "Sofia: I can set up retargeting campaigns for both Google and Facebook. Also, we can use email automation to bring back customers who abandoned their carts. That usually has a decent success rate.",
    "Liam: Retargeting is fine, but remember—those campaigns can get expensive if we don’t set proper limits. I’d suggest running a two-week test before committing.",
    "Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?",
    "Sofia: Absolutely. I’ll aim for $50 per day and track the cost per acquisition closely.",
    "David: Another thing—could we add more personalized offers for returning customers? I think discounts or loyalty points could push them over the edge.",
    "Liam: I like the loyalty idea. That way, the spending becomes an investment in customer retention.",
    "Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.",
    "Sofia: Got it. We could also test short video ads instead of just static images. Engagement is usually higher with video.",
    "David: True. And maybe the videos could feature customer testimonials. People trust real stories.",
    "Liam: Just make sure production costs are minimal. We can film them in-house.",
    "Emma: Perfect. Here’s how we’ll divide responsibilities:\n\nSofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.\n\nDavid – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.\n\nLiam – Monitor spending and ROI for each campaign, approve budget changes.\n\nEmma – Oversee strategy execution, align campaigns with brand goals, report to leadership.",
    "David: Sounds good to me. Let’s touch base again in two weeks to review results.",
    "Sofia: I’ll send out a campaign calendar today.",
    "Liam: And I’ll send you all a budget breakdown by the end of the day.",
    "Emma: Great. Thanks, everyone—let’s make this quarter end stronger than it started."
]


In [6]:
from sklearn.cluster import AgglomerativeClustering

from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Corpus with example sentences
corpus = [
    "Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.",
    "David: Yeah, I noticed that too. The leads from the last campaign were decent, but conversions dropped. We’re getting traffic, but it’s not translating into sales.",
    "Sofia: That’s exactly what I saw in the analytics. Our click-through rates are high, especially from the social ads, but a lot of visitors drop off before checkout. I think the landing page might be the bottleneck.",
    "Liam: Before we jump into new campaigns, we need to remember we’ve already used 72% of the marketing budget for the year. If we plan any big push, we’ll need to optimize our spend.",
    "Emma: Good point, Liam. That’s why I wanted us to brainstorm ideas that can give us better ROI without overspending. David, what’s your take on improving conversions?",
    "David: Honestly, I think the sales team needs more qualified leads. Right now, they’re spending too much time chasing prospects who aren’t ready to buy. Maybe we should focus on retargeting past visitors.",
    "Sofia: I can set up retargeting campaigns for both Google and Facebook. Also, we can use email automation to bring back customers who abandoned their carts. That usually has a decent success rate.",
    "Liam: Retargeting is fine, but remember—those campaigns can get expensive if we don’t set proper limits. I’d suggest running a two-week test before committing.",
    "Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?",
    "Sofia: Absolutely. I’ll aim for $50 per day and track the cost per acquisition closely.",
    "David: Another thing—could we add more personalized offers for returning customers? I think discounts or loyalty points could push them over the edge.",
    "Liam: I like the loyalty idea. That way, the spending becomes an investment in customer retention.",
    "Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.",
    "Sofia: Got it. We could also test short video ads instead of just static images. Engagement is usually higher with video.",
    "David: True. And maybe the videos could feature customer testimonials. People trust real stories.",
    "Liam: Just make sure production costs are minimal. We can film them in-house.",
    "Emma: Perfect. Here’s how we’ll divide responsibilities:\n\nSofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.\n\nDavid – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.\n\nLiam – Monitor spending and ROI for each campaign, approve budget changes.\n\nEmma – Oversee strategy execution, align campaigns with brand goals, report to leadership.",
    "David: Sounds good to me. Let’s touch base again in two weeks to review results.",
    "Sofia: I’ll send out a campaign calendar today.",
    "Liam: And I’ll send you all a budget breakdown by the end of the day.",
    "Emma: Great. Thanks, everyone—let’s make this quarter end stronger than it started."
]

corpus_embeddings = embedder.encode(corpus)

# Some models don't automatically normalize the embeddings, in which case you should normalize the embeddings:
# corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform agglomerative clustering
clustering_model = AgglomerativeClustering(
    n_clusters=None, distance_threshold=1.5
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i + 1)
    print(cluster)
    print("")

Cluster  2
['Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.', 'Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?', 'Sofia: Absolutely. I’ll aim for $50 per day and track the cost per acquisition closely.', 'Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.', 'Emma: Perfect. Here’s how we’ll divide responsibilities:\n\nSofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.\n\nDavid – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.\n\nLiam – Monitor spending and ROI for each campaign, approve budget changes.\n\nEmma – Oversee strategy execution, align campaigns with brand goals, report to leadership.', 'David: Sounds good to me. Let’s touch base again in two 

  return forward_call(*args, **kwargs)


In [3]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize


from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Corpus with example sentences
corpus = [
    "Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.",
    "David: Yeah, I noticed that too. The leads from the last campaign were decent, but conversions dropped. We’re getting traffic, but it’s not translating into sales.",
    "Sofia: That’s exactly what I saw in the analytics. Our click-through rates are high, especially from the social ads, but a lot of visitors drop off before checkout. I think the landing page might be the bottleneck.",
    "Liam: Before we jump into new campaigns, we need to remember we’ve already used 72% of the marketing budget for the year. If we plan any big push, we’ll need to optimize our spend.",
    "Emma: Good point, Liam. That’s why I wanted us to brainstorm ideas that can give us better ROI without overspending. David, what’s your take on improving conversions?",
    "David: Honestly, I think the sales team needs more qualified leads. Right now, they’re spending too much time chasing prospects who aren’t ready to buy. Maybe we should focus on retargeting past visitors.",
    "Sofia: I can set up retargeting campaigns for both Google and Facebook. Also, we can use email automation to bring back customers who abandoned their carts. That usually has a decent success rate.",
    "Liam: Retargeting is fine, but remember—those campaigns can get expensive if we don’t set proper limits. I’d suggest running a two-week test before committing.",
    "Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?",
    "Sofia: Absolutely. I’ll aim for $50 per day and track the cost per acquisition closely.",
    "David: Another thing—could we add more personalized offers for returning customers? I think discounts or loyalty points could push them over the edge.",
    "Liam: I like the loyalty idea. That way, the spending becomes an investment in customer retention.",
    "Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.",
    "Sofia: Got it. We could also test short video ads instead of just static images. Engagement is usually higher with video.",
    "David: True. And maybe the videos could feature customer testimonials. People trust real stories.",
    "Liam: Just make sure production costs are minimal. We can film them in-house.",
    "Emma: Perfect. Here’s how we’ll divide responsibilities:\n\nSofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.\n\nDavid – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.\n\nLiam – Monitor spending and ROI for each campaign, approve budget changes.\n\nEmma – Oversee strategy execution, align campaigns with brand goals, report to leadership.",
    "David: Sounds good to me. Let’s touch base again in two weeks to review results.",
    "Sofia: I’ll send out a campaign calendar today.",
    "Liam: And I’ll send you all a budget breakdown by the end of the day.",
    "Emma: Great. Thanks, everyone—let’s make this quarter end stronger than it started."
]
corpus_embeddings = normalize(corpus_embeddings)
corpus_embeddings = embedder.encode(corpus)

# Some models don't automatically normalize the embeddings, in which case you should normalize the embeddings:
# corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform agglomerative clustering
clustering_model = AgglomerativeClustering(
    n_clusters=None, distance_threshold=1.5
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i + 1)
    print(cluster)
    print("")

Cluster  2
['Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.', 'Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?', 'Sofia: Absolutely. I’ll aim for $50 per day and track the cost per acquisition closely.', 'Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.', 'Emma: Perfect. Here’s how we’ll divide responsibilities:\n\nSofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.\n\nDavid – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.\n\nLiam – Monitor spending and ROI for each campaign, approve budget changes.\n\nEmma – Oversee strategy execution, align campaigns with brand goals, report to leadership.', 'David: Sounds good to me. Let’s touch base again in two 

  return forward_call(*args, **kwargs)


In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

# Step 1: Load model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2: Load your conversation data
corpus = [
    "Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.",
    "David: Yeah, I noticed that too. The leads from the last campaign were decent, but conversions dropped. We’re getting traffic, but it’s not translating into sales.",
    "Sofia: That’s exactly what I saw in the analytics. Our click-through rates are high, especially from the social ads, but a lot of visitors drop off before checkout. I think the landing page might be the bottleneck.",
    "Liam: Before we jump into new campaigns, we need to remember we’ve already used 72% of the marketing budget for the year. If we plan any big push, we’ll need to optimize our spend.",
    "Emma: Good point, Liam. That’s why I wanted us to brainstorm ideas that can give us better ROI without overspending. David, what’s your take on improving conversions?",
    "David: Honestly, I think the sales team needs more qualified leads. Right now, they’re spending too much time chasing prospects who aren’t ready to buy. Maybe we should focus on retargeting past visitors.",
    "Sofia: I can set up retargeting campaigns for both Google and Facebook. Also, we can use email automation to bring back customers who abandoned their carts. That usually has a decent success rate.",
    "Liam: Retargeting is fine, but remember—those campaigns can get expensive if we don’t set proper limits. I’d suggest running a two-week test before committing.",
    "Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?",
    "Sofia: Absolutely. I’ll aim for $50 per day and track the cost per acquisition closely.",
    "David: Another thing—could we add more personalized offers for returning customers? I think discounts or loyalty points could push them over the edge.",
    "Liam: I like the loyalty idea. That way, the spending becomes an investment in customer retention.",
    "Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.",
    "Sofia: Got it. We could also test short video ads instead of just static images. Engagement is usually higher with video.",
    "David: True. And maybe the videos could feature customer testimonials. People trust real stories.",
    "Liam: Just make sure production costs are minimal. We can film them in-house.",
    "Emma: Perfect. Here’s how we’ll divide responsibilities:\n\nSofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.\n\nDavid – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.\n\nLiam – Monitor spending and ROI for each campaign, approve budget changes.\n\nEmma – Oversee strategy execution, align campaigns with brand goals, report to leadership.",
    "David: Sounds good to me. Let’s touch base again in two weeks to review results.",
    "Sofia: I’ll send out a campaign calendar today.",
    "Liam: And I’ll send you all a budget breakdown by the end of the day.",
    "Emma: Great. Thanks, everyone—let’s make this quarter end stronger than it started."
]

# Step 3: Embed and normalize
embeddings = embedder.encode(corpus)
embeddings = normalize(embeddings)

# Step 4: Cosine distance function
def cosine_distance(a, b):
    return 1 - np.dot(a, b)

# Step 5: Agglomerative clustering implementation
def agglomerative_clustering(embeddings, threshold=0.5):
    # Initialize clusters as list of indices
    clusters = [[i] for i in range(len(embeddings))]

    while True:
        min_dist = float("inf")
        merge_a, merge_b = None, None

        # Find closest pair of clusters
        for i in range(len(clusters)):
            for j in range(i+1, len(clusters)):
                # Compute average distance between clusters
                dist_sum = 0
                count = 0
                for idx_a in clusters[i]:
                    for idx_b in clusters[j]:
                        dist_sum += cosine_distance(embeddings[idx_a], embeddings[idx_b])
                        count += 1
                avg_dist = dist_sum / count

                if avg_dist < min_dist:
                    min_dist = avg_dist
                    merge_a, merge_b = i, j

        # Stop if no pairs under threshold
        if min_dist > threshold:
            break

        # Merge the two closest clusters
        clusters[merge_a].extend(clusters[merge_b])
        del clusters[merge_b]

    return clusters

# Step 6: Run clustering
clusters = agglomerative_clustering(embeddings, threshold=0.5)

# Step 7: Show results
for i, cluster in enumerate(clusters):
    print(f"\nCluster {i+1}:")
    for idx in cluster:
        print(" -", corpus[idx])


  from .autonotebook import tqdm as notebook_tqdm



Cluster 1:
 - Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.

Cluster 2:
 - David: Yeah, I noticed that too. The leads from the last campaign were decent, but conversions dropped. We’re getting traffic, but it’s not translating into sales.
 - David: Honestly, I think the sales team needs more qualified leads. Right now, they’re spending too much time chasing prospects who aren’t ready to buy. Maybe we should focus on retargeting past visitors.

Cluster 3:
 - Sofia: That’s exactly what I saw in the analytics. Our click-through rates are high, especially from the social ads, but a lot of visitors drop off before checkout. I think the landing page might be the bottleneck.

Cluster 4:
 - Liam: Before we jump into new campaigns, we need to remember we’ve already used 72% of the marketing budget for the year. If we plan any big push, we’ll need

  return forward_call(*args, **kwargs)


In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

# Step 1: Load model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2: Conversation data
corpus = [
    "Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.",
    "David: Yeah, I noticed that too. The leads from the last campaign were decent, but conversions dropped. We’re getting traffic, but it’s not translating into sales.",
    "Sofia: That’s exactly what I saw in the analytics. Our click-through rates are high, especially from the social ads, but a lot of visitors drop off before checkout. I think the landing page might be the bottleneck.",
    "Liam: Before we jump into new campaigns, we need to remember we’ve already used 72% of the marketing budget for the year. If we plan any big push, we’ll need to optimize our spend.",
    "Emma: Good point, Liam. That’s why I wanted us to brainstorm ideas that can give us better ROI without overspending. David, what’s your take on improving conversions?",
    "David: Honestly, I think the sales team needs more qualified leads. Right now, they’re spending too much time chasing prospects who aren’t ready to buy. Maybe we should focus on retargeting past visitors.",
    "Sofia: I can set up retargeting campaigns for both Google and Facebook. Also, we can use email automation to bring back customers who abandoned their carts. That usually has a decent success rate.",
    "Liam: Retargeting is fine, but remember—those campaigns can get expensive if we don’t set proper limits. I’d suggest running a two-week test before committing.",
    "Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?",
    "Sofia: Absolutely. I’ll aim for $50 per day and track the cost per acquisition closely.",
    "David: Another thing—could we add more personalized offers for returning customers? I think discounts or loyalty points could push them over the edge.",
    "Liam: I like the loyalty idea. That way, the spending becomes an investment in customer retention.",
    "Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.",
    "Sofia: Got it. We could also test short video ads instead of just static images. Engagement is usually higher with video.",
    "David: True. And maybe the videos could feature customer testimonials. People trust real stories.",
    "Liam: Just make sure production costs are minimal. We can film them in-house.",
    "Emma: Perfect. Here’s how we’ll divide responsibilities:\n\nSofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.\n\nDavid – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.\n\nLiam – Monitor spending and ROI for each campaign, approve budget changes.\n\nEmma – Oversee strategy execution, align campaigns with brand goals, report to leadership.",
    "David: Sounds good to me. Let’s touch base again in two weeks to review results.",
    "Sofia: I’ll send out a campaign calendar today.",
    "Liam: And I’ll send you all a budget breakdown by the end of the day.",
    "Emma: Great. Thanks, everyone—let’s make this quarter end stronger than it started."
]

# Step 3: Group consecutive messages from the same speaker
grouped_corpus = []
current_speaker = None
current_text = ""

for line in corpus:
    speaker, _, text = line.partition(":")
    if speaker == current_speaker:
        current_text += " " + text.strip()
    else:
        if current_text:
            grouped_corpus.append(f"{current_speaker}: {current_text.strip()}")
        current_speaker = speaker
        current_text = text.strip()
if current_text:
    grouped_corpus.append(f"{current_speaker}: {current_text.strip()}")

# Step 4: Embed & normalize
embeddings = normalize(embeddings)
embeddings = embedder.encode(grouped_corpus)

# Step 5: Cosine distance
def cosine_distance(a, b):
    return 1 - np.dot(a, b)

# Step 6: Agglomerative clustering with max_clusters
def agglomerative_clustering(embeddings, max_clusters=7):
    clusters = [[i] for i in range(len(embeddings))]

    while len(clusters) > max_clusters:
        min_dist = float("inf")
        merge_a, merge_b = None, None

        # Find closest pair of clusters
        for i in range(len(clusters)):
            for j in range(i+1, len(clusters)):
                dist_sum = 0
                count = 0
                for idx_a in clusters[i]:
                    for idx_b in clusters[j]:
                        dist_sum += cosine_distance(embeddings[idx_a], embeddings[idx_b])
                        count += 1
                avg_dist = dist_sum / count

                if avg_dist < min_dist:
                    min_dist = avg_dist
                    merge_a, merge_b = i, j

        clusters[merge_a].extend(clusters[merge_b])
        del clusters[merge_b]

    return clusters

# Step 7: Run clustering
clusters = agglomerative_clustering(embeddings, max_clusters=7)

# Step 8: Print results
for i, cluster in enumerate(clusters):
    print(f"\nCluster {i+1}:")
    for idx in cluster:
        print(" -", grouped_corpus[idx])



Cluster 1:
 - Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.
 - Emma: Great. Thanks, everyone—let’s make this quarter end stronger than it started.
 - Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?
 - Sofia: I’ll send out a campaign calendar today.
 - Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.
 - Emma: Perfect. Here’s how we’ll divide responsibilities:

Sofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.

David – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.

Liam – Monitor spending and ROI for each campaign, approve budget changes.

Emma – Oversee strategy execution, align campaigns with brand goals, report to leadership.
 - Sofia: Absolute

  return forward_call(*args, **kwargs)


In [5]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import pdist, squareform
import numpy as np

# 1. Prepare the corpus (same as yours)
corpus = [
    "Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.",
    "David: Yeah, I noticed that too. The leads from the last campaign were decent, but conversions dropped. We’re getting traffic, but it’s not translating into sales.",
    "Sofia: That’s exactly what I saw in the analytics. Our click-through rates are high, especially from the social ads, but a lot of visitors drop off before checkout. I think the landing page might be the bottleneck.",
    "Liam: Before we jump into new campaigns, we need to remember we’ve already used 72% of the marketing budget for the year. If we plan any big push, we’ll need to optimize our spend.",
    "Emma: Good point, Liam. That’s why I wanted us to brainstorm ideas that can give us better ROI without overspending. David, what’s your take on improving conversions?",
    "David: Honestly, I think the sales team needs more qualified leads. Right now, they’re spending too much time chasing prospects who aren’t ready to buy. Maybe we should focus on retargeting past visitors.",
    "Sofia: I can set up retargeting campaigns for both Google and Facebook. Also, we can use email automation to bring back customers who abandoned their carts. That usually has a decent success rate.",
    "Liam: Retargeting is fine, but remember—those campaigns can get expensive if we don’t set proper limits. I’d suggest running a two-week test before committing.",
    "Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?",
    "Sofia: Absolutely. I’ll aim for $50 per day and track the cost per acquisition closely.",
    "David: Another thing—could we add more personalized offers for returning customers? I think discounts or loyalty points could push them over the edge.",
    "Liam: I like the loyalty idea. That way, the spending becomes an investment in customer retention.",
    "Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.",
    "Sofia: Got it. We could also test short video ads instead of just static images. Engagement is usually higher with video.",
    "David: True. And maybe the videos could feature customer testimonials. People trust real stories.",
    "Liam: Just make sure production costs are minimal. We can film them in-house.",
    "Emma: Perfect. Here’s how we’ll divide responsibilities:\n\nSofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.\n\nDavid – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.\n\nLiam – Monitor spending and ROI for each campaign, approve budget changes.\n\nEmma – Oversee strategy execution, align campaigns with brand goals, report to leadership.",
    "David: Sounds good to me. Let’s touch base again in two weeks to review results.",
    "Sofia: I’ll send out a campaign calendar today.",
    "Liam: And I’ll send you all a budget breakdown by the end of the day.",
    "Emma: Great. Thanks, everyone—let’s make this quarter end stronger than it started."
]

# 2. Compute embeddings and normalize
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(corpus)
embeddings = normalize(embeddings)

# 3. Compute cosine distance matrix
distance_matrix = squareform(pdist(embeddings, metric='cosine'))

# 4. Agglomerative clustering with average linkage
clustering_model = AgglomerativeClustering(
    n_clusters=None,
    metric='precomputed',   # updated for sklearn >= 1.4
    linkage='average',
    distance_threshold=0.7
)
cluster_labels = clustering_model.fit_predict(distance_matrix)

# 5. Group sentences by cluster
clustered_sentences = {}
for idx, label in enumerate(cluster_labels):
    clustered_sentences.setdefault(label, []).append(corpus[idx])

# 6. Print clusters
for cluster_id, sentences in clustered_sentences.items():
    print(f"Cluster {cluster_id + 1}:")
    for s in sentences:
        print("-", s)
    print("")


Cluster 5:
- Emma: Good morning, everyone. I wanted to start today’s meeting with a quick recap. Our sales numbers for Q2 have been steady, but not exactly where we projected. We’re about 8% below the target.
- Emma: Agreed. Sofia, can you prepare a pilot campaign with a capped daily budget?
- Sofia: Absolutely. I’ll aim for $50 per day and track the cost per acquisition closely.
- Emma: Let’s put that on the roadmap. Sofia, coordinate with David to design the loyalty program structure.
- Emma: Perfect. Here’s how we’ll divide responsibilities:

Sofia – Launch retargeting campaigns, design loyalty program structure, prepare video ad concepts.

David – Coordinate with sales team, identify potential customer testimonials, and refine lead qualification.

Liam – Monitor spending and ROI for each campaign, approve budget changes.

Emma – Oversee strategy execution, align campaigns with brand goals, report to leadership.
- Sofia: I’ll send out a campaign calendar today.
- Emma: Great. Thanks

  return forward_call(*args, **kwargs)


In [8]:
import json
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist, squareform
from collections import defaultdict
import nltk
import numpy as np

# Optional: Custom stopwords for conversational junk
EXTRA_STOPWORDS = set([
    "sure", "talk", "thing", "alright", "yeah", "good", "right", "like", "actually",
    "okay", "look", "let", "just", "team", "going", "got", "make", "bit", "fine"
])

# 🔧 Filter keywords post-processing
def filter_keywords(keywords):
    """Remove keywords that are substrings of longer keywords and limit to top 5."""
    filtered = []
    for kw in keywords:
        if all(kw not in other or kw == other for other in filtered):
            filtered.append(kw)
    return filtered[:5]

# 1. Load & preprocess
def load_entries(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    entries = []
    for entry in data:
        for sent in sent_tokenize(entry["text"]):
            entries.append({
                "text": sent.strip(),
                "name": entry["name"],
                "timestamp": entry["timestamp"]
            })
    return entries

# 2. Embedding
def embed_texts(texts, model_name="all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    embs = model.encode(texts)
    return normalize(embs)

# 3. Clustering
def cluster_texts(embs, threshold=0.5):
    dmat = squareform(pdist(embs, metric="cosine"))
    aml = AgglomerativeClustering(
        n_clusters=None, metric="precomputed",
        linkage="average", distance_threshold=threshold
    )
    return aml.fit_predict(dmat)

# 4. Merge small clusters
def merge_small(entries, embs, labels, min_size=4):
    cluster_map = defaultdict(list)
    for i, lbl in enumerate(labels):
        cluster_map[lbl].append(i)
    big = {k: v for k, v in cluster_map.items() if len(v) >= min_size}
    small = {k: v for k, v in cluster_map.items() if len(v) < min_size}
    centroids = {k: embs[v].mean(axis=0) for k, v in big.items()}
    new_lbls = labels.copy()
    for sk, idxs in small.items():
        centroid = embs[idxs].mean(axis=0)
        if not centroids:
            continue
        best = min(centroids, key=lambda k: 1 - centroid @ centroids[k])
        for i in idxs:
            new_lbls[i] = best
    return new_lbls

# 5. Improved TF-IDF Keyword extraction with filtering and stopwords
def extract_keywords(sentences, top_k=5):
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    stopwords = list(ENGLISH_STOP_WORDS.union(EXTRA_STOPWORDS))

    def clean(txt):
        return " ".join([
            w for w in word_tokenize(txt.lower())
            if w.isalnum() and w not in stopwords
        ])

    corpus = [clean(s) for s in sentences]
    if not any(corpus):
        return []

    vectorizer = TfidfVectorizer(
        stop_words=stopwords,
        ngram_range=(1, 3),  # unigrams + bigrams + trigrams
        min_df=1
    )
    tfidf = vectorizer.fit_transform(corpus)
    sums = tfidf.sum(axis=0).A1
    terms = vectorizer.get_feature_names_out()
    sorted_terms = [terms[i] for i in sums.argsort()[::-1]]
    return filter_keywords(sorted_terms)

# 6. Full pipeline with enhanced keywords
def main(filepath="convo.json"):
    entries = load_entries(filepath)
    texts = [e["text"] for e in entries]
    embs = embed_texts(texts)

    labels = cluster_texts(embs)
    labels = merge_small(entries, embs, labels)

    clustered = defaultdict(list)
    for i, lbl in enumerate(labels):
        clustered[lbl].append(entries[i])

    for cid, grp in sorted(clustered.items(), key=lambda x: -len(x[1])):
        sents = [e["text"] for e in grp]
        keywords = extract_keywords(sents)
        print(f"🔹 Cluster {cid+1} ({len(sents)} sentences)")
        print(f"📝 Keywords: {', '.join(keywords)}")
        for e in grp:
            print(f" - [{e['timestamp']}] {e['name']}: {e['text']}")
        print("")

if __name__ == "__main__":
    main("convo.json")


  return forward_call(*args, **kwargs)


🔹 Cluster 10 (78 sentences)
📝 Keywords: sales, leads, conversions, data, seasonal
 - [00:00] Emma: Alright, team, before we talk creative or budget, I want to start with the analytics.
 - [00:00] Emma: How are we actually performing compared to last month?
 - [00:06] Sofia: Overall traffic is up 12%, but conversions dropped from 3.2% to 2.7%.
 - [00:06] Sofia: The click-through rates are healthy—it’s the post-click engagement where we’re losing people.
 - [00:14] David: That matches what sales is seeing.
 - [00:14] David: We’re getting more leads, but the percentage that actually respond to follow-up calls is lower.
 - [00:14] David: So, a quality issue.
 - [00:21] Liam: From a cost perspective, our cost per click is stable, but cost per acquisition has gone up about 18%.
 - [00:28] Emma: Could this be seasonal?
 - [00:28] Emma: Or is it more about the targeting getting too broad?
 - [00:31] Sofia: Seasonality plays a small part, but I think it’s more audience drift.
 - [00:31] Sofia: 

Using HDSCAN

In [1]:
# cluster_convo_hdbscan.py
# pip install sentence-transformers hdbscan scikit-learn nltk

import json, argparse, re
from collections import defaultdict
from typing import List, Dict, Any
import numpy as np

from sentence_transformers import SentenceTransformer
import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# ----------------------------
# 1) Load + sentence-split
# ----------------------------
def load_entries(path: str) -> List[Dict[str, Any]]:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

def split_into_sentences(text: str) -> List[str]:
    # Try NLTK, else a simple regex fallback
    try:
        import nltk
        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt", quiet=True)
        from nltk.tokenize import sent_tokenize
        sents = sent_tokenize(text)
    except Exception:
        sents = re.split(r"(?<=[.!?])\s+", text.strip())
    return [s.strip() for s in sents if s.strip()]

def expand_to_sentences(entries: List[Dict[str, str]]) -> List[Dict[str, str]]:
    out = []
    for e in entries:
        for s in split_into_sentences(e["text"]):
            out.append({"timestamp": e["timestamp"], "name": e["name"], "text": s})
    return out

# ----------------------------
# 2) Embeddings
# ----------------------------
def embed_texts(texts: List[str], model_name="all-MiniLM-L6-v2") -> np.ndarray:
    model = SentenceTransformer(model_name)
    embs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    return embs

# ----------------------------
# 3) HDBSCAN
# ----------------------------
def run_hdbscan(embs: np.ndarray, min_cluster_size=6, min_samples=2, metric="euclidean"):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric=metric,
        cluster_selection_method="eom"
    )
    labels = clusterer.fit_predict(embs)
    return labels

# ----------------------------
# 4) Keyword extraction
# ----------------------------
def cluster_keywords(texts: List[str], top_k=6) -> List[str]:
    if not texts:
        return []
    vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 3), min_df=1)
    X = vec.fit_transform(texts)
    scores = np.asarray(X.sum(axis=0)).ravel()
    terms = vec.get_feature_names_out()
    order = np.argsort(scores)[::-1]
    # filter substrings to keep cleaner keywords
    picked = []
    for idx in order:
        term = terms[idx]
        if not any(term in p and term != p for p in picked):
            picked.append(term)
        if len(picked) >= top_k:
            break
    return picked

# ----------------------------
# 5) Optional: reattach noise
# ----------------------------
def reattach_noise_to_nearest(labels: np.ndarray, embs: np.ndarray, sim_threshold=0.68) -> np.ndarray:
    new_labels = labels.copy()
    valid = [c for c in np.unique(labels) if c != -1]
    if not valid:
        return new_labels
    centroids = {c: normalize(embs[labels == c].mean(axis=0, keepdims=True))[0] for c in valid}
    for i, lbl in enumerate(labels):
        if lbl == -1:
            sims = [(c, float(np.dot(embs[i], centroids[c]))) for c in valid]
            best_c, best_s = max(sims, key=lambda x: x[1])
            if best_s >= sim_threshold:
                new_labels[i] = best_c
    return new_labels

# ----------------------------
# 6) Print result
# ----------------------------
def print_clusters(items: List[Dict[str, str]], labels: np.ndarray, show_noise=True):
    clusters = defaultdict(list)
    for it, lbl in zip(items, labels):
        clusters[lbl].append(it)

    # Order by cluster size (noise last)
    order = sorted([c for c in clusters.keys() if c != -1], key=lambda c: -len(clusters[c]))
    if show_noise and -1 in clusters:
        order += [-1]

    for cid in order:
        lines = clusters[cid]
        texts = [x["text"] for x in lines]
        if cid == -1:
            print(f"\n🟦 Noise ({len(lines)} sentences)")
        else:
            kws = ", ".join(cluster_keywords(texts))
            print(f"\n🔹 Cluster {cid} ({len(lines)} sentences)")
            print(f"📝 Keywords: {kws}")
        for x in lines:
            print(f" - [{x['timestamp']}] {x['name']}: {x['text']}")

# ----------------------------
# 7) Main
# ----------------------------
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--path", default="convo.js", help="Path to convo.js (JSON array).")
    ap.add_argument("--min_cluster_size", type=int, default=8, help="HDBSCAN min_cluster_size.")
    ap.add_argument("--min_samples", type=int, default=2, help="HDBSCAN min_samples.")
    ap.add_argument("--reattach_noise", action="store_true", help="Reattach noise to nearest cluster.")
    ap.add_argument("--noise_threshold", type=float, default=0.7, help="Cosine sim threshold for reattaching noise.")
    args = ap.parse_args()

    raw = load_entries(args.path)
    items = expand_to_sentences(raw)
    texts = [x["text"] for x in items]

    embs = embed_texts(texts)  # normalized
    labels = run_hdbscan(embs, min_cluster_size=args.min_cluster_size, min_samples=args.min_samples)

    if args.reattach_noise:
        labels = reattach_noise_to_nearest(labels, embs, sim_threshold=args.noise_threshold)

    print_clusters(items, labels, show_noise=not args.reattach_noise)

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
usage: ipykernel_launcher.py [-h] [--path PATH]
                             [--min_cluster_size MIN_CLUSTER_SIZE]
                             [--min_samples MIN_SAMPLES] [--reattach_noise]
                             [--noise_threshold NOISE_THRESHOLD]
ipykernel_launcher.py: error: unrecognized arguments: --f=/run/user/1000/jupyter/runtime/kernel-v34de41731aebdd7835994e1b8cf4ced532de2355a.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
