In [1]:
import pandas as pd 
import random

In [2]:
# Vocabulary pools the snippet templates draw from at random, keyed by category.
knowledge = dict(
    competitors=["CompetitorX", "CompetitorY", "CompetitorZ"],
    features=["analytics", "AI engine", "data pipeline"],
    pricing_keywords=["discount", "pricing model", "budget"],
    security_keywords=["SOC2 certified", "encryption"],
)

In [3]:
def generate_snippet(theme):
    """Render one templated customer sentence for ``theme``.

    Slot values are drawn with ``random.choice`` from the module-level
    ``knowledge`` dict at call time.

    Args:
        theme: One of "Competition", "Pricing Discussion", "Security" or
            "Objection".

    Returns:
        The filled-in sentence, or an empty string when ``theme`` matches
        none of the known themes.
    """
    def pick(category):
        return random.choice(knowledge[category])

    # Ordered (theme, builder) pairs; builders are lazy so only the matching
    # template consumes random draws.
    templates = (
        ("Competition", lambda: (
            f"We like your product, but {pick('competitors')} "
            f"offers better {pick('features')}."
        )),
        ("Pricing Discussion", lambda: (
            f"The {pick('features')} looks great, "
            f"but the {pick('pricing_keywords')} is too high."
        )),
        ("Security", lambda: (
            f"Is your solution {pick('security_keywords')}? "
            "Our team is concerned about compliance."
        )),
        ("Objection", lambda: (
            f"I’m not sure if {pick('features')} solves our use case. "
            "We’re still evaluating."
        )),
    )
    for name, build in templates:
        if theme == name:
            return build()
    return ""
themes = ["Competition", "Pricing Discussion", "Security", "Objection"]

# Seed the global RNG so a fresh Restart-and-Run-All regenerates the same
# dataset instead of a different random draw on every run.
random.seed(42)

# Build 200 single-label records: pick a theme uniformly at random, render one
# sentence for it, and use the theme itself as the label.
data = []
for i in range(1, 201):
    theme = random.choice(themes)
    snippet = generate_snippet(theme)
    data.append({"id": i, "text_snippet": snippet, "labels": theme})


In [4]:
# Turn the list of record dicts into a DataFrame (keys: id, text_snippet, labels).
df = pd.DataFrame(data)

In [5]:
# Write the single-label dataset to the working directory; index=False keeps
# the DataFrame index out of the file.
df.to_csv("calls_dataset.csv", index=False)

In [6]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
# Fetch the 'punkt' resource — presumably the tokenizer data word_tokenize
# relies on. NOTE(review): confirm the resource name against the installed
# NLTK version; newer releases may look for a differently named package.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/prajwala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import random
# Fetch the 'wordnet' and 'omw-1.4' packages. No code visible in this notebook
# reads them — presumably intended for synonym-based augmentation
# (TODO confirm, or drop the downloads).
nltk.download('wordnet')
nltk.download('omw-1.4')


# Extended vocabulary pools for the multi-label generator. This rebinds the
# module-level `knowledge` name, so any function reading it sees these lists.
knowledge = dict(
    competitors=["CompetitorX", "CompetitorY", "CompetitorZ", "TechCorp", "MediPlus"],
    features=["analytics", "AI engine", "data pipeline", "automation", "remote monitoring"],
    pricing_keywords=["discount", "pricing model", "subscription cost", "budget"],
    security_keywords=["SOC2 certified", "data compliance", "encryption", "privacy policy"],
)

def random_swap(text, n=1):
    """Return ``text`` with ``n`` random pairs of tokens swapped.

    The text is split with ``word_tokenize`` and the tokens are re-joined with
    single spaces, so the result is always the space-separated token sequence,
    even when nothing is swapped.

    Args:
        text: Input string to augment.
        n: Number of swaps to perform (default 1). Each swap exchanges two
            distinct token positions chosen with ``random.sample``.

    Returns:
        The space-joined tokens after swapping. Inputs with fewer than two
        tokens are returned unswapped, because there is no pair to exchange.
    """
    words = word_tokenize(text)

    # random.sample(population, 2) raises ValueError on a population smaller
    # than two, so empty and single-token texts must bypass the swap loop.
    if len(words) < 2:
        return " ".join(words)

    positions = range(len(words))
    for _ in range(n):
        idx1, idx2 = random.sample(positions, 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return " ".join(words)


def generate_multiclass_snippet():
    """Build one multi-label snippet from independently sampled themes.

    Each of the four themes (Competition, Pricing Discussion, Security,
    Objection) is included when ``random.random() < 0.5``. Slot values are
    drawn with ``random.choice`` from the module-level ``knowledge`` dict.
    If no theme is selected, a single Objection sentence is used so that the
    snippet is never empty.

    Returns:
        A ``(snippet, labels)`` tuple. ``snippet`` is the selected sentences
        joined by single spaces; ``labels`` is the matching label names joined
        by ``", "`` in the same order as their sentences.
    """
    def pick(category):
        return random.choice(knowledge[category])

    def objection():
        return (
            f"I'm not sure if {pick('features')} solves our use case. "
            "We're still evaluating."
        )

    # (label, sentence builder) in output order. Builders are lazy so slot
    # values are only drawn for themes that are actually selected.
    templates = (
        ("Competition", lambda: (
            f"We like your product, but {pick('competitors')} "
            f"offers better {pick('features')}."
        )),
        ("Pricing Discussion", lambda: (
            f"The {pick('features')} looks great, "
            f"but the {pick('pricing_keywords')} is too high."
        )),
        ("Security", lambda: (
            f"Is your solution {pick('security_keywords')}? "
            "Our team is concerned about compliance."
        )),
        ("Objection", objection),
    )

    sentences = []
    # A list rather than a set: joining a set of strings yields a
    # hash-dependent order, so one label combination could serialise to
    # several different strings. The list keeps the labels in sentence order.
    labels = []
    for label, build in templates:
        if random.random() < 0.5:
            sentences.append(build())
            labels.append(label)

    # Guarantee at least one sentence and label per record.
    if not labels:
        labels.append("Objection")
        sentences.append(objection())

    return " ".join(sentences), ", ".join(labels)


# Seed the global RNG so a fresh Restart-and-Run-All regenerates the same
# multi-label dataset instead of a different random draw on every run.
random.seed(42)

# Generate 1200 multi-label records with ids 1..1200 in a single pass.
multiclass_data = []
for i in range(1, 1201):
    snippet, labels = generate_multiclass_snippet()
    multiclass_data.append({"id": i, "text_snippet": snippet, "labels": labels})

import pandas as pd
df2 = pd.DataFrame(multiclass_data)
# index=False keeps the DataFrame index out of the CSV.
df2.to_csv("multiclass_calls_dataset.csv", index=False)

print("multiclass dataset created!")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prajwala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/prajwala/nltk_data...


multiclass dataset created!


[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
import pandas as pd
import random

# Reload the multi-label dataset from disk and force the label column to str.
df3 = pd.read_csv("multiclass_calls_dataset.csv")
df3['labels'] = df3['labels'].astype(str)


# 200 distinct row positions whose label will be replaced. Stored as a set
# because membership is tested once per row; a list would make every test a
# linear scan over all 200 entries.
random_indices = set(random.sample(range(len(df3)), 200))

def assign_rand_label(index, labels):
    """Return ``'rand'`` for sampled rows, otherwise the row's own labels.

    Args:
        index: Row position to look up in the module-level ``random_indices``.
        labels: The row's current label value, passed through unchanged when
            the row was not sampled.
    """
    return 'rand' if index in random_indices else labels

# Walk the label column by row position and overwrite the sampled rows with
# 'rand'; every other row keeps its existing label string.
df3['labels'] = [
    assign_rand_label(position, current)
    for position, current in enumerate(df3['labels'])
]

# Save the relabelled frame next to the source CSV, without the index column.
df3.to_csv("multiclass_calls_dataset_with_rand.csv", index=False)

print("Dataset updated with 200 random 'rand' labels!")


Dataset updated with 200 random 'rand' labels!
