Creation of dataset based on specific scenarios that can be heppening in SurveySparrow

In [2]:
import csv
import random

# More specific categories
categories = [
    "Survey Creation",
    "Data Collection",
    "Results Analysis",
    "Account Management",
    "Pricing and Billing",
    "Technical Issues",
    "Integrations",
    "Survey Distribution"
]

# More realistic queries with varying complexity
queries = {
    "Survey Creation": [
        "How do I add branching logic to my survey?",
        "Can I use custom CSS in my survey design?",
        "Is there a way to randomize question order?",
        "How many question types does SurveySparrow offer?",
        "Can I create a multi-language survey?",
    ],
    "Data Collection": [
        "What's the maximum number of responses I can collect?",
        "How can I prevent duplicate responses?",
        "Is it possible to collect responses offline?",
        "Can I set an expiry date for my survey?",
        "How do I enable partial response saving?",
    ],
    "Results Analysis": [
        "How can I create custom reports for specific question types?",
        "Is there a way to filter responses based on specific criteria?",
        "Can I generate word clouds from open-ended responses?",
        "How do I export my survey data to SPSS format?",
        "Is it possible to set up automated report generation?",
    ],
    "Account Management": [
        "How do I add team members to my account?",
        "Can I transfer ownership of a survey to another user?",
        "What's the process for upgrading from a free to a paid plan?",
        "How can I enable two-factor authentication for my account?",
        "I need to close my account, what steps should I take?",
    ],
    "Pricing and Billing": [
        "Can you explain the difference between your pricing tiers?",
        "Is there a discount for annual billing?",
        "How do I update my credit card information?",
        "Do you offer any special pricing for non-profit organizations?",
        "I was charged twice this month, can you help me understand why?",
    ],
    "Technical Issues": [
        "The survey embed code isn't working on my website",
        "I'm getting a 404 error when trying to access my results",
        "The email invitations aren't being delivered to some respondents",
        "My custom domain isn't resolving correctly for my surveys",
        "The survey is loading very slowly for respondents, how can I optimize it?",
    ],
    "Integrations": [
        "How do I set up the Zapier integration?",
        "Can SurveySparrow integrate directly with our CRM system?",
        "Is there an API available for custom integrations?",
        "How do I connect my Google Analytics account to track survey performance?",
        "Can I use webhooks to send survey data to our internal systems?",
    ],
    "Survey Distribution": [
        "What's the best way to share my survey on social media?",
        "How can I embed the survey in an email newsletter?",
        "Is there a QR code option for sharing surveys?",
        "Can I schedule automated reminder emails for incomplete responses?",
        "How do I create a custom URL for my survey?",
    ]
}

# Refined escalation and sentiment options
escalation_options = ["Escalation needed", "No escalation needed"]
sentiment_options = ["Positive", "Negative", "Neutral"]

# Generate variations with more context
def generate_variations(query, category, n=3):
    variations = [query]
    prefixes = [
        f"I'm having trouble with {category.lower()}: ",
        f"Can you help me understand how to ",
        f"I'm confused about {category.lower()}: ",
        f"I need assistance with {category.lower()}: ",
        f"Could you explain how to "
    ]
    for _ in range(n-1):
        variations.append(random.choice(prefixes) + query.lower())
    return variations

# Create the dataset
dataset = []
for category, category_queries in queries.items():
    for query in category_queries:
        variations = generate_variations(query, category)
        for variation in variations:
            # Assign escalation need based on query complexity
            escalation = "Escalation needed" if len(variation.split()) > 10 or "error" in variation.lower() or "isn't working" in variation.lower() else "No escalation needed"

            # Assign sentiment based on query content
            if any(word in variation.lower() for word in ["error", "trouble", "confused", "isn't working"]):
                sentiment = "Negative"
            elif any(word in variation.lower() for word in ["help", "explain", "understand"]):
                sentiment = "Neutral"
            else:
                sentiment = "Positive"

            dataset.append([variation, escalation, sentiment, category])

# Shuffle the dataset
random.shuffle(dataset)

# Write to CSV
with open('Agent_escalation.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Query", "Escalation", "Sentiment", "Category"])
    writer.writerows(dataset)

print(f"Dataset created with {len(dataset)} entries.")

Dataset created with 120 entries.


Using BERT model for Agent escalation

In [3]:
#Import libraries

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [6]:
#Load and prepare the data

df = pd.read_csv('/content/Agent_escalation.csv')


sentences = df.Query.values
labels = (df.Escalation == "Escalation needed").astype(int).values

In [7]:
#Load the BERT tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Tokenize all of the sentences and map the tokens to their word IDs
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [11]:
# Split into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=42, test_size=0.1)

#Batch Size
batch_size = 32

#Create dataloaders
train_data = TensorDataset(x_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(x_test, validation_masks,y_test)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [12]:
# Load BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr = 2e-5,eps = 1e-8)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Training loop
device = torch.device('cpu')
model.to(device)
epochs = 4

for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

In [20]:
# Validation
model.eval()
eval_loss, eval_accuracy = 0, 0
for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        eval_accuracy += (logits.argmax(axis=1) == label_ids).mean()
print(f"Epoch {epoch+1}, Validation Accuracy: {eval_accuracy/len(validation_dataloader)}")

Epoch 4, Validation Accuracy: 0.9166666666666666


In [23]:
# Test the model
model.eval()
predictions = []
true_labels = []
for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    predictions.extend(logits.argmax(axis=1))
    true_labels.extend(label_ids)

print(classification_report(true_labels, predictions, target_names=['No escalation needed', 'Escalation needed']))

                      precision    recall  f1-score   support

No escalation needed       0.67      1.00      0.80         2
   Escalation needed       1.00      0.90      0.95        10

            accuracy                           0.92        12
           macro avg       0.83      0.95      0.87        12
        weighted avg       0.94      0.92      0.92        12



Saving the developed model

In [30]:
import os
from google.colab import drive
import torch

# Mount Google Drive
drive.mount('/content/drive')

# Define the path where you want to save the model in your Google Drive
save_path = '/content/drive/My Drive/BERT_SurveySparrow_Model'

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save the model
model.save_pretrained(save_path)

# Save the tokenizer
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to Google Drive at: {save_path}")

# Verify that the files are saved
!ls "{save_path}"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model and tokenizer saved to Google Drive at: /content/drive/My Drive/BERT_SurveySparrow_Model
config.json	   README.txt		    tokenizer_config.json
model.safetensors  special_tokens_map.json  vocab.txt
