In [20]:
# # Turn-Level Classification with BERT

In [36]:
# ============================================
# 0) Install Requirements (if necessary)
# ============================================
# In a new environment or Google Colab, you might need:
!pip install transformers datasets accelerate pandas scikit-learn




In [22]:
# ============================================
# 1) Imports
# ============================================
import pandas as pd
import numpy as np
from datasets import Dataset, ClassLabel
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    TrainingArguments, 
    Trainer
)
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

print("All libraries imported successfully!")


All libraries imported successfully!


In [23]:
# ============================================
# 2) Load the CSV dataset
# ============================================
# We'll assume you saved the table of data to "dialogues.csv".
# The table has columns:
#   CONVERSATION_ID | CONVERSATION_STEP | TEXT | CONTEXT | LABEL

df = pd.read_csv("/Users/ashansubodha/Desktop/VOIP Vishing/conversation-prediction/FINAL_DATASET2.csv")
print("Data loaded. Number of rows:", len(df))
print(df.head(10))


Data loaded. Number of rows: 967
   CONVERSATION_ID  CONVERSATION_STEP  \
0                0                  1   
1                0                  2   
2                0                  3   
3                0                  4   
4                0                  5   
5                0                  6   
6                0                  7   
7                0                  8   
8                0                  9   
9                0                 10   

                                                TEXT  \
0  Hello, this is [Your Name]'s personal assistan...   
1  Hi, I'm Sam. I saw an ad about a photography w...   
2  Hi Sam, it's great to hear of your interest in...   
3  Thanks! I was wondering about the skill level ...   
4  The workshop is designed to accommodate all sk...   
5  That sounds perfect. What's the registration p...   
6  You can register through our website. I can gu...   
7  A direct link would be great. Can you also tel...   
8  Certainl

In [24]:
# ============================================
# 3) Label Inspection
# ============================================
# Let's see what unique labels exist in the 'LABEL' column.
unique_labels = df["LABEL"].unique()
print("Unique labels:", unique_labels)

# Example output might be:
# ['neutral', 'slightly_suspicious', 'suspicious', 'potential_scam', 'legitimate']
# or something similar depending on your data.


Unique labels: ['neutral' 'slightly_suspicious' 'legitimate' 'potential_scam'
 'suspicious' 'highly_suspicious' ' neutral' 'scam' 'scam_response'
 ' scam_response' ' potential_scam' ' legitimate' ' scam'
 ' dismissing official protocols' ' emphasizing security and compliance'
 ' ready for further engagement' ' adhering to protocols'
 ' Expressing admiration' ' even winning gold!"'
 ' Describing early experience' ' Describing inline skating'
 ' Reflecting on the difficulty' ' but then they started catching up."'
 ' Expressing emotional reaction' ' Reflecting on perseverance'
 ' Reflecting on Korean resilience'
 ' winter sports—they’re consistently at the top."'
 ' and I always thought I would be good at it."'
 ' Reflecting on personal experience' " Praising Korea's sports prowess"
 ' Concluding statement']


In [25]:
# ============================================
# 4) Prepare the Data for Turn-Level Classification
# ============================================
# We'll define a simple classification task: each row is one example,
# input = TEXT, label = LABEL.

# Some labels might be "neutral", "potential_scam", "suspicious", etc.
# We need to map them to integer IDs for the model.

label_list = list(unique_labels)
label_list.sort()  # optional, to keep them in a deterministic order
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}
print("label2id:", label2id)

# We'll add a numeric label column to the DataFrame
df["numeric_label"] = df["LABEL"].apply(lambda x: label2id[x])


label2id: {' Concluding statement': 0, ' Describing early experience': 1, ' Describing inline skating': 2, ' Expressing admiration': 3, ' Expressing emotional reaction': 4, " Praising Korea's sports prowess": 5, ' Reflecting on Korean resilience': 6, ' Reflecting on perseverance': 7, ' Reflecting on personal experience': 8, ' Reflecting on the difficulty': 9, ' adhering to protocols': 10, ' and I always thought I would be good at it."': 11, ' but then they started catching up."': 12, ' dismissing official protocols': 13, ' emphasizing security and compliance': 14, ' even winning gold!"': 15, ' legitimate': 16, ' neutral': 17, ' potential_scam': 18, ' ready for further engagement': 19, ' scam': 20, ' scam_response': 21, ' winter sports—they’re consistently at the top."': 22, 'highly_suspicious': 23, 'legitimate': 24, 'neutral': 25, 'potential_scam': 26, 'scam': 27, 'scam_response': 28, 'slightly_suspicious': 29, 'suspicious': 30}


In [26]:
# ============================================
# 5) Train-Test Split
# ============================================
# We'll do a simple 80-20 split at the row-level. 
# If you need conversation-level splits (i.e. no conversation is partially in train/test),
# you'll do that by grouping on CONVERSATION_ID. 
# For now, let's keep it simple.

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
print("Train size:", len(train_df), "Test size:", len(test_df))


Train size: 773 Test size: 194


In [27]:
# ============================================
# 6) Convert to Hugging Face Dataset
# ============================================
train_dataset = Dataset.from_pandas(train_df[["TEXT", "numeric_label"]])
test_dataset  = Dataset.from_pandas(test_df[["TEXT", "numeric_label"]])

# rename "numeric_label" to "labels" so Trainer knows it's the label
train_dataset = train_dataset.rename_column("numeric_label", "labels")
test_dataset  = test_dataset.rename_column("numeric_label", "labels")

print("Train example:", train_dataset[0])


Train example: {'TEXT': 'While we understand the urgency, we must adhere to our internal protocols for legal matters. All requests of this nature need to be documented and reviewed by our legal team.', 'labels': 21, '__index_level_0__': 720}


In [28]:
# ============================================
# 7) Tokenization
# ============================================
# We'll load a BERT tokenizer (could be "bert-base-uncased" or any variant).
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples["TEXT"], 
        truncation=True,
        padding="max_length",
        max_length=128
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function,  batched=True)

# Remove the original text column to keep only tokenized fields
train_dataset = train_dataset.remove_columns(["TEXT"])
test_dataset  = test_dataset.remove_columns(["TEXT"])

# set format to PyTorch
train_dataset.set_format("torch")
test_dataset.set_format("torch")

print("Train sample after tokenization:", train_dataset[0])


Map: 100%|██████████| 773/773 [00:00<00:00, 3776.61 examples/s]
Map: 100%|██████████| 194/194 [00:00<00:00, 4434.06 examples/s]

Train sample after tokenization: {'labels': tensor(21), '__index_level_0__': tensor(720), 'input_ids': tensor([  101,  2096,  2057,  3305,  1996, 19353,  1010,  2057,  2442, 25276,
         2000,  2256,  4722, 16744,  2005,  3423,  5609,  1012,  2035, 11186,
         1997,  2023,  3267,  2342,  2000,  2022,  8832,  1998,  8182,  2011,
         2256,  3423,  2136,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,    




In [29]:
# ============================================
# 8) Load the Model for Sequence Classification
# ============================================
# We'll define the number of labels as the length of label_list.

num_labels = len(label_list)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

# We'll also define a custom compute_metrics function for evaluation
def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, f1_score
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# ============================================
# 9) Define Training Arguments
# ============================================
training_args = TrainingArguments(
    output_dir="turn_classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# We'll use the Trainer API from Hugging Face
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [31]:
# ============================================
# 10) Train the Model
# ============================================
trainer.train()


  9%|▊         | 50/582 [00:14<01:57,  4.54it/s]

{'loss': 2.6102, 'grad_norm': 7.814505577087402, 'learning_rate': 4.570446735395189e-05, 'epoch': 0.26}


 17%|█▋        | 100/582 [00:25<01:48,  4.46it/s]

{'loss': 2.0148, 'grad_norm': 9.731621742248535, 'learning_rate': 4.140893470790378e-05, 'epoch': 0.52}


 26%|██▌       | 150/582 [00:37<01:37,  4.43it/s]

{'loss': 1.7797, 'grad_norm': 24.606828689575195, 'learning_rate': 3.7113402061855674e-05, 'epoch': 0.77}


                                                 
 33%|███▎      | 194/582 [00:53<07:26,  1.15s/it]

{'eval_loss': 1.780285120010376, 'eval_accuracy': 0.4484536082474227, 'eval_f1': 0.3686819429877295, 'eval_runtime': 3.0667, 'eval_samples_per_second': 63.261, 'eval_steps_per_second': 15.978, 'epoch': 1.0}


 34%|███▍      | 200/582 [00:56<03:44,  1.70it/s]

{'loss': 1.8361, 'grad_norm': 10.021393775939941, 'learning_rate': 3.2817869415807564e-05, 'epoch': 1.03}


 43%|████▎     | 250/582 [01:07<01:14,  4.45it/s]

{'loss': 1.4984, 'grad_norm': 27.179426193237305, 'learning_rate': 2.852233676975945e-05, 'epoch': 1.29}


 52%|█████▏    | 300/582 [01:18<01:02,  4.52it/s]

{'loss': 1.5212, 'grad_norm': 11.409460067749023, 'learning_rate': 2.422680412371134e-05, 'epoch': 1.55}


 60%|██████    | 350/582 [01:30<00:51,  4.52it/s]

{'loss': 1.2466, 'grad_norm': 8.256134033203125, 'learning_rate': 1.9931271477663232e-05, 'epoch': 1.8}


                                                 
 67%|██████▋   | 388/582 [01:40<00:38,  5.01it/s]

{'eval_loss': 1.5115593671798706, 'eval_accuracy': 0.5721649484536082, 'eval_f1': 0.4911569189178284, 'eval_runtime': 2.618, 'eval_samples_per_second': 74.102, 'eval_steps_per_second': 18.716, 'epoch': 2.0}


 69%|██████▊   | 400/582 [01:45<00:45,  4.02it/s]

{'loss': 1.3203, 'grad_norm': 11.056227684020996, 'learning_rate': 1.5635738831615122e-05, 'epoch': 2.06}


 77%|███████▋  | 450/582 [01:56<00:29,  4.52it/s]

{'loss': 1.2439, 'grad_norm': 8.876267433166504, 'learning_rate': 1.134020618556701e-05, 'epoch': 2.32}


 86%|████████▌ | 500/582 [02:07<00:18,  4.51it/s]

{'loss': 1.0048, 'grad_norm': 8.128521919250488, 'learning_rate': 7.0446735395189e-06, 'epoch': 2.58}


 95%|█████████▍| 550/582 [02:18<00:07,  4.52it/s]

{'loss': 1.0282, 'grad_norm': 11.993239402770996, 'learning_rate': 2.7491408934707903e-06, 'epoch': 2.84}


                                                 
100%|██████████| 582/582 [02:30<00:00,  4.99it/s]

{'eval_loss': 1.4126800298690796, 'eval_accuracy': 0.5721649484536082, 'eval_f1': 0.5082952163815565, 'eval_runtime': 2.6158, 'eval_samples_per_second': 74.164, 'eval_steps_per_second': 18.732, 'epoch': 3.0}


100%|██████████| 582/582 [02:31<00:00,  3.83it/s]

{'train_runtime': 151.8442, 'train_samples_per_second': 15.272, 'train_steps_per_second': 3.833, 'train_loss': 1.5335614968001638, 'epoch': 3.0}





TrainOutput(global_step=582, training_loss=1.5335614968001638, metrics={'train_runtime': 151.8442, 'train_samples_per_second': 15.272, 'train_steps_per_second': 3.833, 'total_flos': 152578352247552.0, 'train_loss': 1.5335614968001638, 'epoch': 3.0})

In [32]:
# ============================================
# 11) Evaluate on the Test Set
# ============================================
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)


100%|██████████| 49/49 [00:02<00:00, 18.91it/s]

Evaluation results: {'eval_loss': 1.4126800298690796, 'eval_accuracy': 0.5721649484536082, 'eval_f1': 0.5082952163815565, 'eval_runtime': 3.003, 'eval_samples_per_second': 64.601, 'eval_steps_per_second': 16.317, 'epoch': 3.0}





In [34]:
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

def classify_utterance(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    # Move inputs to device:
    for k, v in inputs.items():
        inputs[k] = v.to(device)
    with torch.no_grad():
        outputs = model(**inputs)  # model is already on device
        logits = outputs.logits
        predicted_class_id = logits.argmax(dim=-1).item()
    return id2label[predicted_class_id]


In [35]:
sample_text = "Hello, I'm calling about your bank account and need urgent details."
predicted_label = classify_utterance(sample_text)
print(f"Sample Text: {sample_text}")
print(f"Predicted Label: {predicted_label}")

Sample Text: Hello, I'm calling about your bank account and need urgent details.
Predicted Label: scam
