In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import math
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence


## RAW DATA

In [10]:
# Benign (non-vishing) customer-support conversations.
# Each conversation is a list of utterance strings; every utterance is prefixed
# with its speaker tag ("A: " opens the call, "B: " is the other party).
legal_conversations = [
    [
        "A: Hello, I’m Sarah from XYZ Bank’s support team. How can I help you today?",
        "B: Hi Sarah, I was looking to update my address on my account. Could you assist me with that?",
        "A: Certainly. Could you please verify the last four digits of your account number?",
        "B: The last four digits are 1234.",
        "A: Great. I see your current address is 123 Elm Street. What would you like to update it to?",
        "B: I’d like it changed to 456 Oak Avenue, Springfield.",
        "A: Perfect. I’ve updated the address. Is there anything else I can help with?",
        "B: No, that’s all. Thank you so much.",
        "A: You’re welcome! Have a wonderful day."
    ],
    [
        "A: Good morning, this is Max from ABC Internet Services. How may I assist you?",
        "B: Hi Max, my internet has been running slower than usual. Can you help me figure out why?",
        "A: Sure, let’s run a quick diagnostic. Could you confirm the email address associated with your account?",
        "B: It’s jane.doe@example.com.",
        "A: Thanks. I see there’s some scheduled maintenance in your area which might cause slow speeds. It should be resolved by tomorrow morning.",
        "B: Got it, thanks for checking. Is there any way to get a temporary speed boost?",
        "A: Unfortunately, not during maintenance. But I can offer you a small credit for the inconvenience. Would that help?",
        "B: That would be great. Thanks!",
        "A: I’ve applied a $5 credit. Anything else I can do?",
        "B: No, that’s all. Appreciate your help.",
        "A: My pleasure. Have a nice day!"
    ],
    [
        "A: Hello, Julie from Secure Payments. How can I help?",
        "B: Hi Julie, I want to set a travel notice on my credit card.",
        "A: Absolutely. Could I have the last transaction amount you made so I can verify your identity?",
        "B: My last transaction was $45 at GroceryMart.",
        "A: Perfect, I see that. What dates and countries will you be traveling to?",
        "B: I’ll be in Germany from June 10th to June 20th.",
        "A: Got it. I’ve placed a travel notice for those dates. You’re all set.",
        "B: Thank you, that’s all I needed.",
        "A: You’re welcome. Safe travels!"
    ]
]

In [11]:
# Vishing (voice-phishing) conversations: speaker "A" is the caller pressing
# speaker "B" for credentials or payment. Same structure as the benign set:
# a list of conversations, each a list of speaker-tagged utterance strings.
vishing_conversations = [
    [
        "A: Hello, this is Andrew calling from Premium Bank’s fraud department.",
        "B: Oh, hi. Is there an issue with my account?",
        "A: Yes, we noticed several suspicious charges. Could you provide your full account number so we can secure your account immediately?",
        "B: I’m not comfortable giving my full account number over the phone.",
        "A: It’s urgent! Your account is at risk right now. If you don’t provide the account and your PIN, we can’t protect your money.",
        "B: I should call the official bank number before giving this information.",
        "A: There’s no time. Just give me your PIN, we’ll reverse the charges right now.",
        "B: I’ll hang up and check with the bank directly. Goodbye.",
        "A: Wait, no, don’t disconnect—!"
    ],
    [
        "A: Hi, I’m calling from the government tax office. We have an urgent notice for you.",
        "B: The tax office? Is there a problem?",
        "A: Yes, there is a warrant for your arrest due to unpaid taxes. To fix this, you need to pay immediately.",
        "B: That sounds suspicious. I don’t think the tax office calls like this.",
        "A: If you don’t give me your credit card number right now, the police will be at your door in an hour.",
        "B: No, I’m going to hang up and verify this through official channels.",
        "A: Don’t you dare hang up! You must pay now!",
        "B: (Hangs up)"
    ],
    [
        "A: Good afternoon, this is Alex from Techy Support for your mobile service.",
        "B: Hi, what’s the issue?",
        "A: Your phone has been compromised. To fix it, I need your password and PIN so I can access your device remotely.",
        "B: That’s not normal procedure.",
        "A: It’s an emergency! Hackers are stealing your data. Give me your PIN so I can lock them out.",
        "B: I’m going to call the official support line and verify.",
        "A: No time! They’ll steal everything right now if you don’t comply!",
        "B: I don’t believe you. Goodbye.",
        "A: Wait, I…!"
    ]
]

In [12]:
# Single corpus: benign conversations first, vishing conversations after
all_conversations = [*legal_conversations, *vishing_conversations]


## Get pre-trained vocabulary

In [2]:
!pip install transformers



In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Tokenizer carrying the pre-trained 'bert-base-uncased' vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Matching pre-trained BERT model, used here to extract embeddings
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Keep BERT frozen (no fine-tuning): turn off gradient tracking on every parameter
for param in bert_model.parameters():
    param.requires_grad_(False)


tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 7.79kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.40MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.88MB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 177kB/s]
model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:
import torch

# Select the compute device: the CUDA GPU when one is usable, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("GPU is available" if device.type == 'cuda' else "Only CPU is available")

Only CPU is available


In [8]:
# Move the frozen BERT model onto the selected device.
# This cell previously referenced `model`, a name no cell above defines, so it
# raised NameError on a fresh kernel (Restart & Run All). `bert_model` is the
# model this notebook actually loads.
bert_model = bert_model.to(device)

In [9]:
class ConversationDataset(Dataset):
    """Next-token-prediction dataset built from multi-turn conversations.

    Every conversation is joined into one string, tokenized once in
    ``__init__`` and cached in ``self.data`` as an
    ``(input_ids, attention_mask)`` pair of 1-D tensors of length
    ``max_length``.

    Args:
        conversations: Iterable of conversations, each an iterable of
            utterance strings.
        tokenizer: Hugging Face-style tokenizer, invoked as
            ``tokenizer(text, ...)``. It must define a padding token because
            ``padding='max_length'`` is requested; a tokenizer without one
            raises ``ValueError`` during tokenization.
        max_length: Exact token length every conversation is truncated or
            padded to (special tokens included).
    """

    def __init__(self, conversations, tokenizer, max_length=512):
        self.data = []
        for conv in conversations:
            # Flatten the conversation into a single string
            conv_text = " ".join(conv)
            # Call the tokenizer directly: `encode_plus` is deprecated in
            # favour of `__call__`, which takes the same arguments here.
            encoding = tokenizer(
                conv_text,
                add_special_tokens=True,
                max_length=max_length,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            )
            # return_tensors='pt' yields a leading batch dimension of 1; drop it
            input_ids = encoding['input_ids'].squeeze(0)  # Shape: [max_length]
            attention_mask = encoding['attention_mask'].squeeze(0)  # Shape: [max_length]
            self.data.append((input_ids, attention_mask))

    def __len__(self):
        """Return the number of tokenized conversations."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(inputs, targets, mask)`` for conversation ``idx``.

        For language modeling the targets are the inputs shifted left by one
        token: ``inputs = input_ids[:-1]``, ``targets = input_ids[1:]``. The
        returned mask is ``attention_mask[:-1]``, i.e. aligned with the
        inputs. All three tensors have length ``max_length - 1``.
        """
        input_ids, attention_mask = self.data[idx]
        return input_ids[:-1], input_ids[1:], attention_mask[:-1]

In [13]:
def collate_fn(batch):
    """Stack per-sample ``(input, target, mask)`` triples into batch tensors.

    Args:
        batch: Sequence of ``(input_ids, target_ids, attention_mask)`` tuples
            whose tensors all share the same length (no padding happens here).

    Returns:
        Tuple ``(inputs, targets, src_mask)`` where ``inputs`` and ``targets``
        are stacked to shape ``[B, S]`` and ``src_mask`` has two singleton
        axes inserted, giving shape ``[B, 1, 1, S]``.
    """
    inputs, targets, masks = zip(*batch)
    stacked_inputs = torch.stack(inputs)
    stacked_targets = torch.stack(targets)
    # Indexing with None inserts the two singleton axes: [B, S] -> [B, 1, 1, S]
    broadcast_mask = torch.stack(masks)[:, None, None, :]
    return stacked_inputs, stacked_targets, broadcast_mask

# Tokenize every conversation up front, then serve shuffled mini-batches of two
dataset = ConversationDataset(conversations=all_conversations, tokenizer=tokenizer)
dataloader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.