In [1]:
"""
Conversation Prediction (Partial->Remainder) with a From-Scratch Encoder-Decoder
using BERT Tokenizer for subword merges.

Steps:
1) Read CSV of conversation data, build partial->remainder pairs.
2) Tokenize with BertTokenizer (WordPiece).
3) Train a minimal Transformer-based seq2seq.
4) Save model + tokenizer for reuse.
5) Load model + tokenizer and run inference to generate remainder from partial snippet.
"""


'\nConversation Prediction (Partial->Remainder) with a From-Scratch Encoder-Decoder\nusing BERT Tokenizer for subword merges.\n\nSteps:\n1) Read CSV of conversation data, build partial->remainder pairs.\n2) Tokenize with BertTokenizer (WordPiece).\n3) Train a minimal Transformer-based seq2seq.\n4) Save model + tokenizer for reuse.\n5) Load model + tokenizer and run inference to generate remainder from partial snippet.\n'

In [2]:
# ========================================
# 1) Imports
# ========================================
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os

from transformers import BertTokenizer, PreTrainedTokenizerFast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [5]:
# ========================================
# 2) CSV -> Partial->Remainder
# ========================================
def build_convo_pairs(csv_path, partial_ratio=0.5):
    """
    Reads CSV with columns (CONVERSATION_ID, CONVERSATION_STEP, TEXT, ...)
    Groups lines by conversation, sorts by step, then splits each conversation
    into partial vs remainder text segments based on partial_ratio.
    """
    df = pd.read_csv(csv_path)
    data_pairs = []
    for convo_id, group in df.groupby("CONVERSATION_ID"):
        group_sorted = group.sort_values("CONVERSATION_STEP")
        texts = group_sorted["TEXT"].tolist()
        if len(texts) < 2:
            continue

        cutoff = max(1, int(len(texts)*partial_ratio))
        partial_list = texts[:cutoff]
        remainder_list = texts[cutoff:]

        partial_str = "\n".join(partial_list).strip()
        remainder_str = "\n".join(remainder_list).strip() if remainder_list else ""

        data_pairs.append({
            "partial": partial_str,
            "remainder": remainder_str
        })
    return data_pairs

# Example usage (adapt the path):
csv_path = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv"  # <-- update with your CSV path
data_pairs = build_convo_pairs(csv_path, partial_ratio=0.5)
print("Number of partial->remainder pairs:", len(data_pairs))
if len(data_pairs)>0:
    print("Sample pair:\n", data_pairs[0])


Number of partial->remainder pairs: 76
Sample pair:
 {'partial': "Hello, this is [Your Name]'s personal assistant. How may I assist you today?\nHi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.\nHi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.\nThanks! I was wondering about the skill level required for participants. I'm fairly new to photography.\nThe workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.\nThat sounds perfect. What's the registration process?", 'remainder': "You can register through our website. I can guide you through the steps if you'd like, or send you a direct link to the registration page.\nA direct link would be great. Can you also tell me about the workshop fee?

In [6]:
# ========================================
# 3) BERT Tokenizer Setup
# ========================================
# We'll use the "bert-base-uncased" tokenizer from Hugging Face.
# BERT typically has [PAD], [CLS], [SEP], etc. We'll rely on [PAD] for padding.

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
pad_token_id = tokenizer.pad_token_id
print("Vocab size:", len(tokenizer))
print("Pad token:", tokenizer.pad_token, "ID:", pad_token_id)


Vocab size: 30522
Pad token: [PAD] ID: 0


In [7]:
# ========================================
# 4) Dataset + Collate
# ========================================
class ConversationDataset(Dataset):
    """
    Each item: partial -> remainder text
    We'll tokenize with the BERT tokenizer (WordPiece).
    """
    def __init__(self, data_pairs, tokenizer, max_len=64):
        self.data = data_pairs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ex = self.data[idx]
        partial_text = ex["partial"]
        remainder_text= ex["remainder"]

        # encode partial
        enc_partial = self.tokenizer.encode(
            partial_text,
            add_special_tokens=False,
            truncation=True,
            max_length=self.max_len
        )
        # encode remainder
        enc_remainder= self.tokenizer.encode(
            remainder_text,
            add_special_tokens=False,
            truncation=True,
            max_length=self.max_len
        )

        return {
            "enc_partial": enc_partial,
            "enc_remainder": enc_remainder
        }

def collate_fn(batch):
    partial_list = [b["enc_partial"] for b in batch]
    remainder_list= [b["enc_remainder"] for b in batch]

    max_len_enc = max(len(p) for p in partial_list)
    max_len_dec = max(len(r) for r in remainder_list)

    partial_padded = []
    remainder_padded= []

    for p, r in zip(partial_list, remainder_list):
        p_pad = p + [pad_token_id]*(max_len_enc-len(p))
        r_pad = r + [pad_token_id]*(max_len_dec-len(r))
        partial_padded.append(p_pad)
        remainder_padded.append(r_pad)

    return {
        "partial_batch": torch.tensor(partial_padded, dtype=torch.long),
        "remainder_batch": torch.tensor(remainder_padded, dtype=torch.long)
    }

dataset = ConversationDataset(data_pairs, tokenizer, max_len=64)
loader  = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# quick test
for batch in loader:
    print("partial_batch shape:", batch["partial_batch"].shape)
    print("remainder_batch shape:", batch["remainder_batch"].shape)
    break


partial_batch shape: torch.Size([2, 64])
remainder_batch shape: torch.Size([2, 64])


In [8]:
# ========================================
# 5) Minimal Encoder-Decoder
# ========================================
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,d_model,2)*(-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(position*div_term)
        pe[:, 1::2] = torch.cos(position*div_term)
        self.register_buffer("pe", pe.unsqueeze(0))  # shape [1, max_len, d_model]

    def forward(self, x):
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :].to(x.device)

class TransformerEncoderDecoder(nn.Module):
    def __init__(
        self,
        vocab_size,
        d_model=128,
        nhead=4,
        num_encoder_layers=2,
        num_decoder_layers=2,
        pad_token_id=0
    ):
        super().__init__()
        self.d_model = d_model
        self.pad_token_id = pad_token_id

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.pos_dec = PositionalEncoding(d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            batch_first=True
        )
        self.out_fc = nn.Linear(d_model, vocab_size)

    def _generate_causal_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz))==1
        mask = mask.transpose(0,1).masked_fill(mask==1, float('-inf'))
        return mask

    def forward(self, src, tgt):
        """
        src, tgt shapes: [B, seq_len]
        """
        enc_emb = self.embedding(src)*math.sqrt(self.d_model)
        enc_emb = self.pos_enc(enc_emb)

        dec_emb = self.embedding(tgt)*math.sqrt(self.d_model)
        dec_emb = self.pos_dec(dec_emb)

        src_key_padding_mask = (src==self.pad_token_id)
        tgt_key_padding_mask = (tgt==self.pad_token_id)

        seq_len_dec = tgt.size(1)
        causal_mask = self._generate_causal_mask(seq_len_dec).to(src.device)

        out = self.transformer(
            src=enc_emb,
            tgt=dec_emb,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            tgt_mask=causal_mask
        )
        logits = self.out_fc(out)
        return logits


In [15]:
# ========================================
# 6) Training
# ========================================
vocab_size = len(tokenizer)
model = TransformerEncoderDecoder(
    vocab_size=vocab_size,
    d_model=128,
    nhead=4,
    num_encoder_layers=2,
    num_decoder_layers=2,
    pad_token_id=pad_token_id
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss=0.0
    for batch in loader:
        partial_batch = batch["partial_batch"].to(device)
        remainder_batch= batch["remainder_batch"].to(device)

        # teacher forcing
        dec_in = remainder_batch[:, :-1]
        labels = remainder_batch[:, 1:].contiguous()

        optimizer.zero_grad()
        logits = model(partial_batch, dec_in)  # [B, dec_len, vocab_size]
        B,L,V = logits.shape
        loss = criterion(logits.view(B*L,V), labels.view(B*L))
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    avg_loss = total_loss/len(loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss={avg_loss:.4f}")


Epoch 1/10, Loss=7.8924
Epoch 2/10, Loss=5.3842
Epoch 3/10, Loss=4.6943
Epoch 4/10, Loss=4.1671
Epoch 5/10, Loss=3.7374
Epoch 6/10, Loss=3.3957
Epoch 7/10, Loss=3.1258
Epoch 8/10, Loss=2.9069
Epoch 9/10, Loss=2.7317
Epoch 10/10, Loss=2.6361


In [16]:
# ========================================
# 7) Save Model + Tokenizer
# ========================================
save_dir = "conversation_model_bert_tokenizer"
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, "model_state.pt")
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

# We can save the tokenizer in Hugging Face format
tokenizer.save_pretrained(save_dir)
print(f"Tokenizer saved to {save_dir}")


Model saved to conversation_model_bert_tokenizer\model_state.pt
Tokenizer saved to conversation_model_bert_tokenizer


In [17]:
# ========================================
# 8) Inference: Generating the Remainder
# ========================================
def load_model_and_tokenizer(model_dir):
    from transformers import BertTokenizer
    # Load tokenizer
    loaded_tokenizer = BertTokenizer.from_pretrained(model_dir)

    # We must reconstruct the same model architecture
    # (ideally, store hyperparams in a config file)
    model = TransformerEncoderDecoder(
        vocab_size=len(loaded_tokenizer),
        d_model=128,
        nhead=4,
        num_encoder_layers=2,
        num_decoder_layers=2,
        pad_token_id=loaded_tokenizer.pad_token_id
    )
    # Load weights
    model_state_path = os.path.join(model_dir, "model_state.pt")
    model.load_state_dict(torch.load(model_state_path, map_location=device))
    model.to(device)
    model.eval()
    return model, loaded_tokenizer

def generate_remainder(model, tokenizer, partial_text, max_new_tokens=50):
    model.eval()
    with torch.no_grad():
        enc_partial = tokenizer.encode(
            partial_text,
            add_special_tokens=False
        )
        src = torch.tensor([enc_partial], dtype=torch.long).to(device)

        # We'll start the decoder with a single pad token to serve as "BOS"
        dec_in = [tokenizer.pad_token_id]
        dec_tensor = torch.tensor([dec_in], dtype=torch.long).to(device)

        for _ in range(max_new_tokens):
            logits = model(src, dec_tensor)
            next_token_logits = logits[0, -1, :]
            next_id = torch.argmax(next_token_logits).item()

            dec_in.append(next_id)
            dec_tensor = torch.tensor([dec_in], dtype=torch.long).to(device)

            # Stop if we produce a pad token again or if you define some other stopping
            if next_id == tokenizer.pad_token_id:
                break

        # decode the subwords after the initial "pad" start
        generated_ids = dec_in[1:]
        text_out = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return text_out


In [18]:
# Example usage
loaded_model, loaded_tok = load_model_and_tokenizer(save_dir)

test_partial = "Hello, I'm Sam. I saw an ad about a photography workshop."
print("=== PARTIAL ===")
print(test_partial)
res = generate_remainder(loaded_model, loaded_tok, test_partial)
print("\n=== PREDICTED REMAINDER ===")
print(res)


=== PARTIAL ===
Hello, I'm Sam. I saw an ad about a photography workshop.

=== PREDICTED REMAINDER ===



  model.load_state_dict(torch.load(model_state_path, map_location=device))
