In [9]:
"""
Conversation Prediction with a Custom Encoder-Decoder,
using GPT-2 Tokenizer for subword merges.

Steps:
1) Load a CSV of conversation data, group by CONVERSATION_ID, and build
   (partial, remainder) pairs.
2) Use GPT2Tokenizer from transformers for tokenization (subword approach).
3) Build a minimal from-scratch encoder-decoder Transformer in PyTorch.
4) Train the model on partial->remainder generation.
5) Demonstrate inference (greedy decoding).
"""


'\nConversation Prediction with a Custom Encoder-Decoder,\nusing GPT-2 Tokenizer for subword merges.\n\nSteps:\n1) Load a CSV of conversation data, group by CONVERSATION_ID, and build\n   (partial, remainder) pairs.\n2) Use GPT2Tokenizer from transformers for tokenization (subword approach).\n3) Build a minimal from-scratch encoder-decoder Transformer in PyTorch.\n4) Train the model on partial->remainder generation.\n5) Demonstrate inference (greedy decoding).\n'

In [10]:
# ========================================
# 1) Imports
# ========================================
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import GPT2Tokenizer
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


In [12]:
# ========================================
# 2) CSV -> Partial->Remainder
# ========================================
def build_convo_pairs(csv_path, partial_ratio=0.5):
    """
    Reads CSV with columns:
       CONVERSATION_ID, CONVERSATION_STEP, TEXT
    Groups by conversation ID, sorts by step, then
    splits into (partial, remainder) text segments.

    partial_ratio determines the fraction of lines
    used for partial snippet, remainder is the rest.
    """
    df = pd.read_csv(csv_path)
    data_pairs = []
    for convo_id, group in df.groupby("CONVERSATION_ID"):
        group_sorted = group.sort_values("CONVERSATION_STEP")
        texts = group_sorted["TEXT"].tolist()
        if len(texts) < 2:
            continue

        cutoff = max(1, int(len(texts)*partial_ratio))
        partial_list = texts[:cutoff]
        remainder_list = texts[cutoff:]

        partial_str = "\n".join(partial_list).strip()
        remainder_str = "\n".join(remainder_list).strip() if remainder_list else ""

        data_pairs.append({
            "partial": partial_str,
            "remainder": remainder_str
        })

    return data_pairs

# Example usage (Update csv_path to your actual path)
csv_path = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv"  # <-- update with your CSV path
conversation_data = build_convo_pairs(csv_path, partial_ratio=0.5)
print("Number of partial->remainder pairs:", len(conversation_data))
if len(conversation_data) > 0:
    print("Sample pair:\n", conversation_data[0])


Number of partial->remainder pairs: 76
Sample pair:
 {'partial': "Hello, this is [Your Name]'s personal assistant. How may I assist you today?\nHi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.\nHi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.\nThanks! I was wondering about the skill level required for participants. I'm fairly new to photography.\nThe workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.\nThat sounds perfect. What's the registration process?", 'remainder': "You can register through our website. I can guide you through the steps if you'd like, or send you a direct link to the registration page.\nA direct link would be great. Can you also tell me about the workshop fee?

In [13]:
# ========================================
# 3) GPT-2 Tokenizer Setup
# ========================================
# We'll use GPT2Tokenizer for subword merges.
# GPT-2 doesn't define a pad token by default, so we
# set pad_token = eos_token for convenience.

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # reuse <|endoftext|> as pad
pad_token_id = tokenizer.eos_token_id

print("GPT2 Vocab size:", len(tokenizer))
print("EOS token id:", pad_token_id)


GPT2 Vocab size: 50257
EOS token id: 50256


In [14]:
# ========================================
# 4) PyTorch Dataset
# ========================================
class ConversationDataset(Dataset):
    """
    Each item: partial -> remainder text.
    We'll tokenize with GPT2Tokenizer.
    """
    def __init__(self, data_pairs, tokenizer, max_len=64):
        self.data = data_pairs
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        ex = self.data[idx]
        partial_text = ex["partial"]
        remainder_text= ex["remainder"]

        enc_partial = self.tokenizer.encode(
            partial_text,
            add_special_tokens=False,
            max_length=self.max_len,
            truncation=True
        )
        enc_remainder = self.tokenizer.encode(
            remainder_text,
            add_special_tokens=False,
            max_length=self.max_len,
            truncation=True
        )

        return {
            "enc_partial": enc_partial,
            "enc_remainder": enc_remainder
        }

def collate_fn(batch):
    partial_list = [b["enc_partial"] for b in batch]
    remainder_list= [b["enc_remainder"] for b in batch]

    max_len_enc = max(len(p) for p in partial_list)
    max_len_dec = max(len(r) for r in remainder_list)

    partial_padded = []
    remainder_padded= []

    for p, r in zip(partial_list, remainder_list):
        p_pad = p + [pad_token_id]*(max_len_enc - len(p))
        r_pad = r + [pad_token_id]*(max_len_dec - len(r))
        partial_padded.append(p_pad)
        remainder_padded.append(r_pad)

    return {
        "partial_batch": torch.tensor(partial_padded, dtype=torch.long),
        "remainder_batch": torch.tensor(remainder_padded, dtype=torch.long)
    }

dataset = ConversationDataset(conversation_data, tokenizer, max_len=64)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# quick test
for batch in loader:
    print("partial_batch shape:", batch["partial_batch"].shape)
    print("remainder_batch shape:", batch["remainder_batch"].shape)
    break


partial_batch shape: torch.Size([2, 64])
remainder_batch shape: torch.Size([2, 64])


In [15]:
# ========================================
# 5) Minimal Encoder-Decoder from scratch
# ========================================
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,d_model,2)*(-math.log(10000.0)/d_model))
        pe[:,0::2] = torch.sin(position*div_term)
        pe[:,1::2] = torch.cos(position*div_term)
        self.register_buffer("pe", pe.unsqueeze(0))  # shape [1, max_len, d_model]

    def forward(self, x):
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :].to(x.device)
        return x

class TransformerEncoderDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_encoder_layers=2, num_decoder_layers=2, pad_token_id=50256):
        super().__init__()
        self.d_model = d_model
        self.pad_token_id = pad_token_id
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.pos_dec = PositionalEncoding(d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def _generate_causal_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz))==1).transpose(0,1)
        mask = mask.masked_fill(mask==1, float('-inf'))
        return mask

    def forward(self, src, tgt):
        # src, tgt shapes: [B, seq_len]
        enc_emb = self.embedding(src)*math.sqrt(self.d_model)
        enc_emb = self.pos_enc(enc_emb)

        dec_emb = self.embedding(tgt)*math.sqrt(self.d_model)
        dec_emb = self.pos_dec(dec_emb)

        src_key_padding_mask = (src==self.pad_token_id)
        tgt_key_padding_mask = (tgt==self.pad_token_id)

        seq_len_dec = tgt.size(1)
        causal_mask = self._generate_causal_mask(seq_len_dec).to(src.device)

        out = self.transformer(
            src=enc_emb,
            tgt=dec_emb,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            tgt_mask=causal_mask
        )
        logits = self.fc_out(out)  # [B, tgt_len, vocab_size]
        return logits


In [16]:
# ========================================
# 6) Train
# ========================================
vocab_size = len(tokenizer)
model = TransformerEncoderDecoder(
    vocab_size=vocab_size,
    d_model=128,
    nhead=4,
    num_encoder_layers=2,
    num_decoder_layers=2,
    pad_token_id=pad_token_id
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)

epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss=0.0
    for batch in loader:
        partial_batch = batch["partial_batch"].to(device)
        remainder_batch= batch["remainder_batch"].to(device)

        # teacher forcing
        dec_in = remainder_batch[:, :-1]
        labels = remainder_batch[:, 1:].contiguous()

        optimizer.zero_grad()
        logits = model(partial_batch, dec_in)  # [B, dec_len, vocab_size]

        B,L,V = logits.shape
        loss = criterion(logits.view(B*L,V), labels.view(B*L))
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    avg_loss = total_loss/len(loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss={avg_loss:.4f}")

Epoch 1/5, Loss=8.4660
Epoch 2/5, Loss=5.7548
Epoch 3/5, Loss=4.9522
Epoch 4/5, Loss=4.3168
Epoch 5/5, Loss=3.7345


In [17]:
# ========================================
# 7) Inference: Generating the Remainder
# ========================================
def generate_remainder(model, tokenizer, partial_text, max_new_tokens=50):
    """
    Provide partial text -> model tries to produce remainder.
    We do a simple greedy decode step-by-step
    """
    model.eval()
    with torch.no_grad():
        # encode partial
        enc_partial = tokenizer.encode(partial_text, add_special_tokens=False)
        src = torch.tensor([enc_partial], dtype=torch.long).to(device)

        # start decoder with e.g. [pad/eos], or any dummy "start" token
        dec_in = [tokenizer.eos_token_id]  # placeholder as "BOS"
        dec_tensor = torch.tensor([dec_in], dtype=torch.long).to(device)

        for _ in range(max_new_tokens):
            logits = model(src, dec_tensor)
            next_token_logits = logits[0, -1, :]
            next_id = torch.argmax(next_token_logits).item()

            dec_in.append(next_id)
            dec_tensor = torch.tensor([dec_in], dtype=torch.long).to(device)

            if next_id == tokenizer.eos_token_id:
                break

        # skip the first token (the placeholder)
        generated_ids = dec_in[1:]
        text_out = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return text_out

# test on a partial snippet
test_partial = "Hello, I'm Sam. I saw an ad about a photography workshop."
gen_remainder = generate_remainder(model, tokenizer, test_partial, max_new_tokens=30)
print("=== PARTIAL ===")
print(test_partial)
print("\n=== PREDICTED REMAINDER ===")
print(gen_remainder)


=== PARTIAL ===
Hello, I'm Sam. I saw an ad about a photography workshop.

=== PREDICTED REMAINDER ===
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
