In [1]:
"""
Conversation Prediction Notebook
- Takes a CSV file with columns:
    CONVERSATION_ID, CONVERSATION_STEP, TEXT, ...
- Splits each conversation into partial->remainder
- Trains a from-scratch encoder-decoder Transformer to generate the remainder
  given only the partial snippet.
- Saves the model + tokenizer for reuse.
- Demonstrates how to load and run inference on the saved model.
"""


'\nConversation Prediction Notebook\n- Takes a CSV file with columns:\n    CONVERSATION_ID, CONVERSATION_STEP, TEXT, ...\n- Splits each conversation into partial->remainder\n- Trains a from-scratch encoder-decoder Transformer to generate the remainder\n  given only the partial snippet.\n- Saves the model + tokenizer for reuse.\n- Demonstrates how to load and run inference on the saved model.\n'

In [2]:
# ========================================
# 1) Imports
# ========================================
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import pickle  # for saving tokenizer or other objects

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


In [3]:
# ========================================
# 2) CSV -> Partial->Remainder
# ========================================
def build_convo_pairs(csv_path, partial_ratio=0.5):
    """
    Reads the CSV with columns: 
      CONVERSATION_ID, CONVERSATION_STEP, TEXT
    Groups by conversation ID, sorts by step,
    then splits into (partial, remainder).
    
    partial_ratio indicates what fraction of lines 
    in each conversation is used as 'partial' snippet.
    The remainder lines form the 'remainder' text.
    """
    df = pd.read_csv(csv_path)
    data_pairs = []
    for convo_id, group in df.groupby("CONVERSATION_ID"):
        group_sorted = group.sort_values("CONVERSATION_STEP")
        texts = group_sorted["TEXT"].tolist()
        if len(texts) < 2:
            continue

        cutoff = max(1, int(len(texts)*partial_ratio))
        partial_list = texts[:cutoff]
        remainder_list = texts[cutoff:]

        partial_str = "\n".join(partial_list).strip()
        remainder_str = "\n".join(remainder_list).strip() if remainder_list else ""

        data_pairs.append({
            "partial": partial_str,
            "remainder": remainder_str
        })

    return data_pairs


In [5]:
# Example usage (update csv_path for your real data)
csv_path = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv"  # <-- update with your CSV path
partial_ratio = 0.5

conversation_data = build_convo_pairs(csv_path, partial_ratio)
print("Number of partial->remainder pairs:", len(conversation_data))
if len(conversation_data) > 0:
    print("Sample pair:\n", conversation_data[0])


Number of partial->remainder pairs: 76
Sample pair:
 {'partial': "Hello, this is [Your Name]'s personal assistant. How may I assist you today?\nHi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.\nHi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.\nThanks! I was wondering about the skill level required for participants. I'm fairly new to photography.\nThe workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.\nThat sounds perfect. What's the registration process?", 'remainder': "You can register through our website. I can guide you through the steps if you'd like, or send you a direct link to the registration page.\nA direct link would be great. Can you also tell me about the workshop fee?

In [6]:
# ========================================
# 3) A Minimal Tokenizer (Word-Based)
# ========================================
# For demonstration, we build a naive word-level tokenizer that:
#  - uses <PAD> (0), <BOS> (1), <EOS> (2), <UNK> (3)
#  - collects all unique tokens from partial + remainder text

class SimpleTokenizer:
    def __init__(self):
        self.word2idx = {"<PAD>":0, "<BOS>":1, "<EOS>":2, "<UNK>":3}
        self.idx2word = {0:"<PAD>",1:"<BOS>",2:"<EOS>",3:"<UNK>"}
        self.vocab_size = 4

    def fit(self, data_pairs):
        idx = 4
        for ex in data_pairs:
            combined = ex["partial"] + " " + ex["remainder"]
            words = combined.split()
            for w in words:
                if w not in self.word2idx:
                    self.word2idx[w] = idx
                    self.idx2word[idx] = w
                    idx += 1
        self.vocab_size = len(self.word2idx)

    def encode(self, text, add_bos=False, add_eos=False):
        ids = []
        if add_bos:
            ids.append(self.word2idx["<BOS>"])
        for w in text.split():
            wid = self.word2idx.get(w, self.word2idx["<UNK>"])
            ids.append(wid)
        if add_eos:
            ids.append(self.word2idx["<EOS>"])
        return ids

    def decode(self, ids):
        words = []
        for i in ids:
            if i==self.word2idx["<EOS>"]:
                break
            word = self.idx2word.get(i, "<UNK>")
            words.append(word)
        return " ".join(words)


In [7]:
# Build the tokenizer from conversation_data
tokenizer = SimpleTokenizer()
tokenizer.fit(conversation_data)
print("Vocab size:", tokenizer.vocab_size)
# Example


Vocab size: 2942


In [8]:
# ========================================
# 4) PyTorch Dataset
# ========================================
class ConversationDataset(Dataset):
    def __init__(self, data_pairs, tokenizer, max_len=50):
        self.data = data_pairs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ex = self.data[idx]
        partial_enc = self.tokenizer.encode(ex["partial"], add_bos=True, add_eos=True)
        remainder_enc = self.tokenizer.encode(ex["remainder"], add_bos=True, add_eos=True)

        # Truncate if too long
        partial_enc = partial_enc[:self.max_len]
        remainder_enc = remainder_enc[:self.max_len]

        return {
            "partial_enc": partial_enc,
            "remainder_enc": remainder_enc
        }

def collate_fn(batch):
    partial_list = [b["partial_enc"] for b in batch]
    remainder_list = [b["remainder_enc"] for b in batch]

    max_len_enc = max(len(p) for p in partial_list)
    max_len_dec = max(len(r) for r in remainder_list)

    partial_padded = []
    remainder_padded= []

    for p, r in zip(partial_list, remainder_list):
        p_pad = p + [tokenizer.word2idx["<PAD>"]] * (max_len_enc-len(p))
        r_pad = r + [tokenizer.word2idx["<PAD>"]] * (max_len_dec-len(r))
        partial_padded.append(p_pad)
        remainder_padded.append(r_pad)

    return {
        "partial_batch": torch.tensor(partial_padded, dtype=torch.long),
        "remainder_batch": torch.tensor(remainder_padded, dtype=torch.long)
    }

dataset = ConversationDataset(conversation_data, tokenizer, max_len=50)
loader  = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)


In [9]:
# Quick check
for batch in loader:
    print("partial_batch shape:", batch["partial_batch"].shape)
    print("remainder_batch shape:", batch["remainder_batch"].shape)
    break


partial_batch shape: torch.Size([2, 50])
remainder_batch shape: torch.Size([2, 50])


In [10]:
# ========================================
# 5) A Minimal Encoder-Decoder Model
# ========================================
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,d_model,2)*(-math.log(10000.0)/d_model))
        pe[:,0::2] = torch.sin(position*div_term)
        pe[:,1::2] = torch.cos(position*div_term)
        self.register_buffer("pe", pe.unsqueeze(0))  # shape [1, max_len, d_model]

    def forward(self, x):
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :].to(x.device)
        return x

class TransformerEncoderDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_encoder_layers=2, num_decoder_layers=2):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.pos_dec = PositionalEncoding(d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

        self.pad_token_id = 0  # <PAD> index

    def _generate_causal_mask(self, size):
        mask = (torch.triu(torch.ones(size, size))==1).transpose(0,1)
        mask = mask.masked_fill(mask==1, float('-inf'))
        return mask

    def forward(self, src, tgt):
        # src: [B, src_len]
        # tgt: [B, tgt_len]
        # embeddings
        enc_emb = self.embedding(src)*math.sqrt(self.d_model)
        enc_emb = self.pos_enc(enc_emb)

        dec_emb = self.embedding(tgt)*math.sqrt(self.d_model)
        dec_emb = self.pos_dec(dec_emb)

        # create masks
        src_key_padding_mask = (src==self.pad_token_id)
        tgt_key_padding_mask = (tgt==self.pad_token_id)

        seq_len_tgt = tgt.size(1)
        causal_mask = self._generate_causal_mask(seq_len_tgt).to(src.device)

        out = self.transformer(
            src=enc_emb, 
            tgt=dec_emb,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            tgt_mask=causal_mask
        )
        logits = self.fc_out(out)  # [B, tgt_len, vocab_size]
        return logits


In [11]:
# ========================================
# 6) Train the Model
# ========================================
model = TransformerEncoderDecoder(
    vocab_size=tokenizer.vocab_size,
    d_model=128,
    nhead=4,
    num_encoder_layers=2,
    num_decoder_layers=2
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.word2idx["<PAD>"])

epochs=5
for epoch in range(epochs):
    model.train()
    total_loss=0.0
    for batch in loader:
        partial_batch = batch["partial_batch"].to(device)
        remainder_batch = batch["remainder_batch"].to(device)

        # teacher forcing
        # dec_in => remainder[:,:-1]
        # label => remainder[:,1:]
        dec_in = remainder_batch[:, :-1]
        labels = remainder_batch[:, 1:].contiguous()

        optimizer.zero_grad()
        logits = model(partial_batch, dec_in)  # [B, dec_len, vocab_size]
        B,L,V = logits.shape
        loss = criterion(logits.view(B*L,V), labels.view(B*L))
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    avg_loss = total_loss/len(loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss={avg_loss:.4f}")


Epoch 1/5, Loss=7.0616
Epoch 2/5, Loss=6.2368
Epoch 3/5, Loss=6.0195
Epoch 4/5, Loss=5.4625
Epoch 5/5, Loss=4.7270


In [12]:
# ========================================
# 7) Save the Model + Tokenizer
# ========================================
save_dir = "gpt2_saved_conversation_model"
os.makedirs(save_dir, exist_ok=True)

# Save model state_dict
model_path = os.path.join(save_dir, "model_state.pt")
torch.save(model.state_dict(), model_path)
print(f"Model saved to: {model_path}")

# Save tokenizer
tokenizer_path = os.path.join(save_dir, "tokenizer.pkl")
with open(tokenizer_path, "wb") as f:
    pickle.dump(tokenizer, f)
print(f"Tokenizer saved to: {tokenizer_path}")


Model saved to: gpt2_saved_conversation_model\model_state.pt
Tokenizer saved to: gpt2_saved_conversation_model\tokenizer.pkl


In [13]:
# ========================================
# 8) Reusable Inference Code
# ========================================
# Suppose we want to load the model + tokenizer in a new environment or script 
# and generate the remainder from a partial snippet.

def load_model_and_tokenizer(model_dir):
    # Load tokenizer
    tokenizer_path = os.path.join(model_dir, "tokenizer.pkl")
    with open(tokenizer_path, "rb") as f:
        loaded_tokenizer = pickle.load(f)
    # Instantiate model with correct vocab size
    # Make sure you match d_model, nhead, layers, etc. 
    # or store them in a config file
    model = TransformerEncoderDecoder(
        vocab_size=loaded_tokenizer.vocab_size,
        d_model=128,
        nhead=4,
        num_encoder_layers=2,
        num_decoder_layers=2
    )
    # Load state_dict
    model_path = os.path.join(model_dir, "model_state.pt")
    state_dict = torch.load(model_path, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    model.eval()
    return model, loaded_tokenizer

def generate_remainder(model, tokenizer, partial_text, max_len=50):
    model.eval()
    # encode partial
    partial_ids = tokenizer.encode(partial_text, add_bos=True)  # add <BOS>
    src = torch.tensor([partial_ids], dtype=torch.long)

    # We start decoder input with <BOS>
    dec_in = torch.tensor([[tokenizer.word2idx["<BOS>"]]], dtype=torch.long)

    for _ in range(max_len):
        with torch.no_grad():
            logits = model(src, dec_in)
            # next token from last time-step
            next_token_logits = logits[0,-1,:]
            next_id = torch.argmax(next_token_logits).item()

        # append
        dec_in = torch.cat([dec_in, torch.tensor([[next_id]])], dim=1)

        # stop if <EOS>
        if next_id == tokenizer.word2idx["<EOS>"]:
            break

    # decode the generated tokens after <BOS>
    generated_ids = dec_in[0,1:].tolist()
    text_out = tokenizer.decode(generated_ids)
    return text_out


### Reuse the trained model

In [17]:
# Example of usage:

# (A) In the same script, we can simply reuse the 'model' and 'tokenizer' we already have.
# Or let's simulate how we'd do it from a fresh environment:

loaded_model, loaded_tokenizer = load_model_and_tokenizer("C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/research/3. MidEvaluation/encoder-decoder/gpt2_saved_conversation_model")
loaded_model.to(device)

# Now let's pick a partial snippet from the dataset or custom:
test_partial = "Hello, I'm Sam. I saw an ad about a photography workshop."
print("=== PARTIAL ===")
print(test_partial)

remainder_pred = generate_remainder(loaded_model, loaded_tokenizer, test_partial)
print("\n=== PREDICTED REMAINDER ===")
print(remainder_pred)


  state_dict = torch.load(model_path, map_location=torch.device('cpu'))


=== PARTIAL ===
Hello, I'm Sam. I saw an ad about a photography workshop.

=== PREDICTED REMAINDER ===
<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
