In [1]:
"""
Conversation Prediction from CSV in a Jupyter Notebook.

Steps:
1) Read the CSV data containing columns:
   CONVERSATION_ID, CONVERSATION_STEP, TEXT, CONTEXT, LABEL
2) For each conversation_id, sort by conversation_step, 
   then build partial -> remainder pairs.
3) Tokenize the text (simple approach or real subword approach).
4) Train a minimal encoder-decoder Transformer on partial->remainder.
5) Demonstrate how to predict the remainder from a partial snippet.
"""


'\nConversation Prediction from CSV in a Jupyter Notebook.\n\nSteps:\n1) Read the CSV data containing columns:\n   CONVERSATION_ID, CONVERSATION_STEP, TEXT, CONTEXT, LABEL\n2) For each conversation_id, sort by conversation_step, \n   then build partial -> remainder pairs.\n3) Tokenize the text (simple approach or real subword approach).\n4) Train a minimal encoder-decoder Transformer on partial->remainder.\n5) Demonstrate how to predict the remainder from a partial snippet.\n'

In [2]:
# ========================================
# 1) Imports
# ========================================
import pandas as pd
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


In [4]:
# ========================================
# 2) Read the CSV & Build Partial->Remainder
# ========================================


def build_convo_pairs_from_csv(csv_path, partial_ratio=0.5):
    df = pd.read_csv(csv_path)
    # Group by conversation ID
    data_pairs = []
    for convo_id, group in df.groupby("CONVERSATION_ID"):
        group_sorted = group.sort_values("CONVERSATION_STEP")
        # Grab the TEXT column in order
        texts = group_sorted["TEXT"].tolist()

        if len(texts) < 2:
            # If a conversation is too short, skip or handle differently
            continue

        # partial: first partial_ratio portion
        cutoff = max(1, int(len(texts) * partial_ratio))
        partial_list = texts[:cutoff]
        remainder_list = texts[cutoff:]

        partial_str = "\n".join(partial_list)
        remainder_str = "\n".join(remainder_list) if remainder_list else ""

        data_pairs.append({
            "partial": partial_str.strip(),
            "remainder": remainder_str.strip()
        })
    return data_pairs

csv_path = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv"  # <-- update with your CSV path

conversation_data = build_convo_pairs_from_csv(csv_path, partial_ratio=0.5)

print("Number of partial->remainder pairs:", len(conversation_data))
# Peek at one example
if len(conversation_data) > 0:
    print("Example pair:\n", conversation_data[0])


Number of partial->remainder pairs: 76
Example pair:
 {'partial': "Hello, this is [Your Name]'s personal assistant. How may I assist you today?\nHi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.\nHi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.\nThanks! I was wondering about the skill level required for participants. I'm fairly new to photography.\nThe workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.\nThat sounds perfect. What's the registration process?", 'remainder': "You can register through our website. I can guide you through the steps if you'd like, or send you a direct link to the registration page.\nA direct link would be great. Can you also tell me about the workshop fee

In [5]:
# ========================================
# 3) Minimal Word-Level Tokenizer
# ========================================
class SimpleTokenizer:
    def __init__(self):
        self.word2idx = {"<PAD>": 0, "<BOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx2word = {0: "<PAD>", 1: "<BOS>", 2: "<EOS>", 3: "<UNK>"}
        self.vocab_size = 4

    def fit(self, data_pairs):
        # We'll collect partial + remainder text
        idx = 4
        for ex in data_pairs:
            combined_text = ex["partial"] + " " + ex["remainder"]
            words = combined_text.split()
            for w in words:
                if w not in self.word2idx:
                    self.word2idx[w] = idx
                    self.idx2word[idx] = w
                    idx += 1
        self.vocab_size = len(self.word2idx)

    def encode(self, text, add_bos=False, add_eos=False):
        ids = []
        if add_bos:
            ids.append(self.word2idx["<BOS>"])
        for w in text.split():
            wid = self.word2idx.get(w, self.word2idx["<UNK>"])
            ids.append(wid)
        if add_eos:
            ids.append(self.word2idx["<EOS>"])
        return ids

    def decode(self, ids):
        words = []
        for i in ids:
            if i == self.word2idx["<EOS>"]:
                break
            word = self.idx2word.get(i, "<UNK>")
            words.append(word)
        return " ".join(words)

# Build tokenizer
tokenizer = SimpleTokenizer()
tokenizer.fit(conversation_data)
print("Vocab size:", tokenizer.vocab_size)


Vocab size: 2942


In [6]:
# ========================================
# 4) PyTorch Dataset: partial->remainder
# ========================================
class ConversationDataset(Dataset):
    def __init__(self, data_pairs, tokenizer, max_len=50):
        self.data = data_pairs
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        ex = self.data[idx]
        partial_enc = self.tokenizer.encode(ex["partial"], add_bos=True, add_eos=True)
        remainder_enc = self.tokenizer.encode(ex["remainder"], add_bos=True, add_eos=True)

        partial_enc = partial_enc[:self.max_len]
        remainder_enc = remainder_enc[:self.max_len]

        return {
            "partial_enc": partial_enc,
            "remainder_enc": remainder_enc
        }

def collate_fn(batch):
    partial_list = [x["partial_enc"] for x in batch]
    remainder_list = [x["remainder_enc"] for x in batch]

    max_len_partial = max(len(p) for p in partial_list)
    max_len_rem = max(len(r) for r in remainder_list)

    partial_padded = []
    remainder_padded = []
    for p, r in zip(partial_list, remainder_list):
        p_pad = p + [tokenizer.word2idx["<PAD>"]] * (max_len_partial - len(p))
        r_pad = r + [tokenizer.word2idx["<PAD>"]] * (max_len_rem - len(r))
        partial_padded.append(p_pad)
        remainder_padded.append(r_pad)
    
    return {
        "partial_batch": torch.tensor(partial_padded, dtype=torch.long),
        "remainder_batch": torch.tensor(remainder_padded, dtype=torch.long)
    }

dataset = ConversationDataset(conversation_data, tokenizer, max_len=50)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)


In [7]:
# Quick check
for batch in loader:
    print("partial_batch shape:", batch["partial_batch"].shape)
    print("remainder_batch shape:", batch["remainder_batch"].shape)
    break


partial_batch shape: torch.Size([2, 50])
remainder_batch shape: torch.Size([2, 50])


In [8]:
# ========================================
# 5) Minimal Encoder-Decoder Transformer
# ========================================
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # shape: [1, max_len, d_model]

    def forward(self, x):
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :].to(x.device)
        return x

class TransformerEncoderDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.pos_dec = PositionalEncoding(d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            batch_first=True
        )

        self.out_fc = nn.Linear(d_model, vocab_size)
        self.pad_token_id = 0

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz))==1).transpose(0,1)
        mask = mask.masked_fill(mask==1, float('-inf'))
        return mask

    def forward(self, enc_input, dec_input):
        # enc_input: [B, seq_len_enc]
        # dec_input: [B, seq_len_dec]
        # embed
        enc_emb = self.embedding(enc_input) * math.sqrt(self.d_model)
        enc_emb = self.pos_enc(enc_emb)

        dec_emb = self.embedding(dec_input) * math.sqrt(self.d_model)
        dec_emb = self.pos_dec(dec_emb)

        # create masks
        src_key_padding_mask = (enc_input == self.pad_token_id)
        tgt_key_padding_mask = (dec_input == self.pad_token_id)

        seq_len_dec = dec_input.size(1)
        causal_mask = self._generate_square_subsequent_mask(seq_len_dec).to(enc_input.device)

        # pass through transformer
        out = self.transformer(
            src=enc_emb,
            tgt=dec_emb,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            tgt_mask=causal_mask
        )
        logits = self.out_fc(out)  # [B, seq_len_dec, vocab_size]
        return logits


In [9]:
# ========================================
# 6) Training Loop
# ========================================
model = TransformerEncoderDecoder(
    vocab_size=tokenizer.vocab_size,
    d_model=128,
    nhead=4,
    num_layers=2
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.word2idx["<PAD>"])

epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in loader:
        partial_batch = batch["partial_batch"].to(device)
        remainder_batch = batch["remainder_batch"].to(device)

        # Teacher forcing approach: 
        # input to decoder = remainder_batch[:, :-1]
        # we want to predict remainder_batch[:, 1:]
        dec_in = remainder_batch[:, :-1]
        labels = remainder_batch[:, 1:].contiguous()

        optimizer.zero_grad()
        logits = model(partial_batch, dec_in)  # [B, dec_len, vocab_size]
        B, L, V = logits.shape
        loss = criterion(logits.view(B*L, V), labels.view(B*L))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{epochs}  Loss={avg_loss:.4f}")


Epoch 1/5  Loss=7.0010
Epoch 2/5  Loss=6.2273
Epoch 3/5  Loss=5.9828
Epoch 4/5  Loss=5.3786
Epoch 5/5  Loss=4.6634


In [10]:
# ========================================
# 7) Inference: Generating the Remainder
# ========================================
def generate_remainder(model, partial_text, max_gen_len=30):
    model.eval()
    # encode partial
    partial_ids = tokenizer.encode(partial_text, add_bos=True)  # no <EOS> so we can keep appending
    src = torch.tensor([partial_ids], dtype=torch.long).to(device)

    # Start decoder input with <BOS>
    dec_input = [tokenizer.word2idx["<BOS>"]]
    dec_tensor = torch.tensor([dec_input], dtype=torch.long).to(device)

    for _ in range(max_gen_len):
        with torch.no_grad():
            logits = model(src, dec_tensor)
            # last token in decoder
            next_token_logits = logits[0, -1, :]
            next_token_id = torch.argmax(next_token_logits).item()

        dec_input.append(next_token_id)
        dec_tensor = torch.tensor([dec_input], dtype=torch.long).to(device)

        # stop if <EOS>
        if next_token_id == tokenizer.word2idx["<EOS>"]:
            break

    # remove <BOS> from start
    generated_ids = dec_input[1:]
    # decode until <EOS>
    text_out = tokenizer.decode(generated_ids)
    return text_out

# Let's test on an example partial from the dataset
if len(conversation_data) > 0:
    test_partial = conversation_data[0]["partial"]
    print("=== PARTIAL ===")
    print(test_partial)
    generated = generate_remainder(model, test_partial, max_gen_len=30)
    print("\n=== MODEL'S PREDICTION FOR REMAINDER ===")
    print(generated)


=== PARTIAL ===
Hello, this is [Your Name]'s personal assistant. How may I assist you today?
Hi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.
Hi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.
Thanks! I was wondering about the skill level required for participants. I'm fairly new to photography.
The workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.
That sounds perfect. What's the registration process?


  output = torch._nested_tensor_from_mask(



=== MODEL'S PREDICTION FOR REMAINDER ===
<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
