In [None]:
# # Conversation Prediction Model with T5

In [1]:
# ============================================
# 0) Install Requirements (if needed)
# ============================================
# In a fresh environment or Google Colab, you might need:
# !pip install transformers datasets accelerate pandas

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    DataCollatorForSeq2Seq, 
    TrainingArguments, 
    Trainer
)
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ============================================
# 1) Load the CSV Data
# ============================================
# The dataset CSV is expected to provide these columns:
#   CONVERSATION_ID, CONVERSATION_STEP, TEXT, CONTEXT, LABEL
# Only CONVERSATION_ID, CONVERSATION_STEP and TEXT are needed for building
# partial->full pairs, so those three are validated right after loading.

# NOTE(review): machine-specific absolute path -- point this at your own copy
# of the dataset before running the notebook on another machine.
DATA_CSV_PATH = "/Users/ashansubodha/Desktop/VOIP Vishing/conversation-prediction/FINAL_DATASET2.csv"
REQUIRED_COLUMNS = ("CONVERSATION_ID", "CONVERSATION_STEP", "TEXT")

df = pd.read_csv(DATA_CSV_PATH)

# Fail fast with a readable message instead of a KeyError in a later cell.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"{DATA_CSV_PATH} is missing required column(s): {missing_columns}")

print("Data size:", len(df))
print(df.head(10))

Data size: 967
   CONVERSATION_ID  CONVERSATION_STEP  \
0                0                  1   
1                0                  2   
2                0                  3   
3                0                  4   
4                0                  5   
5                0                  6   
6                0                  7   
7                0                  8   
8                0                  9   
9                0                 10   

                                                TEXT  \
0  Hello, this is [Your Name]'s personal assistan...   
1  Hi, I'm Sam. I saw an ad about a photography w...   
2  Hi Sam, it's great to hear of your interest in...   
3  Thanks! I was wondering about the skill level ...   
4  The workshop is designed to accommodate all sk...   
5  That sounds perfect. What's the registration p...   
6  You can register through our website. I can gu...   
7  A direct link would be great. Can you also tel...   
8  Certainly, the fee for the

In [3]:
# ============================================
# 2) Build (partial, full) Pairs
# ============================================
# We'll define a function that:
#  1. Groups by CONVERSATION_ID
#  2. Sorts by CONVERSATION_STEP
#  3. Takes the first 'partial_ratio'% of lines as "source"
#  4. Takes all lines as "target"
# This yields a dataset where "source" is the partial conversation,
# and "target" is the entire conversation text.

def build_partial_full_pairs(df, partial_ratio=0.5):
    """
    Build one (source, target) training pair per conversation.

    Rows are grouped by CONVERSATION_ID and ordered by CONVERSATION_STEP.
    The first ``partial_ratio`` share of a conversation's lines (always at
    least one line) is joined with newlines to form ``source``; all of its
    lines joined the same way form ``target``.

    Rows whose TEXT is missing (NaN/None) are skipped and the remaining
    values are converted with ``str``, so a stray non-string cell cannot
    break the join. A conversation left without any text produces no pair.

    Args:
        df: DataFrame with CONVERSATION_ID, CONVERSATION_STEP and TEXT columns.
        partial_ratio: fraction of the conversation to treat as 'partial'.
                       e.g., 0.5 => first 50% is partial, entire conversation is target.

    Returns:
        DataFrame with the columns "source" and "target", one row per
        conversation. Both columns are present even when there are no rows.
    """
    pairs = []

    for _, group in df.groupby("CONVERSATION_ID"):
        ordered_text = group.sort_values("CONVERSATION_STEP")["TEXT"]
        # Drop missing utterances and coerce the rest to text before joining.
        all_texts = [str(text) for text in ordered_text.dropna()]
        if not all_texts:
            continue

        # Keep at least one line so the source is never empty.
        cutoff = max(1, int(len(all_texts) * partial_ratio))

        pairs.append({
            "source": "\n".join(all_texts[:cutoff]),
            "target": "\n".join(all_texts)
        })

    # Explicit columns keep the schema stable when no pair was produced.
    return pd.DataFrame(pairs, columns=["source", "target"])

# Build the pairs with the first half of every conversation as the source.
pairs_df = build_partial_full_pairs(df, partial_ratio=0.5)
print("Number of conversation pairs:", len(pairs_df))
print(pairs_df.head(5))


Number of conversation pairs: 76
                                              source  \
0  Hello, this is [Your Name]'s personal assistan...   
1  Hello, this is the personal assistant of [Your...   
2  Hello, this is the assistant to [Your Name]. H...   
3  Hello, this is the office of [Your Name]. How ...   
4  Hello, you've reached the assistant for [Your ...   

                                              target  
0  Hello, this is [Your Name]'s personal assistan...  
1  Hello, this is the personal assistant of [Your...  
2  Hello, this is the assistant to [Your Name]. H...  
3  Hello, this is the office of [Your Name]. How ...  
4  Hello, you've reached the assistant for [Your ...  


In [5]:
# Bare expression: the notebook shows the pairs table as this cell's output.
pairs_df

Unnamed: 0,source,target
0,"Hello, this is [Your Name]'s personal assistan...","Hello, this is [Your Name]'s personal assistan..."
1,"Hello, this is the personal assistant of [Your...","Hello, this is the personal assistant of [Your..."
2,"Hello, this is the assistant to [Your Name]. H...","Hello, this is the assistant to [Your Name]. H..."
3,"Hello, this is the office of [Your Name]. How ...","Hello, this is the office of [Your Name]. How ..."
4,"Hello, you've reached the assistant for [Your ...","Hello, you've reached the assistant for [Your ..."
...,...,...
71,"Hello, this is the personal assistant of [Your...","Hello, this is the personal assistant of [Your..."
72,"Hello, this is a call from the National Tax Se...","Hello, this is a call from the National Tax Se..."
73,"Hello, is this Mr. OOO?\nYes, this is him.\nHi...","Hello, is this Mr. OOO?\nYes, this is him.\nHi..."
74,"This bond is under your name, not under our ba...","This bond is under your name, not under our ba..."


In [None]:
# NOTE(review): repeats the previous cell's display of pairs_df and has no
# execution count -- likely a leftover that can be deleted.
pairs_df

In [4]:
# ============================================
# 3) Create a Hugging Face Dataset
# ============================================
# Only the two text columns are handed to the Hugging Face Dataset.
dataset = Dataset.from_pandas(pairs_df[["source", "target"]])

# Let's do a simple train_test_split
# (20% of the pairs are held out for evaluation; a seed is passed so the
# split is repeatable)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset  = split_dataset["test"]

# Quick sanity check of the split sizes and one raw training example.
print("Train size:", len(train_dataset))
print("Eval size:", len(eval_dataset))
print("Sample train example:\n", train_dataset[0])


Train size: 60
Eval size: 16
Sample train example:
 {'source': "Hello, this is the assistant to [Your Name]. May I know the reason for your call?\nHi, I'm calling from 'QuickTech Support' about a critical security issue flagged on [Your Name]'s computer. Immediate action is required.\nCould you specify what kind of security issue has been detected?\nWe've detected multiple virus infections that could compromise personal data. We urgently need to run a remote clean-up procedure.\nBefore we proceed, could you provide me with details on when and how the issue was detected?\nOur system conducts regular sweeps and your IP address showed up in our latest scan. We're here to help resolve this immediately.\nI need to cross-verify this with our internal security protocols before any remote access is granted.", 'target': "Hello, this is the assistant to [Your Name]. May I know the reason for your call?\nHi, I'm calling from 'QuickTech Support' about a critical security issue flagged on [Your Nam

In [6]:
# ============================================
# 4) Tokenizer Setup (T5)
# ============================================
# Checkpoint name; later cells reuse it when loading the model weights.
model_name = "t5-small"  # or "t5-base", "flan-t5-base", etc.
tokenizer = T5Tokenizer.from_pretrained(model_name)

# T5 typically uses special tokens; we'll rely on T5's own defaults.
# If there's a mismatch for pad_token, do:
#   tokenizer.pad_token = tokenizer.eos_token


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# ============================================
# 5) Preprocessing Function
# ============================================
# We treat "source" as the input (encoder) and "target" as the output (decoder).

# Token budgets passed as max_length (with truncation=True) to the tokenizer
# calls in preprocess_function below.
max_input_length = 256
max_target_length = 256

def preprocess_function(examples):
    """
    Tokenize a batch of (source, target) pairs for seq2seq training.

    Args:
        examples: batch mapping with "source" and "target" entries, as
                  supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the sources (truncated to
        ``max_input_length``) with an added "labels" entry holding the
        token ids of the targets (truncated to ``max_target_length``).
    """
    # Encode source
    model_inputs = tokenizer(
        examples["source"],
        max_length=max_input_length,
        truncation=True
    )
    # Encode target. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["target"],
        max_length=max_target_length,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches. The raw text columns are dropped in the
# same map call so only the tokenized fields remain, and rows are then
# returned as torch tensors.
text_columns = ["source", "target"]

train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=text_columns,
).with_format("torch")

eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=text_columns,
).with_format("torch")

print("Processed train sample:", train_dataset[0])


Map: 100%|██████████| 60/60 [00:00<00:00, 911.79 examples/s]
Map: 100%|██████████| 16/16 [00:00<00:00, 934.53 examples/s]

Processed train sample: {'input_ids': tensor([ 8774,     6,    48,    19,     8,  6165,    12,   784, 21425,  5570,
         4275,   932,    27,   214,     8,  1053,    21,    39,   580,    58,
         2018,     6,    27,    31,    51,  3874,    45,     3,    31,  5991,
         3142,  9542,  4224,    31,    81,     3,     9,  2404,  1034,   962,
         5692,  5402,    30,   784, 21425,  5570,   908,    31,     7,  1218,
            5,  1318,  5700,   342,  1041,    19,   831,     5,  9348,    25,
        11610,   125,   773,    13,  1034,   962,    65,   118, 14619,    58,
          101,    31,   162, 14619,  1317,  6722, 13315,    24,   228, 12326,
          525,   331,     5,   101, 10839,   120,   174,    12,   661,     3,
            9,  4322,  1349,    18,   413,  3979,     5,  3103,    62,  8669,
            6,   228,    25,   370,   140,    28,  1030,    30,   116,    11,
          149,     8,   962,    47, 14619,    58,   421,   358,  3498,     7,
         1646, 17695,     




In [8]:
# ============================================
# 6) Data Collator for Seq2Seq
# ============================================
# Batches the tokenized examples, padding each batch to its longest member.
# No `model=` argument is passed: the collator expects a model object there
# (not a checkpoint-name string), and the model is only loaded in a later cell.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding="longest",  # or "max_length"
    return_tensors="pt"
)


In [9]:
# ============================================
# 7) Load T5 Model
# ============================================
# Use the GPU when one is visible, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)


In [10]:
# ============================================
# 8) Training Arguments
# ============================================
# Evaluation and checkpointing both run once per epoch; the two schedules
# are kept identical because load_best_model_at_end=True reloads a saved
# checkpoint chosen from the evaluation results.
training_args = TrainingArguments(
    output_dir="t5-conversation-prediction",
    overwrite_output_dir=True,
    num_train_epochs=3,       # adjust for real data
    per_device_train_batch_size=2,  # adjust
    per_device_eval_batch_size=2,
    eval_strategy="epoch",    # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    load_best_model_at_end=True,
    push_to_hub=False
)

print(training_args)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_



In [11]:
# ============================================
# 9) Define Trainer
# ============================================
# Wire the model, training arguments, tokenized splits and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)


In [12]:
# ============================================
# 10) Train the Model
# ============================================
# Run the fine-tuning loop configured by training_args.
trainer.train()

# Save final model
# (the tokenizer files are written to the same directory as the model)
trainer.save_model("t5-conversation-prediction")
tokenizer.save_pretrained("t5-conversation-prediction")


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
                                               
 33%|███▎      | 30/90 [00:20<00:23,  2.52it/s]

{'eval_loss': 1.965167760848999, 'eval_runtime': 1.5266, 'eval_samples_per_second': 10.481, 'eval_steps_per_second': 5.241, 'epoch': 1.0}


 56%|█████▌    | 50/90 [00:28<00:14,  2.68it/s]

{'loss': 2.8344, 'grad_norm': 2.227100372314453, 'learning_rate': 2.2222222222222223e-05, 'epoch': 1.67}


                                               
 67%|██████▋   | 60/90 [00:32<00:09,  3.09it/s]

{'eval_loss': 1.768527626991272, 'eval_runtime': 0.5225, 'eval_samples_per_second': 30.625, 'eval_steps_per_second': 15.312, 'epoch': 2.0}


                                               
100%|██████████| 90/90 [00:45<00:00,  2.83it/s]

{'eval_loss': 1.727165937423706, 'eval_runtime': 0.4735, 'eval_samples_per_second': 33.791, 'eval_steps_per_second': 16.896, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 90/90 [00:46<00:00,  1.93it/s]

{'train_runtime': 46.7162, 'train_samples_per_second': 3.853, 'train_steps_per_second': 1.927, 'train_loss': 2.470164320203993, 'epoch': 3.0}





('t5-conversation-prediction/tokenizer_config.json',
 't5-conversation-prediction/special_tokens_map.json',
 't5-conversation-prediction/spiece.model',
 't5-conversation-prediction/added_tokens.json')

In [15]:
# ============================================
# 11) Generate (Inference)
# ============================================
# We'll define a helper function that, given a partial conversation,
# uses the fine-tuned T5 to generate the rest.

def predict_conversation(partial_convo, max_new_tokens=100):
    """
    Generate text for a partial conversation with the module-level model.

    Args:
        partial_convo: conversation text used as the encoder input; it is
                       truncated to 256 tokens.
        max_new_tokens: upper bound on the number of tokens to generate.

    Returns:
        The first returned sequence, decoded to a string with special
        tokens removed.
    """
    model.eval()
    inputs = tokenizer(
        partial_convo,
        return_tensors="pt",
        truncation=True,
        max_length=256
    )
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        # Forward the budget under its own name; it was previously passed as
        # `max_length`, which is a different generation setting.
        max_new_tokens=max_new_tokens,
        num_beams=4,       # or do_sample=True for sampling
        early_stopping=True
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Example usage
# The prompt is the opening of a call; max_new_tokens=300 is passed as the
# generation budget.
partial_conversation = """Good Morning, I am Sanuja calling on behalf of State Bank of Sri Lanka. Oh, hi. I'm actually in a meeting right now. Could you call later?
"""
completion = predict_conversation(partial_conversation, max_new_tokens=300)
print("=== Generated Conversation ===\n", completion)


=== Generated Conversation ===
 Good Morning, I am Sanuja calling on behalf of State Bank of Sri Lanka. Oh, hi. I'm actually in a meeting right now. Could you call later?
