# # T5 Conversation Completion Example

In [1]:
# =========================================
# 1) Install and Import Dependencies
# =========================================
# If you're in a fresh environment (e.g. Google Colab), you may need:
# !pip install transformers datasets accelerate

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# =========================================
# 2) Load the Conversation CSV
# =========================================
# We'll assume your CSV has:
# CONVERSATION_ID, CONVERSATION_STEP, TEXT, ...
# We'll build pairs of (partial_conversation -> full_conversation) for training.

df = pd.read_csv("/Users/ashansubodha/Desktop/VOIP Vishing/conversation-prediction/BETTER30.csv")

# Let's see what columns we have
print(df.head())

# CONVERSATION_ID | CONVERSATION_STEP | TEXT | CONTEXT | LABEL


   CONVERSATION_ID  CONVERSATION_STEP  \
0                6                  1   
1                6                  2   
2                6                  3   
3                6                  4   
4                6                  5   

                                                TEXT  \
0  Good morning, this is [Your Name]'s personal a...   
1  Hello, my name is Jamie. I'm interested in vol...   
2  Yes, I'm really passionate about environmental...   
3  Great, how do I sign up, and where can I find ...   
4  Could you send me the link, please? And my ema...   

                             CONTEXT    LABEL                      FEATURES  \
0          Standard opening exchange  neutral                           NaN   
1   Encourages the caller's interest  neutral      welcoming, positive_tone   
2    Reinforces anyone can volunteer  neutral                     inclusive   
3           Demonstrates flexibility  neutral  helpful_tone, offers_options   
4  Fulfills caller's 

In [3]:
# =========================================
# 3) Build (source, target) pairs
# =========================================
# We want:
# source = partial conversation (e.g., first 30% or 50%)
# target = entire conversation
#
# For demonstration:
#  - We'll group by CONVERSATION_ID
#  - We'll take the first X% of TEXT lines as "partial"
#  - We'll take the entire conversation as "target"
# This is just one approach. You could also take partial => remaining turns.

def build_partial_full_pairs(df, partial_ratio=0.3):
    """
    For each conversation:
      1. Sort by CONVERSATION_STEP
      2. partial_convo = first partial_ratio % of lines
      3. full_convo = all lines
      4. Return (partial_convo_text, full_convo_text)
    """
    rows = []
    grouped = df.groupby("CONVERSATION_ID")

    for convo_id, group in grouped:
        group_sorted = group.sort_values("CONVERSATION_STEP")
        texts = group_sorted["TEXT"].tolist()

        # Convert entire conversation to a single string
        full_convo = "\n".join(texts)

        # Determine partial slice
        cutoff = max(1, int(len(texts) * partial_ratio))  # at least 1 line
        partial_texts = texts[:cutoff]
        partial_convo = "\n".join(partial_texts)

        rows.append({
            "source": partial_convo,
            "target": full_convo
        })

    return pd.DataFrame(rows)

pairs_df = build_partial_full_pairs(df, partial_ratio=0.3)



In [4]:
print(f"Built {len(pairs_df)} conversation pairs.")


Built 65 conversation pairs.


In [5]:
(pairs_df)


Unnamed: 0,source,target
0,"Hello, this is [Your Name]'s personal assistan...","Hello, this is [Your Name]'s personal assistan..."
1,"Hello, this is the personal assistant of [Your...","Hello, this is the personal assistant of [Your..."
2,"Hello, this is the assistant to [Your Name]. H...","Hello, this is the assistant to [Your Name]. H..."
3,"Hello, this is the office of [Your Name]. How ...","Hello, this is the office of [Your Name]. How ..."
4,"Hello, you've reached the assistant for [Your ...","Hello, you've reached the assistant for [Your ..."
...,...,...
60,"Hello, you've reached the assistant to [Your N...","Hello, you've reached the assistant to [Your N..."
61,"Good afternoon, you've reached [Your Name]'s a...","Good afternoon, you've reached [Your Name]'s a..."
62,"Hello, you've reached the office of [Your Name...","Hello, you've reached the office of [Your Name..."
63,"Good morning, [Your Name]'s office. How may I ...","Good morning, [Your Name]'s office. How may I ..."


In [6]:
# =========================================
# 4) Create a Hugging Face Dataset
# =========================================
dataset = Dataset.from_pandas(pairs_df[["source", "target"]])

# Train/test split
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset  = split_dataset["test"]

print("Train examples:", len(train_dataset))
print("Eval examples:",  len(eval_dataset))
print("Sample train example:\n", train_dataset[0])


Train examples: 52
Eval examples: 13
Sample train example:
 {'source': "Hello, this is the assistant to [Your Name]. How may I assist you today?\nGood day! I'm Sarah from the Global Aid Foundation. We're currently running a campaign to support victims of the recent natural disaster in [Region]. We're reaching out to compassionate individuals like [Your Name] for support.", 'target': "Hello, this is the assistant to [Your Name]. How may I assist you today?\nGood day! I'm Sarah from the Global Aid Foundation. We're currently running a campaign to support victims of the recent natural disaster in [Region]. We're reaching out to compassionate individuals like [Your Name] for support.\nAbsolutely. Our foundation has been actively involved in various relief efforts worldwide. This particular campaign focuses on providing shelter, food, and medical supplies to the affected individuals in [Region].\nWe're in the process of updating our website with the latest campaign details. However, we can 

In [None]:
# ! pip install sentencepiece



In [8]:
# =========================================
# 5) Prepare the T5 Tokenizer
# =========================================
model_name = "t5-small"  # or "t5-base", "flan-t5-base", etc.
tokenizer = T5Tokenizer.from_pretrained(model_name)

# T5 uses an <extra_id_0> style for mask, etc., but for general generation,
# we just need the normal tokens.


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# =========================================
# 6) Tokenization Function
# =========================================
# We'll treat "source" as the encoder input, "target" as the decoder output.

def preprocess_function(examples):
    # examples["source"] -> list of partial conversation strings
    # examples["target"] -> list of full conversation strings

    # Encode the source
    model_inputs = tokenizer(
        examples["source"],
        max_length=512,      # adjust as needed
        truncation=True
    )

    # Encode the target
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target"],
            max_length=512,   # adjust if conversations can be long
            truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset  = eval_dataset.map(preprocess_function,  batched=True)

# Remove original columns to keep only tokenized fields
train_dataset = train_dataset.remove_columns(["source","target"])
eval_dataset  = eval_dataset.remove_columns(["source","target"])

# Set format for PyTorch
train_dataset.set_format("torch")
eval_dataset.set_format("torch")

print(train_dataset[0])


Map: 100%|██████████| 52/52 [00:00<00:00, 941.14 examples/s]
Map: 100%|██████████| 13/13 [00:00<00:00, 1172.53 examples/s]

{'input_ids': tensor([ 8774,     6,    48,    19,     8,  6165,    12,   784, 21425,  5570,
         4275,   571,   164,    27,  2094,    25,   469,    58,  1804,   239,
           55,    27,    31,    51,  8077,    45,     8,  3699, 12090,  2941,
            5,   101,    31,    60,  1083,  1180,     3,     9,  2066,    12,
          380,  8926,    13,     8,  1100,   793,  6912,    16,   784, 17748,
           23,   106,  4275,   101,    31,    60,  7232,    91,    12, 21801,
         1742,   114,   784, 21425,  5570,   908,    21,   380,     5,     1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor([ 8774,     6,    48,    19,     8,  6165,    12,   784, 21425,  5570,
         4275,   571,   164,    27,  2094,    25,   469,    58,  1804,   239,
           55,    




In [10]:
# =========================================
# 7) Data Collator for Seq2Seq
# =========================================
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_name,
    padding="longest",  # or "max_length"
    return_tensors="pt"
)


In [11]:
# =========================================
# 8) Load T5 Model
# =========================================
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.cuda() if torch.cuda.is_available() else model


In [12]:
# =========================================
# 9) Training Arguments
# =========================================
training_args = TrainingArguments(
    output_dir="t5-conversation-model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    push_to_hub=False
)

training_args




TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

In [13]:
# =========================================
# 10) Define Trainer
# =========================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)


In [14]:
# =========================================
# 11) Train the Model
# =========================================
trainer.train()




  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
                                               
 33%|███▎      | 26/78 [00:37<00:43,  1.19it/s]

{'eval_loss': 3.0752596855163574, 'eval_runtime': 4.3614, 'eval_samples_per_second': 2.981, 'eval_steps_per_second': 1.605, 'epoch': 1.0}


                                               
 67%|██████▋   | 52/78 [00:55<00:14,  1.75it/s]

{'eval_loss': 2.8930037021636963, 'eval_runtime': 0.545, 'eval_samples_per_second': 23.853, 'eval_steps_per_second': 12.844, 'epoch': 2.0}


                                               
100%|██████████| 78/78 [01:11<00:00,  1.78it/s]

{'eval_loss': 2.8547608852386475, 'eval_runtime': 0.5639, 'eval_samples_per_second': 23.056, 'eval_steps_per_second': 12.415, 'epoch': 3.0}


100%|██████████| 78/78 [01:12<00:00,  1.07it/s]

{'train_runtime': 72.6488, 'train_samples_per_second': 2.147, 'train_steps_per_second': 1.074, 'train_loss': 3.6464949387770433, 'epoch': 3.0}





TrainOutput(global_step=78, training_loss=3.6464949387770433, metrics={'train_runtime': 72.6488, 'train_samples_per_second': 2.147, 'train_steps_per_second': 1.074, 'total_flos': 3755734990848.0, 'train_loss': 3.6464949387770433, 'epoch': 3.0})

In [15]:
# Save final model
trainer.save_model("t5-conversation-finetuned")
tokenizer.save_pretrained("t5-conversation-finetuned")

('t5-conversation-finetuned/tokenizer_config.json',
 't5-conversation-finetuned/special_tokens_map.json',
 't5-conversation-finetuned/spiece.model',
 't5-conversation-finetuned/added_tokens.json')

In [18]:
# =========================================
# 12) Inference / Generation
# =========================================
# Suppose we have a partial conversation snippet, and want T5 to generate
# the entire conversation (or next lines).

partial_text = """
Caller: Good Morning, I'm Sanuja from State Bank of Sri Lanka.
Callee: Oh, hi. I'm in a meeting now. Could you call later?
Caller:
"""
# This is our "source." The model should generate the "target" (the full convo).

# Load model if needed
# model = T5ForConditionalGeneration.from_pretrained("t5-conversation-finetuned")
# tokenizer = T5Tokenizer.from_pretrained("t5-conversation-finetuned")
# model.eval()

encoded_input = tokenizer.encode(
    partial_text,
    return_tensors="pt",
    truncation=True,
    max_length=512
)

encoded_input = encoded_input.cuda() if torch.cuda.is_available() else encoded_input

outputs = model.generate(
    encoded_input,
    max_length=200,       # set a max length for generation
    num_beams=4,          # or do_sample=True for sampling
    early_stopping=True
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("=== Generated Conversation ===")
print(generated_text)


=== Generated Conversation ===
Caller: Good Morning, I'm Sanuja from State Bank of Sri Lanka. Callee: Oh, hi. I'm in meeting now. Could you call later? Caller: Good Morning, I'm Sanuja from State Bank of Sri Lanka. Callee: Oh, hi. I'm in meeting now. Could you call later? Caller: Good Morning, I'm Sanuja from State Bank of Sri Lanka. Callee: Oh, hi. I'm in meeting now. Could


In [None]:
# =========================================
# 12) Inference / Generation
# =========================================
# Suppose we have a partial conversation snippet, and want T5 to generate
# the entire conversation (or next lines).

partial_text = """
Caller: Good Morning, I'm Sanuja from State Bank of Sri Lanka.
Callee: Oh, hi. I'm in a meeting now. Could you call later?
Caller:
"""
# This is our "source." The model should generate the "target" (the full convo).

# Load model if needed
# model = T5ForConditionalGeneration.from_pretrained("t5-conversation-finetuned")
# tokenizer = T5Tokenizer.from_pretrained("t5-conversation-finetuned")
# model.eval()

# Check if MPS is available
import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
model = model.to(device)

encoded_input = tokenizer.encode(
    partial_text,
    return_tensors="pt",
    truncation=True,
    max_length=512
).to(device)  # move tensor to the same device

with torch.no_grad():
    outputs = model.generate(
        encoded_input,
        max_length=200,
        num_beams=4,
        early_stopping=True
    )


encoded_input = encoded_input.cuda() if torch.cuda.is_available() else encoded_input

outputs = model.generate(
    encoded_input,
    max_length=200,       # set a max length for generation
    num_beams=4,          # or do_sample=True for sampling
    early_stopping=True
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("=== Generated Conversation ===")
print(generated_text)
