In [None]:
# # BART Conversation Completion Example

In [1]:
# =========================================
# 1) Install and Import Dependencies
# =========================================
# !pip install transformers datasets accelerate

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# =========================================
# 2) Load the Conversation CSV
# =========================================
# We'll assume your CSV has columns like:
# CONVERSATION_ID, CONVERSATION_STEP, TEXT, ...
# We will group each conversation and build partial->full pairs.

df = pd.read_csv("/Users/ashansubodha/Desktop/VOIP Vishing/conversation-prediction/BETTER30.csv")

# Inspect columns
print(df.head())

# Example columns might be:
# CONVERSATION_ID | CONVERSATION_STEP | TEXT | CONTEXT | LABEL (etc.)


   CONVERSATION_ID  CONVERSATION_STEP  \
0                6                  1   
1                6                  2   
2                6                  3   
3                6                  4   
4                6                  5   

                                                TEXT  \
0  Good morning, this is [Your Name]'s personal a...   
1  Hello, my name is Jamie. I'm interested in vol...   
2  Yes, I'm really passionate about environmental...   
3  Great, how do I sign up, and where can I find ...   
4  Could you send me the link, please? And my ema...   

                             CONTEXT    LABEL                      FEATURES  \
0          Standard opening exchange  neutral                           NaN   
1   Encourages the caller's interest  neutral      welcoming, positive_tone   
2    Reinforces anyone can volunteer  neutral                     inclusive   
3           Demonstrates flexibility  neutral  helpful_tone, offers_options   
4  Fulfills caller's 

In [14]:
df

Unnamed: 0,CONVERSATION_ID,CONVERSATION_STEP,TEXT,CONTEXT,LABEL,FEATURES,ANNOTATIONS
0,6,1,"Good morning, this is [Your Name]'s personal a...",Standard opening exchange,neutral,,
1,6,2,"Hello, my name is Jamie. I'm interested in vol...",Encourages the caller's interest,neutral,"welcoming, positive_tone",
2,6,3,"Yes, I'm really passionate about environmental...",Reinforces anyone can volunteer,neutral,inclusive,
3,6,4,"Great, how do I sign up, and where can I find ...",Demonstrates flexibility,neutral,"helpful_tone, offers_options",
4,6,5,"Could you send me the link, please? And my ema...",Fulfills caller's request quickly,neutral,prompt_action,
...,...,...,...,...,...,...,...
645,68,5,Would it be possible to receive some literatur...,"""Assistant requests official documentation fo...",scam_response,"""literature_request","decision_making_review"""
646,68,6,We can certainly provide information on our wo...,"""Scammer stresses the urgency of the situation.""",scam,"""immediate_action_urgency","urgency_stress"""
647,68,7,"I understand the urgency, but our policy is to...","""Assistant reiterates the policy on verifying...",scam_response,"""contribution_policy_reiteration","information_for_review_request"""
648,68,8,I'll do my best to get you the information qui...,"""Scammer promises to send information and hig...",scam,"""information_sending_promise","support_impact_highlight"""


In [3]:
# =========================================
# 3) Build (source, target) pairs
# =========================================
# We define a function that, for each conversation:
#   - Sorts by CONVERSATION_STEP
#   - Takes the first X% of lines as a "partial" conversation
#   - Takes the entire conversation as "full"
#   - Creates a row: { "source": partial, "target": full }

def build_partial_full_pairs(df, partial_ratio=0.3):
    """
    partial_ratio: portion of conversation to treat as the 'partial' snippet.
    """
    rows = []
    grouped = df.groupby("CONVERSATION_ID")

    for convo_id, group in grouped:
        group_sorted = group.sort_values("CONVERSATION_STEP")
        texts = group_sorted["TEXT"].tolist()

        # Full conversation as one string
        full_convo = "\n".join(texts)

        # partial: first partial_ratio% lines
        cutoff = max(1, int(len(texts) * partial_ratio))  # at least 1 line
        partial_texts = texts[:cutoff]
        partial_convo = "\n".join(partial_texts)

        rows.append({
            "source": partial_convo,
            "target": full_convo
        })

    return pd.DataFrame(rows)

pairs_df = build_partial_full_pairs(df, partial_ratio=0.3)
print(f"Built {len(pairs_df)} conversation pairs.")
print(pairs_df.head())
# columns: source, target



Built 65 conversation pairs.
                                              source  \
0  Hello, this is [Your Name]'s personal assistan...   
1  Hello, this is the personal assistant of [Your...   
2  Hello, this is the assistant to [Your Name]. H...   
3  Hello, this is the office of [Your Name]. How ...   
4  Hello, you've reached the assistant for [Your ...   

                                              target  
0  Hello, this is [Your Name]'s personal assistan...  
1  Hello, this is the personal assistant of [Your...  
2  Hello, this is the assistant to [Your Name]. H...  
3  Hello, this is the office of [Your Name]. How ...  
4  Hello, you've reached the assistant for [Your ...  


In [4]:
# =========================================
# 4) Create a Hugging Face Dataset
# =========================================
dataset = Dataset.from_pandas(pairs_df[["source", "target"]])

# Split into train/test
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset  = split_dataset["test"]

print("Train examples:", len(train_dataset))
print("Eval examples:",  len(eval_dataset))
print("Sample train example:\n", train_dataset[0])


Train examples: 52
Eval examples: 13
Sample train example:
 {'source': "Hello, this is the assistant to [Your Name]. How may I assist you today?\nGood day! I'm Sarah from the Global Aid Foundation. We're currently running a campaign to support victims of the recent natural disaster in [Region]. We're reaching out to compassionate individuals like [Your Name] for support.", 'target': "Hello, this is the assistant to [Your Name]. How may I assist you today?\nGood day! I'm Sarah from the Global Aid Foundation. We're currently running a campaign to support victims of the recent natural disaster in [Region]. We're reaching out to compassionate individuals like [Your Name] for support.\nAbsolutely. Our foundation has been actively involved in various relief efforts worldwide. This particular campaign focuses on providing shelter, food, and medical supplies to the affected individuals in [Region].\nWe're in the process of updating our website with the latest campaign details. However, we can 

In [5]:
# =========================================
# 5) Prepare the BART Tokenizer
# =========================================
# We'll use "facebook/bart-base" as an example. 
# You can also try "facebook/bart-large", "facebook/bart-large-cnn", etc.

model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)


In [6]:
# =========================================
# 6) Tokenization Function
# =========================================
# For seq2seq models:
#   - "source" => encoder input
#   - "target" => decoder output

def preprocess_function(examples):
    # examples["source"] -> partial conversation text
    # examples["target"] -> full conversation text
    model_inputs = tokenizer(
        examples["source"],
        max_length=512,  # adjust if needed
        truncation=True
    )

    # Tokenize target
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target"],
            max_length=512,  # adjust if needed
            truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset  = eval_dataset.map(preprocess_function,  batched=True)

# Remove original columns
train_dataset = train_dataset.remove_columns(["source", "target"])
eval_dataset  = eval_dataset.remove_columns(["source", "target"])

# Convert to PyTorch format
train_dataset.set_format("torch")
eval_dataset.set_format("torch")

print(train_dataset[0])


Map: 100%|██████████| 52/52 [00:00<00:00, 486.60 examples/s]
Map: 100%|██████████| 13/13 [00:00<00:00, 452.65 examples/s]

{'input_ids': tensor([    0, 31414,     6,    42,    16,     5,  3167,     7,   646, 12861,
        10704,  8174,  1336,   189,    38,  3991,    47,   452,   116, 50118,
        12350,   183,   328,    38,   437,  4143,    31,     5,  1849, 11572,
         2475,     4,   166,   214,   855,   878,    10,   637,     7,   323,
         1680,     9,     5,   485,  1632,  4463,    11,   646, 43575,  8174,
          166,   214,  3970,    66,     7, 23303,  2172,   101,   646, 12861,
        10704,   742,    13,   323,     4,     2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor([    0, 31414,     6,    42,    16,     5,  3167,     7,   646, 12861,
        10704,  8174,  1336,   189,    38,  3991,    47,   452,   116, 50118,
        12350,   183,   328,    38,   437,  4143,    31,  




In [7]:
# =========================================
# 7) Data Collator for Seq2Seq
# =========================================
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_name,
    padding="longest",
    return_tensors="pt"
)


In [8]:
# =========================================
# 8) Load BART For Conditional Generation
# =========================================
model = BartForConditionalGeneration.from_pretrained(model_name)
model = model.cuda() if torch.cuda.is_available() else model


In [9]:
# =========================================
# 9) Training Arguments
# =========================================
training_args = TrainingArguments(
    output_dir="bart-conversation-model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    push_to_hub=False
)

training_args




TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

In [10]:
# =========================================
# 10) Define Trainer
# =========================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)


In [11]:
# =========================================
# 11) Train the Model
# =========================================
trainer.train()

trainer.save_model("bart-conversation-finetuned")
tokenizer.save_pretrained("bart-conversation-finetuned")


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
                                               


{'eval_loss': 2.1283204555511475, 'eval_runtime': 3.5765, 'eval_samples_per_second': 3.635, 'eval_steps_per_second': 1.957, 'epoch': 1.0}


                                               
 67%|██████▋   | 52/78 [01:00<00:22,  1.18it/s]

{'eval_loss': 1.8945937156677246, 'eval_runtime': 0.5509, 'eval_samples_per_second': 23.599, 'eval_steps_per_second': 12.707, 'epoch': 2.0}


                                               
100%|██████████| 78/78 [01:20<00:00,  1.70it/s]

{'eval_loss': 1.8643345832824707, 'eval_runtime': 0.6154, 'eval_samples_per_second': 21.124, 'eval_steps_per_second': 11.375, 'epoch': 3.0}


100%|██████████| 78/78 [01:22<00:00,  1.06s/it]


{'train_runtime': 82.7807, 'train_samples_per_second': 1.884, 'train_steps_per_second': 0.942, 'train_loss': 2.3080841455704126, 'epoch': 3.0}


('bart-conversation-finetuned/tokenizer_config.json',
 'bart-conversation-finetuned/special_tokens_map.json',
 'bart-conversation-finetuned/vocab.json',
 'bart-conversation-finetuned/merges.txt',
 'bart-conversation-finetuned/added_tokens.json')

In [13]:
# =========================================
# 12) Inference / Generation
# =========================================
# Suppose we have a partial conversation snippet, and want BART to generate
# the rest (or entire) conversation.

partial_text = """
Caller: Hello, I'm Sanuja from State Bank of Sri Lanka.
Callee: Hi, I'm in a meeting now, can we talk later?
Caller:
"""
# We'll feed this partial text as "source". The model should produce the "target."

# (If needed, load the saved model)
# model = BartForConditionalGeneration.from_pretrained("bart-conversation-finetuned")
# tokenizer = BartTokenizer.from_pretrained("bart-conversation-finetuned")
import torch

# Choose the device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Move model to device
model = model.to(device)

# Encode your partial_text
encoded_input = tokenizer.encode(
    partial_text,
    return_tensors="pt",
    truncation=True,
    max_length=512
)

# Move input IDs to device
encoded_input = encoded_input.to(device)

# Generate
model.eval()

outputs = model.generate(
    encoded_input,
    max_length=200,
    num_beams=4,
    early_stopping=True
)

encoded_input = encoded_input.cuda() if torch.cuda.is_available() else encoded_input

outputs = model.generate(
    encoded_input,
    max_length=200,    # set a max length for generation
    num_beams=4,       # or do_sample=True for sampling
    early_stopping=True
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("=== Generated Conversation ===")
print(generated_text)


=== Generated Conversation ===
Caller: Hello, I'm Sanuja from State Bank of Sri Lanka. Can you please tell me your contact number?Callee: Hi, I was in a meeting now, can we talk later?
Hello, that's my contact number. Can we talk about the meeting?

