In [1]:
"""
Script to fine-tune BART for conversation prediction (partial -> remainder).

Differences from previous approach:
- Uses a simpler Hugging Face Dataset creation approach with map() for tokenization
- Minimal usage of DataCollatorForSeq2Seq
- Same logic of partial->remainder structure
"""


'\nScript to fine-tune BART for conversation prediction (partial -> remainder).\n\nDifferences from previous approach:\n- Uses a simpler Hugging Face Dataset creation approach with map() for tokenization\n- Minimal usage of DataCollatorForSeq2Seq\n- Same logic of partial->remainder structure\n'

In [2]:
# ==========================================
# 1) Imports
# ==========================================
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split

from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)
import os

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


  from .autonotebook import tqdm as notebook_tqdm
  warn(



Using device: cpu


In [3]:
# ==========================================
# 2) CSV -> Partial->Remainder
# ==========================================
def build_partial_remainder(csv_path, partial_ratio=0.5):
    """
    Build (partial, remainder) text pairs from a conversation CSV.

    The CSV must contain the columns CONVERSATION_ID, CONVERSATION_STEP and
    TEXT. Rows are grouped by CONVERSATION_ID and ordered by
    CONVERSATION_STEP; the first ``partial_ratio`` share of the lines becomes
    the partial and every line after that becomes the remainder.

    Args:
        csv_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        partial_ratio: Fraction of each conversation's lines used as the
            partial. The split point is clamped so that both sides always
            receive at least one line.

    Returns:
        A list of ``(partial_str, remainder_str)`` tuples, lines joined with
        newlines. Conversations with fewer than two usable lines, or whose
        partial/remainder is empty after stripping, are skipped.
    """
    df = pd.read_csv(csv_path)
    pairs = []
    for _, group in df.groupby("CONVERSATION_ID"):
        # Stable sort: rows sharing a CONVERSATION_STEP keep their file order.
        ordered = group.sort_values("CONVERSATION_STEP", kind="mergesort")
        # Empty TEXT cells are read as NaN (a float), which would make
        # str.join raise; drop them and coerce the rest to str.
        texts = ordered["TEXT"].dropna().astype(str).tolist()
        if len(texts) < 2:
            continue
        # At least one line on each side, even when partial_ratio <= 0 or >= 1.
        cutoff = min(max(1, int(len(texts) * partial_ratio)), len(texts) - 1)
        partial_str = "\n".join(texts[:cutoff]).strip()
        remainder_str = "\n".join(texts[cutoff:]).strip()
        if partial_str and remainder_str:
            pairs.append((partial_str, remainder_str))
    return pairs


In [4]:
# Example usage
# The dataset location is machine-specific, so it can be overridden with the
# CONVO_CSV_PATH environment variable; the original path stays the default.
csv_path = os.environ.get(
    "CONVO_CSV_PATH",
    "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv",
)
partial_ratio = 0.5
pairs = build_partial_remainder(csv_path, partial_ratio)
print(f"Number of partial->remainder pairs: {len(pairs)}")
if pairs:
    # Show one example so the split can be checked by eye.
    print("Sample pair:\nPartial:", pairs[0][0], "\nRemainder:", pairs[0][1])


Number of partial->remainder pairs: 76
Sample pair:
Partial: Hello, this is [Your Name]'s personal assistant. How may I assist you today?
Hi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.
Hi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.
Thanks! I was wondering about the skill level required for participants. I'm fairly new to photography.
The workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.
That sounds perfect. What's the registration process? 
Remainder: You can register through our website. I can guide you through the steps if you'd like, or send you a direct link to the registration page.
A direct link would be great. Can you also tell me about the workshop fee?
Certainly, the

In [5]:
# ==========================================
# 3) Create a HuggingFace Datasets object
#    from the partial->remainder pairs
# ==========================================
from datasets import Dataset as HFDataset

def create_hf_dataset(pairs):
    """
    Wrap (partial, remainder) pairs in a Hugging Face Dataset.

    The result has two columns: "source" holds element 0 of every pair and
    "target" holds element 1, in the same order as ``pairs``.
    """
    columns = {}
    for column_name, position in (("source", 0), ("target", 1)):
        columns[column_name] = [pair[position] for pair in pairs]
    return HFDataset.from_dict(columns)

# Build the dataset and print its summary, plus the first record when there is one.
hf_ds = create_hf_dataset(pairs)
print(hf_ds)
if len(hf_ds) != 0:
    print("Sample record:", hf_ds[0])


Dataset({
    features: ['source', 'target'],
    num_rows: 76
})
Sample record: {'source': "Hello, this is [Your Name]'s personal assistant. How may I assist you today?\nHi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.\nHi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.\nThanks! I was wondering about the skill level required for participants. I'm fairly new to photography.\nThe workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.\nThat sounds perfect. What's the registration process?", 'target': "You can register through our website. I can guide you through the steps if you'd like, or send you a direct link to the registration page.\nA direct link would be great. Can you also tell me

In [6]:
# We'll do a 90/10 train/val split.
train_size = int(0.9 * len(hf_ds))
val_size   = len(hf_ds) - train_size
# Fix the shuffle seed so the same rows land in train/val on every run, and
# pick the splits by key instead of relying on the order of .values().
hf_split = hf_ds.train_test_split(test_size=val_size, seed=42)
hf_train, hf_val = hf_split["train"], hf_split["test"]

print("HF train size:", len(hf_train))
print("HF val size  :", len(hf_val))


HF train size: 68
HF val size  : 8


In [7]:
# ==========================================
# 4) Load BART + Tokenizer
# ==========================================
model_name = "facebook/bart-base"  # or bart-large, etc.
# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = BartTokenizer.from_pretrained(model_name)
# Load the weights and move the model to the selected device in one step.
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
print("Loaded BART + Tokenizer from:", model_name)


Loaded BART + Tokenizer from: facebook/bart-base


In [8]:
# ==========================================
# 5) Tokenization Function
# ==========================================
def tokenize_function(examples, max_source_length=128, max_target_length=128):
    """
    Tokenize a batch of source/target texts for BART seq2seq training.

    Args:
        examples: Batch mapping with "source" (encoder input texts) and
            "target" (texts the decoder should produce).
        max_source_length: Truncation length for the source side (tunable).
        max_target_length: Truncation length for the target side (tunable).

    Returns:
        The source encoding with an added "labels" entry holding the target
        token ids. No padding is applied here.
    """
    model_inputs = tokenizer(
        examples["source"],
        max_length=max_source_length,
        truncation=True,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the label side.
    labels = tokenizer(
        text_target=examples["target"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches, dropping the raw text columns afterwards.
hf_train = hf_train.map(
    tokenize_function,
    batched=True,
    remove_columns=["source", "target"],
)
hf_val = hf_val.map(
    tokenize_function,
    batched=True,
    remove_columns=["source", "target"],
)

# Have both datasets hand back torch tensors when indexed.
hf_train.set_format(type="torch")
hf_val.set_format(type="torch")

print("Tokenized train sample:", hf_train[0])


Map: 100%|██████████| 68/68 [00:00<00:00, 71.57 examples/s]
Map: 100%|██████████| 8/8 [00:00<00:00, 157.66 examples/s]

Tokenized train sample: {'input_ids': tensor([    0,   713,  2175,    16,   223,   110,   766,     6,    45,   223,
           84,   827,    18,   766,     4, 50118,  9904,     6,  4420,     4,
        50118, 32730,     6,    52,  1395,  1719,     5,   455,   379,   153,
        20858,   771,    19,    84,   827,    18,  1188,     4, 50118,  9904,
            6,  4420,     4, 50118, 13984,     6,   965,    75,    14,  4577,
          116, 50118,  9904,     6,  4420,     4, 50118,  1106,    84,   827,
        14617,    24,  4378,     6,    24,   115,    28, 32085,    25,    10,
        15178,  4628,  2541,     4, 50118,  9904,     4, 50118,   170,    40,
         1719,   843,    12,  1096,   207,    31,     5,  1049,  6084,     8,
          291,    12,   541,   207,    31,     5,   400,  6084,     4, 50118,
         9904,     4, 50118,   133,  2405,   291,    12,   541,   207,   782,
            7,    28,  2913,    30,    47,     6,     8,    52,   581, 27736,
           47,   423,     




In [9]:
# ==========================================
# 6) DataCollatorForSeq2Seq
# ==========================================
# Collator that pads every batch to its own longest sequence and returns
# PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="pt",
    padding="longest",
)


In [10]:
# ==========================================
# 7) Trainer Setup
# ==========================================
# NOTE(review): `eval_strategy` and `processing_class` are the current names of
# the deprecated `evaluation_strategy` / `tokenizer` arguments; they need a
# recent transformers release -- confirm against the pinned version.
training_args = TrainingArguments(
    output_dir="bart_conversation_predict_2",  # <--- param: where to save
    overwrite_output_dir=True,
    num_train_epochs=3,                       # <--- param: tune
    per_device_train_batch_size=2,            # <--- param: tune
    per_device_eval_batch_size=2,
    learning_rate=5e-5,                       # <--- param: tune
    weight_decay=0.01,
    eval_strategy="epoch",                    # evaluate at the end of each epoch
    save_strategy="epoch",                    # checkpoint at the end of each epoch
    logging_steps=10,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    processing_class=tokenizer,               # replaces the deprecated `tokenizer=`
    data_collator=data_collator
)


  trainer = Trainer(


In [11]:
# ==========================================
# 8) Fine-Tune
# ==========================================
# Launch fine-tuning with the Trainer configured above. As the cell's final
# expression, the value returned by train() is what the notebook displays.
trainer.train()


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
 10%|▉         | 10/102 [00:48<05:13,  3.41s/it]

{'loss': 3.887, 'grad_norm': 9.756758689880371, 'learning_rate': 4.5098039215686275e-05, 'epoch': 0.29}


 20%|█▉        | 20/102 [01:20<04:18,  3.15s/it]

{'loss': 3.3955, 'grad_norm': 10.599475860595703, 'learning_rate': 4.0196078431372555e-05, 'epoch': 0.59}


 29%|██▉       | 30/102 [01:52<03:48,  3.17s/it]

{'loss': 3.1173, 'grad_norm': 8.892078399658203, 'learning_rate': 3.529411764705883e-05, 'epoch': 0.88}


                                                


{'eval_loss': 2.6425771713256836, 'eval_runtime': 2.3227, 'eval_samples_per_second': 3.444, 'eval_steps_per_second': 1.722, 'epoch': 1.0}


 39%|███▉      | 40/102 [02:32<03:40,  3.56s/it]

{'loss': 2.7964, 'grad_norm': 8.188032150268555, 'learning_rate': 3.0392156862745097e-05, 'epoch': 1.18}


 49%|████▉     | 50/102 [03:03<02:44,  3.16s/it]

{'loss': 2.5413, 'grad_norm': 8.344756126403809, 'learning_rate': 2.5490196078431373e-05, 'epoch': 1.47}


 59%|█████▉    | 60/102 [03:36<02:16,  3.25s/it]

{'loss': 2.6826, 'grad_norm': 8.38615608215332, 'learning_rate': 2.058823529411765e-05, 'epoch': 1.76}


                                                
 67%|██████▋   | 68/102 [04:05<01:48,  3.20s/it]

{'eval_loss': 2.4609251022338867, 'eval_runtime': 2.2121, 'eval_samples_per_second': 3.616, 'eval_steps_per_second': 1.808, 'epoch': 2.0}


 69%|██████▊   | 70/102 [04:14<02:21,  4.43s/it]

{'loss': 2.436, 'grad_norm': 6.876885414123535, 'learning_rate': 1.568627450980392e-05, 'epoch': 2.06}


 78%|███████▊  | 80/102 [04:46<01:10,  3.21s/it]

{'loss': 2.2743, 'grad_norm': 8.150529861450195, 'learning_rate': 1.0784313725490197e-05, 'epoch': 2.35}


 88%|████████▊ | 90/102 [05:17<00:37,  3.10s/it]

{'loss': 2.2932, 'grad_norm': 8.273405075073242, 'learning_rate': 5.882352941176471e-06, 'epoch': 2.65}


 98%|█████████▊| 100/102 [05:48<00:06,  3.00s/it]

{'loss': 2.2278, 'grad_norm': 8.27377700805664, 'learning_rate': 9.80392156862745e-07, 'epoch': 2.94}


                                                 
100%|██████████| 102/102 [06:00<00:00,  3.09s/it]

{'eval_loss': 2.4451236724853516, 'eval_runtime': 2.3735, 'eval_samples_per_second': 3.371, 'eval_steps_per_second': 1.685, 'epoch': 3.0}


100%|██████████| 102/102 [06:03<00:00,  3.56s/it]


{'train_runtime': 363.3148, 'train_samples_per_second': 0.561, 'train_steps_per_second': 0.281, 'train_loss': 2.756316970376407, 'epoch': 3.0}


TrainOutput(global_step=102, training_loss=2.756316970376407, metrics={'train_runtime': 363.3148, 'train_samples_per_second': 0.561, 'train_steps_per_second': 0.281, 'total_flos': 14915916288000.0, 'train_loss': 2.756316970376407, 'epoch': 3.0})

In [12]:
# ==========================================
# 9) Save Fine-Tuned Model
# ==========================================
save_dir = "bart_conversation_finetuned_model_2"
# Write the model and the tokenizer into the same directory.
trainer.save_model(output_dir=save_dir)
tokenizer.save_pretrained(save_directory=save_dir)

print("Fine-tuned BART model + tokenizer saved to:", save_dir)


Fine-tuned BART model + tokenizer saved to: bart_conversation_finetuned_model_2


: 