In [1]:
"""
Fine-Tuning BART so that partial -> entire conversation (not just remainder)

Steps:
1) Build partial->entire pairs from your CSV
2) Convert them into a Hugging Face Dataset (source=partial, target=entire)
3) Tokenize with BartTokenizer
4) Fine-tune BartForConditionalGeneration
5) Save the model for reuse
"""


'\nFine-Tuning BART so that partial -> entire conversation (not just remainder)\n\nSteps:\n1) Build partial->entire pairs from your CSV\n2) Convert them into a Hugging Face Dataset (source=partial, target=entire)\n3) Tokenize with BartTokenizer\n4) Fine-tune BartForConditionalGeneration\n5) Save the model for reuse\n'

In [2]:
# ======================================
# 1) Imports
# ======================================
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    BartTokenizer, 
    BartForConditionalGeneration, 
    DataCollatorForSeq2Seq, 
    TrainingArguments, 
    Trainer
)
import os

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


  from .autonotebook import tqdm as notebook_tqdm
  warn(



Using device: cpu


In [3]:
# ======================================
# 2) Build partial->entire from CSV
# ======================================
def build_partial_entire(csv_path, partial_ratio=0.5):
    """
    Build (partial, entire) training pairs from a conversation CSV.

    The CSV must contain the columns CONVERSATION_ID, CONVERSATION_STEP and
    TEXT. For each conversation (rows sharing a CONVERSATION_ID, ordered by
    CONVERSATION_STEP):
      partial snippet = first int(n_lines * partial_ratio) lines (at least 1),
      entire snippet  = all lines in that conversation.
    Lines are joined with a newline. Rows whose TEXT is missing are ignored,
    and conversations left with fewer than 2 lines are skipped.

    Args:
        csv_path: path (or anything pandas.read_csv accepts) to the CSV.
        partial_ratio: fraction of a conversation's lines used as the partial.

    Returns:
        list of (partial_str, entire_str) tuples.
    """
    df = pd.read_csv(csv_path)
    pairs = []
    for _convo_id, group in df.groupby("CONVERSATION_ID"):
        # Stable sort keeps the file order for rows that share a step number,
        # so the reconstructed conversation is deterministic.
        group_sorted = group.sort_values("CONVERSATION_STEP", kind="mergesort")

        # Empty CSV cells are read as NaN (a float); joining them would raise
        # TypeError, so drop them and make sure every line is a str.
        lines = [str(text) for text in group_sorted["TEXT"].dropna()]
        if len(lines) < 2:
            continue

        # entire conversation
        entire_str = "\n".join(lines).strip()

        # partial snippet: always keep at least the first line
        cutoff = max(1, int(len(lines) * partial_ratio))
        partial_str = "\n".join(lines[:cutoff]).strip()

        if partial_str and entire_str:
            pairs.append((partial_str, entire_str))
    return pairs

# Example usage
# The dataset location can be overridden through the CONVO_CSV_PATH environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local path is used.
csv_path = os.environ.get(
    "CONVO_CSV_PATH",
    "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv",
)
partial_ratio = 0.5
pairs = build_partial_entire(csv_path, partial_ratio)
print("Number of partial->entire pairs:", len(pairs))
if pairs:
    # Show one pair so the partial/entire relationship can be checked by eye.
    print("\nSample pair:\nPartial:", pairs[0][0], "\nEntire:", pairs[0][1])


Number of partial->entire pairs: 76

Sample pair:
Partial: Hello, this is [Your Name]'s personal assistant. How may I assist you today?
Hi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.
Hi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.
Thanks! I was wondering about the skill level required for participants. I'm fairly new to photography.
The workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.
That sounds perfect. What's the registration process? 
Entire: Hello, this is [Your Name]'s personal assistant. How may I assist you today?
Hi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.
Hi Sam, it's g

In [4]:
# ======================================
# 3) Build a HF Dataset
# ======================================
from datasets import Dataset as HFDataset

def create_hf_dataset(pairs):
    """
    Turn a sequence of (partial, entire) pairs into a Hugging Face Dataset
    with two columns: "source" (element 0 of each pair) and "target"
    (element 1 of each pair).
    """
    columns = {
        column_name: [pair[position] for pair in pairs]
        for position, column_name in enumerate(("source", "target"))
    }
    return HFDataset.from_dict(columns)

# Wrap the (partial, entire) pairs in a Hugging Face dataset and summarise it.
hf_ds = create_hf_dataset(pairs)
print(hf_ds)
if len(hf_ds):
    # Print the first record so the source/target columns can be inspected.
    print("\nSample record:", hf_ds[0])


Dataset({
    features: ['source', 'target'],
    num_rows: 76
})

Sample record: {'source': "Hello, this is [Your Name]'s personal assistant. How may I assist you today?\nHi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.\nHi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.\nThanks! I was wondering about the skill level required for participants. I'm fairly new to photography.\nThe workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.\nThat sounds perfect. What's the registration process?", 'target': "Hello, this is [Your Name]'s personal assistant. How may I assist you today?\nHi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in regist

In [5]:
# We'll do a train/val split (90% train, remainder validation).
train_size = int(0.9 * len(hf_ds))
val_size   = len(hf_ds) - train_size

# A fixed seed makes the shuffle, and therefore the split and the reported
# eval losses, reproducible across kernel restarts.
split = hf_ds.train_test_split(test_size=val_size, seed=42)

# Pick the splits by name rather than relying on the order of .values().
hf_train, hf_val = split["train"], split["test"]

print("Train size:", len(hf_train), "Val size:", len(hf_val))


Train size: 68 Val size: 8


In [6]:
# ======================================
# 4) Load BART
# ======================================
model_name = "facebook/bart-base"  # or bart-large if you have bigger GPU
tokenizer = BartTokenizer.from_pretrained(model_name)

# .to(device) hands back the same module, so loading and device placement
# are chained into a single assignment.
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
print("Loaded BART model + tokenizer:", model_name)


Loaded BART model + tokenizer: facebook/bart-base


In [7]:
# ======================================
# 5) Tokenization
# ======================================
def tokenize_fn(examples, max_source_length=128, max_target_length=256):
    """
    Tokenize a batch of examples for seq2seq training.

    Args:
        examples: batch mapping with a "source" list (partial snippets) and a
            "target" list (entire conversations).
        max_source_length: inputs longer than this many tokens are truncated.
        max_target_length: labels longer than this many tokens are truncated
            (the entire conversation is usually longer than the partial one).

    Returns:
        The tokenizer output for the sources, with an added "labels" entry
        holding the token ids of the targets.
    """
    # "source" => partial snippet
    model_inputs = tokenizer(
        examples["source"],
        max_length=max_source_length,   # <--- param: tune
        truncation=True,
    )
    # "target" => entire conversation.
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing labels.
    labels = tokenizer(
        text_target=examples["target"],
        max_length=max_target_length,   # <--- param: tune
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits the same way (train first, then validation); the raw
# text columns are dropped once they have been converted to token ids.
hf_train, hf_val = (
    split.map(tokenize_fn, batched=True, remove_columns=["source", "target"])
    for split in (hf_train, hf_val)
)

# Return torch tensors when the datasets are indexed.
hf_train.set_format("torch")
hf_val.set_format("torch")

print("Train sample after tokenization:", hf_train[0])


Map: 100%|██████████| 68/68 [00:00<00:00, 165.18 examples/s]
Map: 100%|██████████| 8/8 [00:00<00:00, 209.10 examples/s]

Train sample after tokenization: {'input_ids': tensor([    0, 31414,     6,    42,    16,   646, 12861, 10704, 46117,    29,
         3167,     4,  1336,    64,    38,  3991,    47,   452,   116, 50118,
        30086,   328, 24953,     6,   646, 12861, 10704,   742,    34,    57,
         3919,   734, 50118, 10836,    47,   694,    55,  1254,    59,     5,
         3096,   646, 12861, 10704,   742,  2867,     7,   339,    42,  4588,
          116, 50118, 10643,   768,     6,    24,    21,    10,  9624,  4230,
          734, 50118, 35299,    47, 17151,    61, 13778,  3595,     8,     5,
         6089,  1110,     9,    42,  3096,   116, 50118,   243,  1171,   484,
          299,    12, 15512,  3595,   734, 50118,  2709, 14925,  6216,     6,
          189,    38,    33,    10,  5135,   346,    50,   781, 12059,    59,
            5,  4588,   116,     2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,




In [8]:
# ======================================
# 6) DataCollatorForSeq2Seq
# ======================================
# Batches are assembled by DataCollatorForSeq2Seq, configured with the same
# tokenizer and model used for training. The tokenized examples were not
# padded in tokenize_fn, so padding is delegated to the collator here.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",  # presumably pads each batch to its longest sequence; see the collator docs
    return_tensors="pt"
)


In [9]:
# ======================================
# 7) Trainer Setup
# ======================================
# Training configuration. Evaluation and checkpointing both happen once per
# epoch, which is what load_best_model_at_end requires (matching strategies).
# NOTE(review): recent transformers releases deprecate `evaluation_strategy`
# (renamed `eval_strategy`) and `Trainer(tokenizer=...)` (renamed
# `processing_class=`); both are kept here for compatibility with the
# installed version. Confirm before upgrading.
train_args = TrainingArguments(
    output_dir="bart_partial_entire_convo",
    overwrite_output_dir=True,
    num_train_epochs=3,           # <--- param: tune
    per_device_train_batch_size=2,# <--- param: tune
    per_device_eval_batch_size=2,
    learning_rate=5e-5,           # <--- param: tune
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    # With a dataset this small the eval loss can rise again in later epochs,
    # so reload the checkpoint with the lowest eval loss when training ends
    # instead of keeping whatever the final epoch produced.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=False
)

# The Trainer ties together the model, the tokenized splits and the collator
# that pads each batch.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [12]:
# ======================================
# 8) Fine-Tune
# ======================================
# Run the fine-tuning loop configured above. As the last expression of the
# cell, the value returned by train() is displayed as the cell output.
trainer.train()


 55%|█████▍    | 56/102 [24:11<19:52, 25.92s/it]
 10%|▉         | 10/102 [00:48<07:05,  4.63s/it]

{'loss': 1.1097, 'grad_norm': 4.584842681884766, 'learning_rate': 4.5098039215686275e-05, 'epoch': 0.29}


 20%|█▉        | 20/102 [01:42<07:31,  5.50s/it]

{'loss': 1.0704, 'grad_norm': 4.315908432006836, 'learning_rate': 4.0196078431372555e-05, 'epoch': 0.59}


 29%|██▉       | 30/102 [02:28<05:14,  4.37s/it]

{'loss': 1.0973, 'grad_norm': 4.7568583488464355, 'learning_rate': 3.529411764705883e-05, 'epoch': 0.88}


                                                
 33%|███▎      | 34/102 [02:49<04:53,  4.32s/it]

{'eval_loss': 1.2899765968322754, 'eval_runtime': 3.7778, 'eval_samples_per_second': 2.118, 'eval_steps_per_second': 1.059, 'epoch': 1.0}


 39%|███▉      | 40/102 [03:17<04:44,  4.59s/it]

{'loss': 1.0201, 'grad_norm': 4.267385005950928, 'learning_rate': 3.0392156862745097e-05, 'epoch': 1.18}


 49%|████▉     | 50/102 [04:01<03:49,  4.41s/it]

{'loss': 0.9144, 'grad_norm': 4.710136890411377, 'learning_rate': 2.5490196078431373e-05, 'epoch': 1.47}


 59%|█████▉    | 60/102 [04:48<03:00,  4.31s/it]

{'loss': 0.8778, 'grad_norm': 5.829656600952148, 'learning_rate': 2.058823529411765e-05, 'epoch': 1.76}


                                                
 67%|██████▋   | 68/102 [05:27<02:26,  4.30s/it]

{'eval_loss': 1.278550386428833, 'eval_runtime': 3.7195, 'eval_samples_per_second': 2.151, 'eval_steps_per_second': 1.075, 'epoch': 2.0}


 69%|██████▊   | 70/102 [05:39<03:06,  5.83s/it]

{'loss': 0.9584, 'grad_norm': 4.02698278427124, 'learning_rate': 1.568627450980392e-05, 'epoch': 2.06}


 78%|███████▊  | 80/102 [06:22<01:37,  4.41s/it]

{'loss': 0.8315, 'grad_norm': 4.547756671905518, 'learning_rate': 1.0784313725490197e-05, 'epoch': 2.35}


 88%|████████▊ | 90/102 [07:07<00:52,  4.37s/it]

{'loss': 0.7833, 'grad_norm': 5.019215106964111, 'learning_rate': 5.882352941176471e-06, 'epoch': 2.65}


 98%|█████████▊| 100/102 [07:49<00:08,  4.22s/it]

{'loss': 0.8395, 'grad_norm': 4.409575462341309, 'learning_rate': 9.80392156862745e-07, 'epoch': 2.94}


                                                 
100%|██████████| 102/102 [08:04<00:00,  4.12s/it]

{'eval_loss': 1.304596185684204, 'eval_runtime': 3.886, 'eval_samples_per_second': 2.059, 'eval_steps_per_second': 1.029, 'epoch': 3.0}


100%|██████████| 102/102 [08:07<00:00,  4.78s/it]

{'train_runtime': 487.7477, 'train_samples_per_second': 0.418, 'train_steps_per_second': 0.209, 'train_loss': 0.9467497853671804, 'epoch': 3.0}





TrainOutput(global_step=102, training_loss=0.9467497853671804, metrics={'train_runtime': 487.7477, 'train_samples_per_second': 0.418, 'train_steps_per_second': 0.209, 'total_flos': 15948419235840.0, 'train_loss': 0.9467497853671804, 'epoch': 3.0})

In [13]:
# ======================================
# 9) Save Fine-Tuned Model
# ======================================
# Directory (relative to the notebook's working directory) that receives the
# fine-tuned artifacts.
save_dir = "bart_partial_entire_model"
# Save the trainer's model and the tokenizer into the same directory so that
# both can later be loaded from one place.
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("Fine-tuned BART model + tokenizer saved in:", save_dir)


Fine-tuned BART model + tokenizer saved in: bart_partial_entire_model


: 