In [13]:
"""
Jupyter Notebook:
Fine-Tuning GPT-2 for Conversation Prediction (Partial -> Remainder).

We will:
1) Load a CSV with columns: CONVERSATION_ID, CONVERSATION_STEP, TEXT
2) Build partial->remainder pairs for each conversation
3) Create a single text = partial + <SEP> + remainder
4) Train GPT-2 (causal LM) on these examples
5) Save the fine-tuned model & tokenizer for reuse
"""

# If needed, install:
# !pip install transformers datasets accelerate sentencepiece


'\nJupyter Notebook:\nFine-Tuning GPT-2 for Conversation Prediction (Partial -> Remainder).\n\nWe will:\n1) Load a CSV with columns: CONVERSATION_ID, CONVERSATION_STEP, TEXT\n2) Build partial->remainder pairs for each conversation\n3) Create a single text = partial + <SEP> + remainder\n4) Train GPT-2 (causal LM) on these examples\n5) Save the fine-tuned model & tokenizer for reuse\n'

In [14]:
# =========================================
# 1) Imports
# =========================================
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import math, os

# Hugging Face
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


In [16]:
# =========================================
# 2) Build partial->remainder from CSV
# =========================================
def build_partial_remainder(csv_path, partial_ratio=0.5):
    """
    partial_ratio <--- param to tune how many lines go into partial snippet
    e.g. partial_ratio=0.5 means first 50% lines of the conversation = partial,
                              last 50% = remainder
    """
    df = pd.read_csv(csv_path)
    pairs = []
    for convo_id, group in df.groupby("CONVERSATION_ID"):
        group_sorted = group.sort_values("CONVERSATION_STEP")
        texts = group_sorted["TEXT"].tolist()
        if len(texts) < 2:
            continue
        cutoff = max(1, int(len(texts)*partial_ratio))
        partial_list = texts[:cutoff]
        remainder_list = texts[cutoff:]
        partial_str = "\n".join(partial_list).strip()
        remainder_str = "\n".join(remainder_list).strip() if remainder_list else ""
        if partial_str and remainder_str:
            pairs.append((partial_str, remainder_str))
    return pairs

# Example usage (update the path):
csv_path = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv"  
partial_ratio = 0.5         # <--- param to tune
data_pairs = build_partial_remainder(csv_path, partial_ratio)
print("Number of partial->remainder pairs:", len(data_pairs))
if data_pairs:
    print("Sample pair:\nPartial:", data_pairs[0][0], "\nRemainder:", data_pairs[0][1])


Number of partial->remainder pairs: 76
Sample pair:
Partial: Hello, this is [Your Name]'s personal assistant. How may I assist you today?
Hi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.
Hi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.
Thanks! I was wondering about the skill level required for participants. I'm fairly new to photography.
The workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.
That sounds perfect. What's the registration process? 
Remainder: You can register through our website. I can guide you through the steps if you'd like, or send you a direct link to the registration page.
A direct link would be great. Can you also tell me about the workshop fee?
Certainly, the

In [17]:
# =========================================
# 3) Create a single text with <|SEP|>
# =========================================
# We'll combine partial + special sep + remainder into one string.
# GPT-2 will treat partial snippet as context, and remainder as the next tokens to predict.

def make_single_text(partial, remainder, sep_token="<|SEP|>"):
    """
    partial + sep_token + remainder
    e.g. "Hello.\nCallee: I saw your ad.<|SEP|>This is the remainder..."
    """
    return f"{partial}\n{sep_token}\n{remainder}"

class ConversationDataset(Dataset):
    """
    We'll store the final strings. Each item is a single text line
    that includes partial+SEP+remainder for GPT-2.
    """
    def __init__(self, pairs, sep_token="<|SEP|>"):
        self.texts = []
        for (part, rem) in pairs:
            combined = make_single_text(part, rem, sep_token=sep_token)
            self.texts.append(combined)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]


In [18]:
# =========================================
# 4) GPT-2 Tokenizer
# =========================================
# We'll add a custom <SEP> token if we want GPT-2 to treat it specially.
# If you want to treat <|SEP|> as just normal text, skip special token addition.
# We'll do it as a new token for clarity.

model_name = "gpt2"  # <--- param: choose "gpt2-medium", "gpt2-large" if you have bigger GPU
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# We can add "<|SEP|>" as a special token
special_tokens = {"sep_token": "<|SEP|>"}
if "<|SEP|>" not in tokenizer.get_vocab():
    tokenizer.add_special_tokens({"additional_special_tokens": ["<|SEP|>"]})
    print("Added <|SEP|> to GPT-2 tokenizer vocab.")
    
# GPT-2 doesn't have a real pad token, so let's set pad to eos
tokenizer.pad_token = tokenizer.eos_token
print("Vocabulary size:", len(tokenizer))


Added <|SEP|> to GPT-2 tokenizer vocab.
Vocabulary size: 50258


In [19]:
# =========================================
# 5) Build the Dataset
# =========================================
ds = ConversationDataset(data_pairs, sep_token="<|SEP|>")
print("Dataset length:", len(ds))
if len(ds)>0:
    print("Example text:", ds[0])


Dataset length: 76
Example text: Hello, this is [Your Name]'s personal assistant. How may I assist you today?
Hi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.
Hi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.
Thanks! I was wondering about the skill level required for participants. I'm fairly new to photography.
The workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.
That sounds perfect. What's the registration process?
<|SEP|>
You can register through our website. I can guide you through the steps if you'd like, or send you a direct link to the registration page.
A direct link would be great. Can you also tell me about the workshop fee?
Certainly, the fee for the workshop is $200, w

In [23]:
# We'll do a train/val split
train_size = int(0.9 * len(ds))  # <--- param to tune train vs val ratio
eval_size  = len(ds) - train_size
train_ds, eval_ds = torch.utils.data.random_split(ds, [train_size, eval_size])

print("Train size:", len(train_ds))
print("Eval size :", len(eval_ds))


Train size: 68
Eval size : 8


In [24]:
# =========================================
# 6) Data Collator for Causal LM
# =========================================
# We'll tokenize inside the collator, then do standard next-token prediction ignoring pad.

class ConversationDataCollator:
    def __init__(self, tokenizer, max_length=128):  # <--- param: max_length
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        # 'examples' is a list of strings from our dataset
        encoding = self.tokenizer(
            examples,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # For GPT-2, input_ids are also labels for causal LM
        encoding["labels"] = encoding["input_ids"].clone()
        return encoding

max_length = 128  # <--- param: you can tune
collator = ConversationDataCollator(tokenizer, max_length)


In [25]:
# Quick check with one batch
example_batch = [ds[0], ds[1]]
encoded = collator(example_batch)
for k,v in encoded.items():
    print(k, v.shape)

input_ids torch.Size([2, 128])
attention_mask torch.Size([2, 128])
labels torch.Size([2, 128])


In [26]:
# =========================================
# 7) Create DataLoaders or Use Trainer Directly
# =========================================
# We'll rely on Hugging Face Trainer, so we define HuggingFace Datasets

from datasets import Dataset as HFDataset

train_texts = [train_ds[i] for i in range(len(train_ds))]
eval_texts  = [eval_ds[i]  for i in range(len(eval_ds))]

train_hf = HFDataset.from_dict({"text": train_texts})
eval_hf  = HFDataset.from_dict({"text": eval_texts})

print(train_hf, eval_hf)


Dataset({
    features: ['text'],
    num_rows: 68
}) Dataset({
    features: ['text'],
    num_rows: 8
})


In [27]:
# We'll define a tokenize function for usage with map
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=max_length
    )

train_hf = train_hf.map(tokenize_function, batched=True, remove_columns=["text"])
eval_hf  = eval_hf.map(tokenize_function,  batched=True, remove_columns=["text"])

train_hf = train_hf.map(lambda x: {"labels": x["input_ids"]}, batched=True)
eval_hf  = eval_hf.map(lambda x: {"labels": x["input_ids"]},  batched=True)

train_hf.set_format("torch")
eval_hf.set_format("torch")


Map: 100%|██████████| 68/68 [00:00<00:00, 187.87 examples/s]
Map: 100%|██████████| 8/8 [00:00<00:00, 151.41 examples/s]
Map: 100%|██████████| 68/68 [00:00<00:00, 4452.00 examples/s]
Map: 100%|██████████| 8/8 [00:00<00:00, 2333.90 examples/s]


In [28]:
# =========================================
# 8) Load GPT-2 Model
# =========================================
model = GPT2LMHeadModel.from_pretrained(model_name)

# If we added special tokens (like <|SEP|>), we should resize embeddings
model.resize_token_embeddings(len(tokenizer))

model.to(device)
model.train()

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [29]:
# =========================================
# 9) Training Arguments
# =========================================
num_epochs = 2       
train_batch_size = 2  
lr = 5e-5             

training_args = TrainingArguments(
    output_dir="gpt2_conversation_predict",
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=2,
    learning_rate=lr,
    weight_decay=0.01,
    evaluation_strategy="epoch", 
    save_strategy="epoch",
    logging_steps=10,
    push_to_hub=False
)




In [30]:
# =========================================
# 10) Trainer
# =========================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hf,
    eval_dataset=eval_hf,
    tokenizer=tokenizer,
    data_collator=default_data_collator  # or you can pass the custom collator if you want
)

trainer.train()


  trainer = Trainer(
 15%|█▍        | 10/68 [00:39<02:53,  2.98s/it]

{'loss': 3.0631, 'grad_norm': 13.99717903137207, 'learning_rate': 4.2647058823529415e-05, 'epoch': 0.29}


 29%|██▉       | 20/68 [01:07<02:09,  2.71s/it]

{'loss': 2.8709, 'grad_norm': 14.305420875549316, 'learning_rate': 3.529411764705883e-05, 'epoch': 0.59}


 44%|████▍     | 30/68 [01:34<01:41,  2.66s/it]

{'loss': 2.555, 'grad_norm': 13.082621574401855, 'learning_rate': 2.7941176470588236e-05, 'epoch': 0.88}


                                               
 50%|█████     | 34/68 [01:47<01:31,  2.70s/it]

{'eval_loss': 2.6475071907043457, 'eval_runtime': 1.9891, 'eval_samples_per_second': 4.022, 'eval_steps_per_second': 2.011, 'epoch': 1.0}


 59%|█████▉    | 40/68 [02:07<01:24,  3.03s/it]

{'loss': 2.3386, 'grad_norm': 15.284363746643066, 'learning_rate': 2.058823529411765e-05, 'epoch': 1.18}


 74%|███████▎  | 50/68 [02:34<00:48,  2.68s/it]

{'loss': 2.2335, 'grad_norm': 12.967528343200684, 'learning_rate': 1.323529411764706e-05, 'epoch': 1.47}


 88%|████████▊ | 60/68 [03:00<00:20,  2.59s/it]

{'loss': 2.1923, 'grad_norm': 14.030877113342285, 'learning_rate': 5.882352941176471e-06, 'epoch': 1.76}


                                               
100%|██████████| 68/68 [03:26<00:00,  2.66s/it]

{'eval_loss': 2.5611648559570312, 'eval_runtime': 2.0186, 'eval_samples_per_second': 3.963, 'eval_steps_per_second': 1.982, 'epoch': 2.0}


100%|██████████| 68/68 [03:29<00:00,  3.08s/it]

{'train_runtime': 209.4546, 'train_samples_per_second': 0.649, 'train_steps_per_second': 0.325, 'train_loss': 2.490679151871625, 'epoch': 2.0}





TrainOutput(global_step=68, training_loss=2.490679151871625, metrics={'train_runtime': 209.4546, 'train_samples_per_second': 0.649, 'train_steps_per_second': 0.325, 'total_flos': 8883929088000.0, 'train_loss': 2.490679151871625, 'epoch': 2.0})

In [31]:
# =========================================
# 11) Save the Finetuned Model + Tokenizer
# =========================================
save_dir = "gpt2_conversation_predict_model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Model + tokenizer saved to {save_dir}")


Model + tokenizer saved to gpt2_conversation_predict_model


In [32]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_dir = "gpt2_conversation_predict_model"
model = GPT2LMHeadModel.from_pretrained(model_dir).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)

prompt_partial = "Hello, I'm Sam. I saw an ad about a photography workshop."
inputs = tokenizer(prompt_partial, return_tensors="pt").to(device)
output_ids = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=False 
)

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Partial:", prompt_partial)
print("Completion:", generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Partial: Hello, I'm Sam. I saw an ad about a photography workshop.
Completion: Hello, I'm Sam. I saw an ad about a photography workshop. How can I help you?
I'm calling from the photography workshop. We're offering a free workshop to help you improve your photography skills.
I'm sure you've heard about the workshop. How can I help you?
I'm calling


### Test Model