# # GPT-2 Conversation Generation Example


In [22]:
# =========================================
# 1) Install and Import Dependencies
# =========================================
# In a fresh environment (e.g. Google Colab), you may need:
# !pip install transformers datasets accelerate

import os

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

In [23]:
# =========================================
# 2) Load and Prepare the Dataset
# =========================================
# We assume you have a CSV with columns:
# CONVERSATION_ID, CONVERSATION_STEP, TEXT, CONTEXT, LABEL (and so on)
# For GPT-2 generation, we'll focus on grouping the TEXT by CONVERSATION_ID
# to get the entire conversation in one sequence.

# Location of the conversation CSV. Set the CONVERSATION_CSV environment variable
# to point at your own copy; the fallback is the path this notebook was authored
# with, so existing setups keep working unchanged.
DATASET_CSV = os.environ.get(
    "CONVERSATION_CSV",
    "/Users/ashansubodha/Desktop/VOIP Vishing/conversation-prediction/FINAL_DATASET2.csv",
)

df = pd.read_csv(DATASET_CSV)

# Fail fast with a clear message if the columns used by the conversation-building
# step (CONVERSATION_ID, CONVERSATION_STEP, TEXT) are not present in the file.
_required_columns = {"CONVERSATION_ID", "CONVERSATION_STEP", "TEXT"}
_missing_columns = _required_columns - set(df.columns)
if _missing_columns:
    raise ValueError(
        f"{DATASET_CSV} is missing required columns: {sorted(_missing_columns)}"
    )

# Let's see the columns
# print(df.head())

# Example columns might be:
# CONVERSATION_ID | CONVERSATION_STEP | TEXT | CONTEXT | LABEL

In [24]:
# Display the loaded frame (notebook rich repr) to inspect its columns and rows.
df

Unnamed: 0,CONVERSATION_ID,CONVERSATION_STEP,TEXT,CONTEXT,LABEL,Unnamed: 5,Unnamed: 6
0,0,1,"Hello, this is [Your Name]'s personal assistan...",Standard opening exchange,neutral,,
1,0,2,"Hi, I'm Sam. I saw an ad about a photography w...",Expresses interest,neutral,,
2,0,3,"Hi Sam, it's great to hear of your interest in...",Assistant is open and encouraging,neutral,,
3,0,4,Thanks! I was wondering about the skill level ...,Addresses the concern directly,neutral,,
4,0,5,The workshop is designed to accommodate all sk...,Addresses the concern directly,neutral,,
...,...,...,...,...,...,...,...
962,75,27,Understood. We’ll try to expedite the process.,Acknowledging and offering assistance,neutral,,
963,75,28,"If I delete the app, I won’t get messages from...",Clarifying concerns about app deletion,neutral,,
964,75,29,"No, deleting the app won’t affect our communic...",Reassuring customer,neutral,,
965,75,30,"Alright, proceed and call me back in 10 minutes.",Requesting follow-up,neutral,,


In [25]:
# Group the turn-level rows by conversation id; each group holds one conversation's rows.
grouped = df.groupby("CONVERSATION_ID") 

In [26]:
# Sanity check: print every conversation id present in the grouped data.
# Only the group key is used here; the per-conversation rows are ignored.
for convo_id, group in grouped:
    print(convo_id)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75


In [27]:
# =========================================
# 3) Convert Each Conversation to a Single String
# =========================================
# We'll group by CONVERSATION_ID, sort by CONVERSATION_STEP,
# and concatenate TEXT into one big string (with speaker labels if desired).

def build_full_conversation_text(df):
    """Collapse a turn-level table into one multi-line string per conversation.

    Args:
        df: DataFrame with at least the columns CONVERSATION_ID,
            CONVERSATION_STEP and TEXT (one row per conversation turn).

    Returns:
        list[str]: one entry per distinct CONVERSATION_ID, in the key order
        produced by ``groupby``. Each entry contains that conversation's TEXT
        values ordered by CONVERSATION_STEP, separated by a newline, with
        leading/trailing whitespace stripped. An empty frame yields ``[]``.

    Notes:
        Rows whose TEXT is missing (NaN/None) are skipped instead of raising
        ``TypeError``; non-string TEXT values are converted with ``str``.
    """
    conversation_texts = []

    for _, turns in df.groupby("CONVERSATION_ID"):
        # mergesort is a stable sort, so turns that share a CONVERSATION_STEP
        # keep their original row order instead of an arbitrary one.
        ordered_turns = turns.sort_values(by="CONVERSATION_STEP", kind="mergesort")

        # Optionally, speaker labels could be prefixed here, e.g.
        #   f"{row['CONTEXT']} ({row['LABEL']}): {row['TEXT']}"
        # For now only the raw TEXT is used.
        utterances = [
            str(text) for text in ordered_turns["TEXT"] if pd.notna(text)
        ]
        conversation_texts.append("\n".join(utterances).strip())

    return conversation_texts

# Build one string per conversation from the loaded frame.
conversation_list = build_full_conversation_text(df)
print(f"Number of full conversations: {len(conversation_list)}")
# Preview only the first 500 characters of the first conversation to keep the output short.
print("Sample conversation:\n", conversation_list[0][:500], "...")

Number of full conversations: 76
Sample conversation:
 Hello, this is [Your Name]'s personal assistant. How may I assist you today?
Hi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.
Hi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.
Thanks! I was wondering about the skill level required for participants. I'm fairly new to photography.
The workshop is designed to accommodate all skill leve ...


In [28]:
# Inspect the full list of conversation strings (the output can be very long).
conversation_list

["Hello, this is [Your Name]'s personal assistant. How may I assist you today?\nHi, I'm Sam. I saw an ad about a photography workshop hosted by [Org Name] next month. I'm interested in registering but had a few questions.\nHi Sam, it's great to hear of your interest in the photography workshop. I'd be happy to help with any questions you have.\nThanks! I was wondering about the skill level required for participants. I'm fairly new to photography.\nThe workshop is designed to accommodate all skill levels, from beginners to more experienced photographers. [Org Name] aims to ensure everyone can learn and grow, regardless of their starting point.\nThat sounds perfect. What's the registration process?\nYou can register through our website. I can guide you through the steps if you'd like, or send you a direct link to the registration page.\nA direct link would be great. Can you also tell me about the workshop fee?\nCertainly, the fee for the workshop is $200, which includes all materials and

In [29]:
# =========================================
# 4) Create a Hugging Face Dataset
# =========================================
# We now have a list of strings, each representing a full conversation.
# GPT-2 is a language model, so we'll treat each conversation as one "example".

# Single-column mapping: every conversation string goes under the "text" key,
# which becomes the dataset's only feature.
dataset_dict = {
    "text": conversation_list
}

# Convert to a Hugging Face Dataset
conversation_dataset = Dataset.from_dict(dataset_dict)
# Hold out 20% of the conversations for evaluation. The seed is fixed (42),
# presumably so the split is the same from run to run.
split_dataset = conversation_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset  = split_dataset["test"]

print("Train examples:", len(train_dataset))
print("Eval examples:", len(eval_dataset))
# Prints the whole first training conversation (no truncation is applied here).
print("Sample train example:\n", train_dataset[0]["text"])


Train examples: 60
Eval examples: 16
Sample train example:
 Hello, this is the assistant to [Your Name]. May I know the reason for your call?
Hi, I'm calling from 'QuickTech Support' about a critical security issue flagged on [Your Name]'s computer. Immediate action is required.
Could you specify what kind of security issue has been detected?
We've detected multiple virus infections that could compromise personal data. We urgently need to run a remote clean-up procedure.
Before we proceed, could you provide me with details on when and how the issue was detected?
Our system conducts regular sweeps and your IP address showed up in our latest scan. We're here to help resolve this immediately.
I need to cross-verify this with our internal security protocols before any remote access is granted.
I understand your concern, but we need to act fast to prevent data loss. We're offering this complimentary service as a part of our customer care initiative.
While we appreciate the assistance, I mus

In [30]:
# Inspect the training split's features and row count.
train_dataset

Dataset({
    features: ['text'],
    num_rows: 60
})

In [31]:
# =========================================
# 5) Tokenize the Text for GPT-2
# =========================================
# We'll use the 'gpt2' tokenizer, which has a pretrained vocabulary.

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 doesn't have a pad token by default; let's point it to the EOS token:
tokenizer.pad_token = tokenizer.eos_token

# Define a preprocessing function for language modeling
max_length = 512  # Adjust as needed depending on GPU memory

def tokenize_function(examples):
    """Tokenize a batch of conversations for causal language modeling.

    Args:
        examples: a batch from ``Dataset.map(..., batched=True)``; its "text"
            entry holds the conversation strings to tokenize.

    Returns:
        The tokenizer output for the batch, with every sequence truncated to
        at most ``max_length`` tokens and left unpadded.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        # We won't pad here; we'll let the data collator handle dynamic padding
    )

# Always tokenize from the untouched splits in `split_dataset` rather than from
# `train_dataset` / `eval_dataset` themselves. The first run gives the same
# result, and the cell can now be re-run safely: mapping the already-tokenized
# datasets again would fail because their "text" column has been removed.
train_dataset = split_dataset["train"].map(tokenize_function, batched=True, remove_columns=["text"])
eval_dataset  = split_dataset["test"].map(tokenize_function,  batched=True, remove_columns=["text"])

# We'll use a data collator for language modeling that can pad dynamically
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # because GPT-2 is a causal LM
)


Map: 100%|██████████| 60/60 [00:00<00:00, 468.50 examples/s]
Map: 100%|██████████| 16/16 [00:00<00:00, 490.47 examples/s]


In [32]:
# Inspect the training split's features and row count again at this point in the notebook.
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 60
})

In [33]:
# =========================================
# 6) Load the GPT-2 LM Head Model
# =========================================
# Load the pretrained "gpt2" checkpoint with its language-modeling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Because GPT-2 does not have a real pad token, we need to set:
# (this mirrors the tokenizer setup, which also uses EOS as the pad token)
model.config.pad_token_id = model.config.eos_token_id


In [34]:
# =========================================
# 7) Training Arguments
# =========================================
# We'll fine-tune GPT-2 with a small set of hyperparams for demo.

# NOTE: `eval_strategy` is the current name of this option (the printed
# arguments below report `eval_strategy=IntervalStrategy.STEPS`); the older
# `evaluation_strategy` spelling is deprecated. transformers releases that
# predate the rename only accept the old keyword.
training_args = TrainingArguments(
    output_dir="gpt2-conversation-model",
    overwrite_output_dir=True,
    num_train_epochs=2,  # try more epochs with real data
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="steps",  # evaluate every `eval_steps` optimizer steps
    eval_steps=50,
    save_steps=50,
    logging_steps=50,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # Use FP16 if GPU is available
    push_to_hub=False,  # or True if you want to push to HF Hub
)

print(training_args)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=50,
eval_strategy=IntervalStrategy.STEPS,
eval_us



In [35]:
# =========================================
# 8) Initialize Trainer
# =========================================
# Bundle the model, the training arguments, both tokenized splits, and the
# causal-LM data collator into a single Trainer object.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)


In [36]:
# =========================================
# 9) Train the Model
# =========================================
# Run fine-tuning with the arguments and datasets the Trainer was built with.
trainer.train()

# After training finishes, we can save the final model
trainer.save_model("gpt2-conversation-finetuned")
# Save the tokenizer into the same directory so model and tokenizer can both
# be reloaded from that one path.
tokenizer.save_pretrained("gpt2-conversation-finetuned")


 83%|████████▎ | 50/60 [00:21<00:03,  2.67it/s]

{'loss': 2.6395, 'grad_norm': 11.529086112976074, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.67}


                                               
 83%|████████▎ | 50/60 [00:22<00:03,  2.67it/s]

{'eval_loss': 2.459791660308838, 'eval_runtime': 1.1939, 'eval_samples_per_second': 13.401, 'eval_steps_per_second': 6.701, 'epoch': 1.67}


100%|██████████| 60/60 [00:29<00:00,  2.05it/s]


{'train_runtime': 29.3183, 'train_samples_per_second': 4.093, 'train_steps_per_second': 2.047, 'train_loss': 2.593076992034912, 'epoch': 2.0}


('gpt2-conversation-finetuned/tokenizer_config.json',
 'gpt2-conversation-finetuned/special_tokens_map.json',
 'gpt2-conversation-finetuned/vocab.json',
 'gpt2-conversation-finetuned/merges.txt',
 'gpt2-conversation-finetuned/added_tokens.json')

In [18]:
# Load model & tokenizer if needed
# Reload from the "gpt2-conversation-finetuned" directory. Note that this
# rebinds the notebook-level `model` and `tokenizer` names to the reloaded
# objects, so `model` is no longer the object held by `trainer`.
model = GPT2LMHeadModel.from_pretrained("gpt2-conversation-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-conversation-finetuned")

In [19]:
# model.cuda()  # if using GPU

AssertionError: Torch not compiled with CUDA enabled

In [37]:
# =========================================
# 10) Inference / Generation
# =========================================
# Now, let's see how we might generate the rest of a conversation
# given a partial conversation as a prompt.

partial_conversation = """Good Morning, I am Sanuja calling on behalf of State Bank of Sri Lanka. Oh, hi. I'm actually in a meeting right now. Could you call later?
"""

# Encode the partial conversation.
# Calling the tokenizer (rather than .encode) also returns the attention mask,
# which is passed to generate() explicitly because the pad token is the same
# id as EOS and so cannot be used to tell padding apart from real tokens.
# The tensors are moved to the device of `model`, the object generate() is
# called on. `trainer.model` may be a different object on a different device
# once `model` has been reloaded from disk.
encoded_prompt = tokenizer(partial_conversation, return_tensors="pt").to(model.device)
input_ids = encoded_prompt["input_ids"]

# Generate up to some max length
# NOTE: `max_length` counts the prompt tokens as well as the generated ones.
max_gen_length = 100  # try more if you want a longer completion

# Inference mode: disables dropout in case the model was left in training mode.
model.eval()

# We can use various decoding strategies: greedy, beam search, sampling, etc.
output_ids = model.generate(
    input_ids,
    attention_mask=encoded_prompt["attention_mask"],
    max_length=max_gen_length,
    do_sample=True,       # sample instead of greedy
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id  # because GPT-2 needs an EOS token for padding
)

# Decode the single returned sequence (prompt + continuation) back to text.
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("=== Generated Conversation Continuation ===")
print(generated_text)


=== Generated Conversation Continuation ===
Good Morning, I am Sanuja calling on behalf of State Bank of Sri Lanka. Oh, hi. I'm actually in a meeting right now. Could you call later?
Yes, it's late today, I'm calling on behalf of the State Bank of Sri Lanka.
I'm also calling to confirm your payment details and the amount.
Thank you.
Let's proceed now.
I'll wait for your call.
You can call again within an hour or two,
