In [None]:
# # Conversation Prediction with GPT-2


In [1]:
# ============================================
# 0) Install Requirements (if needed)
# ============================================
# In a fresh environment or Google Colab, you might need:
# !pip install transformers datasets accelerate pandas


In [21]:
# ============================================
# 1) Imports
# ============================================
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

print("All libraries imported successfully!")


All libraries imported successfully!


In [3]:
# ============================================
# 2) Load and Inspect the CSV
# ============================================
# We'll assume your CSV is named "conversations.csv" and has columns:
#   CONVERSATION_ID, CONVERSATION_STEP, TEXT, CONTEXT, LABEL
# We'll use only CONVERSATION_ID, CONVERSATION_STEP, TEXT to build partial->full pairs.

df = pd.read_csv("/Users/ashansubodha/Desktop/VOIP Vishing/conversation-prediction/FINAL_DATASET2.csv")
print("Data size:", len(df))


Data size: 967


In [4]:
(df.head(10))


Unnamed: 0,CONVERSATION_ID,CONVERSATION_STEP,TEXT,CONTEXT,LABEL,Unnamed: 5,Unnamed: 6
0,0,1,"Hello, this is [Your Name]'s personal assistan...",Standard opening exchange,neutral,,
1,0,2,"Hi, I'm Sam. I saw an ad about a photography w...",Expresses interest,neutral,,
2,0,3,"Hi Sam, it's great to hear of your interest in...",Assistant is open and encouraging,neutral,,
3,0,4,Thanks! I was wondering about the skill level ...,Addresses the concern directly,neutral,,
4,0,5,The workshop is designed to accommodate all sk...,Addresses the concern directly,neutral,,
5,0,6,That sounds perfect. What's the registration p...,Directs the conversation to registration,neutral,,
6,0,7,You can register through our website. I can gu...,Offers assistance options,neutral,,
7,0,8,A direct link would be great. Can you also tel...,requesting additional information,neutral,,
8,0,9,"Certainly, the fee for the workshop is $200, w...",Proactive in facilitating registration,neutral,,
9,0,10,"Sure, it's sam.photography@example.com.",Provides email address,neutral,,


In [6]:
# ============================================
# 3) Build (partial, full) Pairs
# ============================================
# We'll define a function that:
#   1) Groups by CONVERSATION_ID
#   2) Sorts by CONVERSATION_STEP
#   3) Takes the first X% of lines as "partial" -> entire conversation as "full"
#   4) Produces a single text: "partial [SEP] full"
# Because GPT-2 is a causal LM, it will learn that after the partial, the model 
# should continue with the full. We insert a special separator token (e.g., <SEP>) 
# so the model sees the boundary.

def build_partial_full_pairs(df, partial_ratio=0.5, sep_token="<SEP>"):
    pairs = []
    grouped = df.groupby("CONVERSATION_ID")

    for convo_id, group in grouped:
        group_sorted = group.sort_values("CONVERSATION_STEP")
        all_texts = group_sorted["TEXT"].tolist()
        
        # full conversation text
        full_convo = "\n".join(all_texts)
        
        # partial conversation text
        cutoff = max(1, int(len(all_texts) * partial_ratio))  # at least 1 line
        partial_texts = all_texts[:cutoff]
        partial_convo = "\n".join(partial_texts)
        
        # single text for GPT-2
        # e.g., "partial_convo <SEP> full_convo"
        # GPT-2 will see the partial and learn to continue with the full
        combined_text = partial_convo + f"\n{sep_token}\n" + full_convo
        
        pairs.append({"text": combined_text})
    
    return pd.DataFrame(pairs)

pairs_df = build_partial_full_pairs(df, partial_ratio=0.5, sep_token="<SEP>")
print("Number of conversation pairs:", len(pairs_df))


Number of conversation pairs: 76


In [7]:
pairs_df

Unnamed: 0,text
0,"Hello, this is [Your Name]'s personal assistan..."
1,"Hello, this is the personal assistant of [Your..."
2,"Hello, this is the assistant to [Your Name]. H..."
3,"Hello, this is the office of [Your Name]. How ..."
4,"Hello, you've reached the assistant for [Your ..."
...,...
71,"Hello, this is the personal assistant of [Your..."
72,"Hello, this is a call from the National Tax Se..."
73,"Hello, is this Mr. OOO?\nYes, this is him.\nHi..."
74,"This bond is under your name, not under our ba..."


In [8]:
# ============================================
# 4) Create a Hugging Face Dataset
# ============================================
dataset = Dataset.from_pandas(pairs_df[["text"]])

# Simple train-test split (80-20).
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset  = split_dataset["test"]

print("Train size:", len(train_dataset))
print("Eval size:", len(eval_dataset))
print("Example of train data:", train_dataset[0])


Train size: 60
Eval size: 16
Example of train data: {'text': "Hello, this is the assistant to [Your Name]. May I know the reason for your call?\nHi, I'm calling from 'QuickTech Support' about a critical security issue flagged on [Your Name]'s computer. Immediate action is required.\nCould you specify what kind of security issue has been detected?\nWe've detected multiple virus infections that could compromise personal data. We urgently need to run a remote clean-up procedure.\nBefore we proceed, could you provide me with details on when and how the issue was detected?\nOur system conducts regular sweeps and your IP address showed up in our latest scan. We're here to help resolve this immediately.\nI need to cross-verify this with our internal security protocols before any remote access is granted.\n<SEP>\nHello, this is the assistant to [Your Name]. May I know the reason for your call?\nHi, I'm calling from 'QuickTech Support' about a critical security issue flagged on [Your Name]'s co

In [22]:
# ============================================
# 5) Load GPT-2 Tokenizer
# ============================================
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 doesn't have a pad token, so let's set eos_token as pad
tokenizer.pad_token = tokenizer.eos_token

print("Vocab size:", len(tokenizer))


Vocab size: 50257


In [23]:
# ============================================
# 6) Preprocessing Function
# ============================================
# We'll tokenize the "text" in each row. This is just a single string:
# "partial_convo <SEP> full_convo"
# GPT-2 will learn that after the partial snippet, it should predict the rest.

max_length = 256

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length"  # or do dynamic padding in the data collator
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset  = eval_dataset.map(tokenize_function,  batched=True)

# remove original "text" column
train_dataset = train_dataset.remove_columns(["text"])
eval_dataset  = eval_dataset.remove_columns(["text"])

train_dataset.set_format("torch")
eval_dataset.set_format("torch")


Map:   0%|          | 0/60 [00:00<?, ? examples/s]


KeyError: 'text'

In [24]:
# ============================================
# 7) Data Collator
# ============================================
# For GPT-2, we do standard causal language modeling:
#   DataCollatorForLanguageModeling with mlm=False

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT-2 is a causal LM
)


In [25]:
# ============================================
# 8) Load GPT-2 Model
# ============================================
model = GPT2LMHeadModel.from_pretrained(model_name)

# Because GPT-2 doesn't have a pad token by default, set pad_token_id to eos_token_id
model.config.pad_token_id = model.config.eos_token_id
model = model.to("cuda" if torch.cuda.is_available() else "cpu")


In [14]:
# ============================================
# 9) Training Arguments
# ============================================
training_args = TrainingArguments(
    output_dir="gpt2-conversation",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

print(training_args)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_



In [15]:
# ============================================
# 10) Trainer
# ============================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)


In [17]:
# ============================================
# 11) Train the Model
# ============================================
trainer.train()

# Save the final model
trainer.save_model("gpt2-conversation")
tokenizer.save_pretrained("gpt2-conversation")


                                               
 33%|███▎      | 30/90 [00:14<00:17,  3.34it/s]

{'eval_loss': 4.1788105964660645, 'eval_runtime': 0.6114, 'eval_samples_per_second': 26.171, 'eval_steps_per_second': 13.085, 'epoch': 1.0}


 56%|█████▌    | 50/90 [00:22<00:13,  2.97it/s]

{'loss': 4.0167, 'grad_norm': 40266.29296875, 'learning_rate': 2.2222222222222223e-05, 'epoch': 1.67}


                                               
 67%|██████▋   | 60/90 [00:26<00:09,  3.25it/s]

{'eval_loss': 5.129263877868652, 'eval_runtime': 0.6361, 'eval_samples_per_second': 25.153, 'eval_steps_per_second': 12.576, 'epoch': 2.0}


                                               
100%|██████████| 90/90 [00:40<00:00,  3.33it/s]

{'eval_loss': 5.7340264320373535, 'eval_runtime': 0.6356, 'eval_samples_per_second': 25.173, 'eval_steps_per_second': 12.587, 'epoch': 3.0}


100%|██████████| 90/90 [00:41<00:00,  2.16it/s]


{'train_runtime': 41.7112, 'train_samples_per_second': 4.315, 'train_steps_per_second': 2.158, 'train_loss': 5.142096455891927, 'epoch': 3.0}


('gpt2-conversation/tokenizer_config.json',
 'gpt2-conversation/special_tokens_map.json',
 'gpt2-conversation/vocab.json',
 'gpt2-conversation/merges.txt',
 'gpt2-conversation/added_tokens.json')

In [26]:
# ============================================
# 12) Conversation Generation (Inference)
# ============================================
# We'll define a function that, given a partial snippet,
# uses GPT-2 to generate the rest. We'll rely on the fact 
# that GPT-2 was trained to continue text after the partial 
# snippet and <SEP>.

def generate_conversation(partial_text, sep_token="<SEP>", max_length=100):
    model.eval()
    device = model.device
    
    # We'll provide "partial_text + SEP" as the prompt
    prompt = partial_text + f"\n{sep_token}\n"
    
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    
    # Generate
    # You can do sampling, beam search, etc. We'll do a simple sample approach
    # or set do_sample=False for greedy.
    outputs = model.generate(
        inputs,
        max_length=len(inputs[0]) + max_length,
        num_beams=1,           # or do_sample=True
        top_p=0.9,
        temperature=0.8,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    
    generated_tokens = outputs[0].tolist()
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    
    # The model will produce both partial_text + <SEP> + rest, 
    # so we might want to split out the portion after <SEP>.
    if sep_token in generated_text:
        # e.g., separate at the SEP token
        after_sep = generated_text.split(sep_token, 1)[1]
        # strip leading newlines/spaces
        completion = after_sep.strip()
        return completion
    else:
        return generated_text




In [27]:
partial_conversation = "Good Morning, I am Sanuja calling on behalf of State Bank of Sri Lanka. Oh, hi. I'm actually in a meeting right now. Could you call later?"
completion = generate_conversation(partial_conversation, sep_token="<SEP>", max_length=80)
print("=== Generated Continuation ===")
print(completion)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


=== Generated Continuation ===
SEPA: I'll call you later.
(The phone rings)
SRIBA: Hi, Sanju. It's me. You're in the meeting. What's going on? What are you doing? I don't know. Is there anything I can do? Is it okay to talk to me? Are you okay? Do you have any questions? (He leaves) I
