In [1]:
!pip install datasets -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m307.2/491.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dep

In [7]:
# === 1) Setup & Imports ===
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    EncoderDecoderModel,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
import openai
import torch

# === 2) Load PersonaChat & Sample 2k Examples ===
dataset = load_dataset("bavard/personachat_truecased")

max_train_samples = 1800
max_val_samples   = 200

def format_example(ex):
    history = ex["history"]
    input_text  = "\n".join(history)
    target_text = ex["candidates"][0]
    return {"input_text": input_text, "target_text": target_text}

train_raw = dataset["train"].select(range(max_train_samples)).map(format_example)
val_raw   = dataset["validation"].select(range(max_val_samples)).map(format_example)

# === 3) Initialize BERT2BERT Summarizer ===
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased",
    "bert-base-uncased"
)

# tie embeddings and set special tokens
model.config.tie_encoder_decoder      = True
model.config.is_encoder_decoder       = True
model.config.decoder_start_token_id   = tokenizer.cls_token_id
model.config.bos_token_id             = tokenizer.cls_token_id
model.config.eos_token_id             = tokenizer.sep_token_id
model.config.pad_token_id             = tokenizer.pad_token_id

# === 4) Tokenization ===
max_input_length  = 512
max_target_length = 64

def tokenize_fn(ex):
    enc = tokenizer(
        ex["input_text"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )
    dec = tokenizer(
        ex["target_text"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )
    enc["labels"] = dec["input_ids"]
    return enc

train_ds = train_raw.map(tokenize_fn, remove_columns=train_raw.column_names)
val_ds   = val_raw.map(tokenize_fn,   remove_columns=val_raw.column_names)

# === 5) Trainer & TrainingArguments ===
training_args = TrainingArguments(
    output_dir="./bert2bert_personachat",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# === 6) Fine‑tune Summarizer ===
trainer.train()
model.save_pretrained("bert2bert_personachat_finetuned")
tokenizer.save_pretrained("bert2bert_personachat_finetuned")

Repo card metadata block was not found. Setting CardData to empty.
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.we

Step,Training Loss
100,2.7904
200,1.1567
300,1.0161
400,0.9627
500,0.9087
600,0.865
700,0.8642
800,0.8505
900,0.8095
1000,0.786




('bert2bert_personachat_finetuned/tokenizer_config.json',
 'bert2bert_personachat_finetuned/special_tokens_map.json',
 'bert2bert_personachat_finetuned/vocab.txt',
 'bert2bert_personachat_finetuned/added_tokens.json')

In [10]:
# === 7) Load Summarizer & Set OpenAI Key ===
openai.api_key = "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"  # ← replace with your key

summ_tokenizer = BertTokenizer.from_pretrained("bert2bert_personachat_finetuned")
summ_model     = EncoderDecoderModel.from_pretrained("bert2bert_personachat_finetuned")

# ** Ensure special tokens are set on loaded model **
summ_model.config.is_encoder_decoder       = True
summ_model.config.decoder_start_token_id   = summ_tokenizer.cls_token_id
summ_model.config.eos_token_id             = summ_tokenizer.sep_token_id
summ_model.config.pad_token_id             = summ_tokenizer.pad_token_id

# (Optional) verify
print("decoder_start_token_id:", summ_model.config.decoder_start_token_id)
print("bos_token_id:          ", summ_model.config.bos_token_id)
print("eos_token_id:          ", summ_model.config.eos_token_id)
print("pad_token_id:          ", summ_model.config.pad_token_id)

# === 8) Summarization Function ===
def summarize_conversation_bert(history):
    text = "\n".join(history)
    inputs = summ_tokenizer(
        text,
        max_length=512,
        truncation=True,
        return_tensors="pt"
    )
    summary_ids = summ_model.generate(
        inputs["input_ids"],
        decoder_start_token_id=summ_model.config.decoder_start_token_id,
        bos_token_id=summ_model.config.bos_token_id,
        eos_token_id=summ_model.config.eos_token_id,
        pad_token_id=summ_model.config.pad_token_id,
        max_length=64,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )
    return summ_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# === 9) ChatGPT‑3.5 Response Generator (v1.0+ API) ===
def generate_response_gpt35(summary, user_input):
    prompt = (
        f"Conversation summary:\n{summary}\n\n"
        f"User just said: \"{user_input}\"\n"
        "Bot reply:"
    )
    resp = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=150,
    )
    return resp.choices[0].message.content.strip()


# === 10) Run the Prompt‑Based Chatbot ===
def run_chatbot():
    print("Bot: Hello! (type 'exit' to quit)")
    history = ["Bot: Hello!"]
    while True:
        user_input = input("User: ")
        if user_input.lower() in ("exit", "quit", "bye"):
            print("Bot: Goodbye!")
            break

        history.append(f"User: {user_input}")

        # summarize with our fine‑tuned BERT2BERT
        summary  = summarize_conversation_bert(history)
        # generate next turn via GPT‑3.5
        bot_reply = generate_response_gpt35(summary, user_input)

        print(f"Bot: {bot_reply}")
        history.append(f"Bot: {bot_reply}")

if __name__ == "__main__":
    run_chatbot()

The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']


decoder_start_token_id: 101
bos_token_id:           101
eos_token_id:           102
pad_token_id:           0
Bot: Hello! (type 'exit' to quit)
User: hi
Bot: Hi there! How are you doing today?
User: I am sick
Bot: I'm sorry to hear that. I hope you feel better soon.
User: I am hungry
Bot: I'm sorry to hear that you're hungry. Have you had a chance to eat yet today?
User: no
Bot: I'm sorry to hear that. Is there anything I can do to help?
User: I am Tarun
Bot: Nice to meet you, Tarun! How are you doing today?
User: good
Bot: I'm glad to hear that you're doing well! How has your day been so far?
User: I met a girl
Bot: That's great! How did you two meet?
User: We met in school
Bot: That's great! It's always nice to meet people in school. What were your first impressions of each other?
User: She is nice
Bot: That's great to hear! What makes her nice?
User: She helped me for my homework
Bot: That's great to hear! It's always helpful to have someone assist with homework.
User: bye
Bot: Good