In [12]:
# --- CELL 1: Imports and Path Configuration ---
import os
import torch
from datasets import load_dataset, concatenate_datasets, Dataset
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

# --- CONFIGURE YOUR LOCAL PATHS HERE ---
YOUR_SAVE_PATH = "EmpatheticChatbot-Kaggle-DialoGPT" 
BASE_MODEL = "microsoft/DialoGPT-medium"
os.makedirs(YOUR_SAVE_PATH, exist_ok=True)
print(f"Setup complete. Model will be saved to: {os.path.abspath(YOUR_SAVE_PATH)}")

Setup complete. Model will be saved to: D:\Emoticon\myapp\backend\EmpatheticChatbot-Kaggle-DialoGPT


In [13]:
# --- CELL 2: Tokenizer and Hardware Check ---
print("Setting up tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
EOS = tokenizer.eos_token
print("Tokenizer is ready.")

# Check for GPU and set the device
if torch.cuda.is_available():
    device = "cuda"
    print(f"✅ GPU found! Using device: {device}")
else:
    device = "cpu"
    print("⚠️ No GPU found. Using CPU. Training will be VERY slow.")

use_fp16 = True if device == "cuda" else False

Setting up tokenizer...
Tokenizer is ready.
⚠️ No GPU found. Using CPU. Training will be VERY slow.


In [20]:
# --- DIAGNOSTIC CELL ---
# Run this cell by itself.

import pandas as pd

kaggle_csv_path = "empathetic_dialogues.csv"
df_empathetic = pd.read_csv(kaggle_csv_path)

print("THE REAL COLUMN NAMES ARE:")
print(df_empathetic.columns.tolist())

THE REAL COLUMN NAMES ARE:
['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels', 'Unnamed: 5', 'Unnamed: 6']


In [22]:
# Cell 3: Load and Process Datasets (FINAL WORKING VERSION)
import pandas as pd
from datasets import Dataset

print("Loading and processing datasets...")

# --- 1. LOAD THE KAGGLE CSV ---
kaggle_csv_path = "empathetic_dialogues.csv"
if not os.path.exists(kaggle_csv_path):
    raise FileNotFoundError(f"ERROR: File not found at '{os.path.abspath(kaggle_csv_path)}'.")

print(f"Reading Kaggle dataset from: {kaggle_csv_path}")
df_empathetic = pd.read_csv(kaggle_csv_path)
print(f"Loaded {len(df_empathetic)} dialogues from CSV.")


# --- 2. LOAD PERSONA-CHAT ---
print("\nLoading 'blended_skill_talk' dataset from Hugging Face...")
persona_dataset = load_dataset("blended_skill_talk")
print("--- Raw Datasets Loaded Successfully ---")


# --- 3. DEFINE FORMATTING FUNCTIONS (WITH ALL CORRECTIONS) ---
def format_kaggle_dialogue(row):
    """Uses the correct column names for the Kaggle CSV."""
    prompt = str(row['Situation']).strip()
    response = str(row['empathetic_dialogues']).strip()
    return f"{prompt}{EOS}{response}{EOS}"


def format_persona_dialogue(example):
    """
    Uses the correct column names for the blended_skill_talk dataset.
    The key is 'personas', not 'context_personas'.
    """
    # --- THIS IS THE LINE THAT HAS BEEN FIXED ---
    persona = " ".join(example["personas"])
    # -------------------------------------------
    
    context = example["previous_utterance"]
    full_text = persona + EOS + " ".join(context) + EOS + example["free_messages"][0]
    return {"text": full_text + EOS}


# --- 4. APPLY FORMATTING ---
df_empathetic['text'] = df_empathetic.apply(format_kaggle_dialogue, axis=1)
empathetic_formatted = Dataset.from_pandas(df_empathetic)

# This line will now work correctly.
persona_formatted = persona_dataset.map(
    format_persona_dialogue,
    remove_columns=list(persona_dataset['train'].features)
)


# --- 5. SPLIT AND CONCATENATE DATASETS ---
empathetic_train = empathetic_formatted.train_test_split(test_size=0.1, seed=42)
persona_train = persona_formatted['train']
persona_eval = persona_formatted['validation']
train_dataset = concatenate_datasets([empathetic_train['train'], persona_train]).shuffle(seed=42)
eval_dataset = concatenate_datasets([empathetic_train['test'], persona_eval])
print(f"\n--- Datasets Combined. Train examples: {len(train_dataset)}, Eval examples: {len(eval_dataset)} ---")


# --- 6. TOKENIZE ---
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=eval_dataset.column_names)

print("✅ Datasets are tokenized and ready for training.")

Loading and processing datasets...
Reading Kaggle dataset from: empathetic_dialogues.csv
Loaded 64636 dialogues from CSV.

Loading 'blended_skill_talk' dataset from Hugging Face...
--- Raw Datasets Loaded Successfully ---


Map:   0%|          | 0/4819 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/980 [00:00<?, ? examples/s]


--- Datasets Combined. Train examples: 62991, Eval examples: 7473 ---


Map:   0%|          | 0/62991 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

✅ Datasets are tokenized and ready for training.


In [7]:
# Cell 4: Configure and Run Training

print("Configuring and starting the training process...")

# --- 1. LOAD THE BASE MODEL ---
# This loads the pre-trained DialoGPT model from Hugging Face.
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

# --- 2. DEFINE THE DATA COLLATOR ---
# The data collator is a utility that will batch our processed examples together.
# For causal language modeling (like GPT), we don't need a special mask (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- 3. DEFINE TRAINING ARGUMENTS ---
# This object contains all the parameters to configure the training process.
use_fp16 = True  # or False, depending on your setup

training_args = TrainingArguments(
    output_dir=r"D:\Emoticon\myapp\backend\EmoticonAi",  # or use double backslashes or forward slashes
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,

    # Evaluation and Saving Strategy
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    save_total_limit=2,

    # Performance and Logging
    fp16=use_fp16,
    logging_steps=100,
    report_to="none",
)


# --- 4. INITIALIZE THE TRAINER ---
# The Trainer object orchestrates the entire training and evaluation process.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# --- 5. START TRAINING ---
print("🚀 Starting fine-tuning... This can take several hours depending on your hardware.")
print("   You will see loss metrics and progress bars below.")
trainer.train()
print("✅ Fine-tuning complete!")


# --- 6. SAVE THE FINAL BEST MODEL ---
# The trainer automatically loaded the best model, so we just need to save it.
print("\nSaving final best model to a clean directory...")
final_model_path = os.path.join(D:\Emoticon\myapp\backend\EmoticonAi, "final_best_model")
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path) # Important: also save the tokenizer with the model

print(f"✅ All steps complete! Your best model is saved at: {os.path.abspath(final_model_path)}")

SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (807134386.py, line 18)

In [6]:
# Cell 5: Interactive Test Case
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

# --- 1. LOAD THE FINE-TUNED MODEL ---
# This path must point to the folder where the best model was saved.
# It should contain files like `pytorch_model.bin` and `config.json`.
final_model_path = os.path.join("D:\Emoticon\myapp\backend\EmpatheticChatbot-Kaggle-DialoGPT", "final_best_model")

if not os.path.exists(final_model_path):
    print("="*50)
    print(f"ERROR: Model not found at '{os.path.abspath(final_model_path)}'")
    print("Please make sure you have run the training script successfully and the path is correct.")
    print("="*50)
else:
    print(f"Loading model from: {os.path.abspath(final_model_path)}")
    tokenizer = AutoTokenizer.from_pretrained(final_model_path)
    model = AutoModelForCausalLM.from_pretrained(final_model_path)

    # Check for GPU and move model to the correct device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print(f"Model loaded on {device.upper()}.")


    # --- 2. SETUP THE INTERACTIVE CHAT LOOP ---
    # This tensor will store the entire conversation history
    chat_history_ids = None

    print("\nStarting Chatbot. Type 'quit' to exit.")
    print("-" * 30)

    while True:
        try:
            # Get user input
            user_input = input(">> You: ")

            if user_input.lower() == 'quit':
                print("Bot: Goodbye!")
                break

            # Encode the new user input, add the eos_token, and return a tensor in PyTorch
            new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt').to(device)

            # Append the new user input tokens to the chat history
            # If this is the first turn, bot_input_ids is just the user input.
            # Otherwise, it's the concatenation of the past history and the new input.
            bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if chat_history_ids is not None else new_user_input_ids

            # --- 3. GENERATE A RESPONSE ---
            # The model.generate() function creates the response.
            # The `chat_history_ids` now becomes the full conversation history.
            chat_history_ids = model.generate(
                bot_input_ids,
                max_length=1000,                               # Max length of the entire conversation
                pad_token_id=tokenizer.eos_token_id,
                no_repeat_ngram_size=3,                       # Prevents the model from repeating the same 3-word phrases
                do_sample=True,                               # Activates sampling-based generation (more creative)
                top_k=50,                                     # Considers the top 50 most likely words at each step
                top_p=0.95,                                   # Nucleus sampling: keeps the most probable tokens with cumulative probability >= 0.95
                temperature=0.7                               # Makes the output less random (lower) or more random (higher)
            )

            # --- 4. DECODE AND PRINT THE RESPONSE ---
            # Decode the last response from the bot.
            # `bot_input_ids.shape[-1]:` ensures we only decode the new part of the history.
            response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
            print(f"Bot: {response}")

        except Exception as e:
            print(f"An error occurred: {e}")
            break

ERROR: Model not found at 'D:\Emoticon\myapackend\EmpatheticChatbot-Kaggle-DialoGPT\final_best_model'
Please make sure you have run the training script successfully and the path is correct.
