# Exploratory Data Analysis (EDA) with Python

In [49]:
# imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from transformers import GPT2Tokenizer
from transformers import TFGPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset
import re
import string

In [35]:
# load the dataset
df = pd.read_csv('data/emotion-emotion_69k.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5,Unnamed: 6
0,0,I remember going to the fireworks with my best...,sentimental,Customer :I remember going to see the firework...,"Was this a friend you were in love with, or ju...",,
1,1,I remember going to the fireworks with my best...,sentimental,Customer :This was a best friend. I miss her.\...,Where has she gone?,,
2,2,I remember going to the fireworks with my best...,sentimental,Customer :We no longer talk.\nAgent :,Oh was this something that happened because of...,,
3,3,I remember going to the fireworks with my best...,sentimental,Customer :Was this a friend you were in love w...,This was a best friend. I miss her.,,
4,4,I remember going to the fireworks with my best...,sentimental,Customer :Where has she gone?\nAgent :,We no longer talk.,,


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64636 entries, 0 to 64635
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            64636 non-null  int64 
 1   Situation             64636 non-null  object
 2   emotion               64632 non-null  object
 3   empathetic_dialogues  64636 non-null  object
 4   labels                64636 non-null  object
 5   Unnamed: 5            113 non-null    object
 6   Unnamed: 6            5 non-null      object
dtypes: int64(1), object(6)
memory usage: 3.5+ MB


In [37]:
df.describe()

Unnamed: 0.1,Unnamed: 0
count,64636.0
mean,32317.5
std,18658.950337
min,0.0
25%,16158.75
50%,32317.5
75%,48476.25
max,64635.0


In [38]:
df.columns

Index(['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels',
       'Unnamed: 5', 'Unnamed: 6'],
      dtype='object')

In [39]:
# Drop irrelevant columns
df = df[['empathetic_dialogues', 'labels']].dropna()

# Extract only the last speaker line from 'empathetic_dialogues'
def extract_last_turn(dialogue):
    lines = dialogue.split('\n')
    for line in reversed(lines):
        if "Customer :" in line or "Agent :" in line:
            return line.strip()
    return lines[-1].strip()

df['input'] = df['empathetic_dialogues'].apply(extract_last_turn)
df['target'] = df['labels'].apply(str)

In [40]:

# Final Data
df = df[['input', 'target']]
df.head(10)

Unnamed: 0,input,target
0,Agent :,"Was this a friend you were in love with, or ju..."
1,Agent :,Where has she gone?
2,Agent :,Oh was this something that happened because of...
3,Agent :,This was a best friend. I miss her.
4,Agent :,We no longer talk.
5,Agent :,Oh ya? I don't really see how
6,Agent :,I do actually hit blank walls a lot of times b...
7,Agent :,Wait what are sweatings
8,Agent :,dont you feel so.. its a wonder
9,Agent :,i virtually thought so.. and i used to get sw...


In [44]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # important for batching

# Tokenize data
def tokenize(sample):
    return tokenizer(sample['input'], padding="max_length", truncation=True, max_length=64), \
           tokenizer(sample['target'], padding="max_length", truncation=True, max_length=64)

input_encodings = tokenizer(list(df['input']), padding=True, truncation=True, max_length=64, return_tensors="tf")
target_encodings = tokenizer(list(df['target']), padding=True, truncation=True, max_length=64, return_tensors="tf")


In [45]:
# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Ensure padding works

# Combine input and target into one field
df['dialogue'] = df['input'] + tokenizer.eos_token + df['target'] + tokenizer.eos_token

# Tokenize full dialogue sequences
encodings = tokenizer(list(df['dialogue']), truncation=True, padding=True, max_length=64, return_tensors='tf')

# Create dataset: inputs and labels are the same
dataset = tf.data.Dataset.from_tensor_slices((
    {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"]
    },
    encodings["input_ids"]  # this is the label
))

# Shuffle and batch
BATCH_SIZE = 4
dataset = dataset.shuffle(1000).batch(BATCH_SIZE)

In [46]:


model = TFGPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # for pad_token

# Compile
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn)


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [47]:
# Training
EPOCHS = 3

model.fit(dataset, epochs=EPOCHS)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7f0444475990>

In [48]:
def generate_reply(prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors="tf")
    outputs = model.generate(**inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try generating
generate_reply("Customer :I miss my best friend.")


'Customer :I miss my best friend...........................................'

In [50]:
# Load only the training split
dataset = load_dataset("empathetic_dialogues", split="train")
print(dataset[0])


{'conv_id': 'hit:0_conv:1', 'utterance_idx': 1, 'context': 'sentimental', 'prompt': 'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.', 'speaker_idx': 1, 'utterance': 'I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people_comma_ we felt like the only people in the world.', 'selfeval': '5|5|5_2|2|5', 'tags': ''}


In [51]:
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    return text

# Extract context + response pairs
input_texts = []
target_texts = []

for sample in dataset:
    dialogue = sample['utterance']
    context = clean_text(sample['context'])  # e.g., "Feeling anxious about..."
    utterance = clean_text(dialogue)

    # Basic pair: input is context, output is response
    input_texts.append("startseq " + context + " endseq")
    target_texts.append("startseq " + utterance + " endseq")


In [60]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Combine input and output for shared vocabulary
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(input_texts + target_texts)

vocab_size = len(tokenizer.word_index) + 1
print(f"Vocab size: {vocab_size}")

# Convert texts to sequences
input_seqs = tokenizer.texts_to_sequences(input_texts)
target_seqs = tokenizer.texts_to_sequences(target_texts)

# Pad sequences
max_len = 30
input_seqs = pad_sequences(input_seqs, maxlen=max_len, padding='post')
target_seqs = pad_sequences(target_seqs, maxlen=max_len, padding='post')


Vocab size: 42193


In [61]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

embedding_dim = 128
lstm_units = 256

# Encoder
encoder_inputs = Input(shape=(max_len,))
encoder_embed = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True)(encoder_embed)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_len,))
decoder_embed = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embed, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()




Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 30)]                 0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 30)]                 0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, 30, 128)              5400704   ['input_5[0][0]']             
                                                                                                  
 embedding_4 (Embedding)     (None, 30, 128)              5400704   ['input_6[0][0]']             
                                                                                            

In [62]:
# Shift target by 1 for teacher forcing
decoder_target_data = np.expand_dims(target_seqs, -1)

model.fit(
    [input_seqs, target_seqs],
    decoder_target_data,
    batch_size=64,
    epochs=3,
    validation_split=0.1
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f0411b50cd0>

In [63]:
# Re-create the decoder embedding layer and copy weights
decoder_embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
decoder_embedding_layer.build((None,))
decoder_embedding_layer.set_weights(model.get_layer(index=3).get_weights())  # reuse weights from trained model

decoder_embed2 = decoder_embedding_layer(decoder_inputs)
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_embed2, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs2 = decoder_dense(decoder_lstm_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states
)


In [64]:
def generate_reply(input_text):
    input_text = "startseq " + clean_text(input_text) + " endseq"
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')

    # Get encoder states
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['startseq']

    stop_condition = False
    decoded_sentence = ""

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_word_index.get(sampled_token_index, '')

        if sampled_word == 'endseq' or len(decoded_sentence.split()) > max_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()

print(generate_reply("I'm feeling so overwhelmed and anxious today"))





KeyboardInterrupt: 

AttributeError: 'Embedding' object has no attribute 'predict'

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")


In [None]:
# For managing history across turns
chat_history_ids = None

def chat_with_bot(user_input, chat_history_ids=None):
    # Encode the user input and add end-of-string token
    new_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

    # Append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1) if chat_history_ids is not None else new_input_ids

    # Generate a response
    output_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8
    )

    # Decode and return the last output
    response = tokenizer.decode(output_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    return response, output_ids


In [None]:
chat_history_ids = None  # reset history

while True:
    user_input = input("You: ")
    if user_input.lower() in ['quit', 'exit']:
        break
    response, chat_history_ids = chat_with_bot(user_input, chat_history_ids)
    print(f"Bot: {response}")
