In [1]:
%load_ext autoreload
%autoreload 2

# Data Processing

In [2]:
import pandas as pd
import json
from datasets import load_dataset

In [3]:
# Label names, indexed by the integer ids stored in the dataset's "act" column.
# Index 0 is a "dummy" placeholder — presumably because act ids start at 1;
# TODO confirm against the dataset card.
act_labels = ["dummy", "inform", "question", "directive", "commissive"]
# Label names, indexed by the integer ids stored in the "emotion" column.
emotion_labels = ["no_emotion", "anger", "disgust", "fear", "happiness", "sadness", "surprise"]

In [4]:
dataset = load_dataset("daily_dialog")

In [5]:
# Write every split of the loaded dataset to its own CSV file
# ("daily_dialog-<split>.csv") in the current working directory.
for split, split_dataset in dataset.items():
    split_dataset.to_csv(f"daily_dialog-{split}.csv")

Creating CSV from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
# Read the three CSV exports back in, one frame per split.
df1 = pd.read_csv("daily_dialog-train.csv")
df2 = pd.read_csv("daily_dialog-validation.csv")
df3 = pd.read_csv("daily_dialog-test.csv")

# Stack the splits into a single frame. ignore_index=True renumbers the rows
# 0..N-1; without it every split keeps its own 0-based index, so the row label
# that is later recorded as "original_index" would be shared by unrelated
# dialogs coming from different splits.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Drop the references to the per-split frames so their memory can be reclaimed.
df1 = df2 = df3 = None

In [7]:
import spacy
from spacy_cleaner import processing, Cleaner

# Load the small English spaCy pipeline that the cleaner runs on.
model = spacy.load("en_core_web_sm")

# Token processors handed to the Cleaner, in this order. Going by their names
# they drop stop words, replace punctuation and lemmatise tokens — see the
# spacy-cleaner documentation for the exact behaviour.
cleaning_steps = (
    processing.remove_stopword_token,
    processing.replace_punctuation_token,
    processing.mutate_lemma_token,
)
cleaner = Cleaner(model, *cleaning_steps)

In [8]:
def parse_dialog(dialog, remove_punctuation=True):
    """Split a stringified dialog into a list of turns.

    Every single quote, double quote and square bracket is deleted from
    ``dialog`` and what remains is split on newline characters. Apostrophes
    inside words are deleted as well (e.g. "don't" becomes "dont").

    Args:
        dialog: The dialog as a single string, one turn per line.
        remove_punctuation: Accepted for call compatibility; the function
            does not currently read it.

    Returns:
        list[str]: The turns in order. Whitespace around a turn is kept.
    """
    stripped_chars = str.maketrans("", "", "'[]\"")
    return dialog.translate(stripped_chars).split("\n")

def parse_label_numbers(label_numbers):
    """Parse a stringified integer array such as ``"[3 4 2]"`` into a list of ints.

    Tokens are separated on any run of whitespace (spaces, tabs, newlines)
    rather than on single spaces. Padded arrays (``"[ 1 10]"``), line-wrapped
    arrays and the empty array (``"[]"``) therefore parse cleanly instead of
    raising on an empty token.

    Args:
        label_numbers: Text of the array, with or without square brackets.

    Returns:
        list[int]: The parsed numbers in their original order; an empty list
        when the text contains no numbers.

    Raises:
        ValueError: If a token is not a valid integer literal.
    """
    tokens = label_numbers.replace("[", "").replace("]", "").split()
    return [int(token) for token in tokens]

In [9]:
# Flatten the dialog-level frame into one record per turn. Each turn is paired
# with the act and emotion label names found at the same position in the
# dialog's label arrays.
rows = []

for original_index, dialog_row in df.iterrows():
    turns = parse_dialog(dialog_row["dialog"])
    act_ids = parse_label_numbers(dialog_row["act"])
    emotion_ids = parse_label_numbers(dialog_row["emotion"])
    rows.extend(
        {
            "original_index": original_index,
            "sequence": position,
            "act_label": act_labels[act_ids[position]],
            "emotion_label": emotion_labels[emotion_ids[position]],
            "text": turn_text,
        }
        for position, turn_text in enumerate(turns)
    )

turns_df = pd.DataFrame.from_records(rows)

In [10]:
cleaned_texts = cleaner.clean(turns_df["text"])

Cleaning Progress: 100%|██████████| 90010/90010 [01:34<00:00, 954.38it/s] 


In [20]:
turns_df["cleaned_text"] = cleaned_texts

Unnamed: 0,dialog,act,emotion,turns,act_label,emotion_label
0,"[Say , Jim , how about going for a few beers a...",[3 4 2 2 2 3 4 1 3 4],[0 0 0 0 0 0 4 4 4 4],"[directive|no_emotion Say , Jim , how about go...","[directive, commissive, question, question, qu...","[no_emotion, no_emotion, no_emotion, no_emotio..."
1,"[Can you do push-ups ?, Of course I can . Its ...",[2 1 2 2 1 1],[0 0 6 0 0 0],"[question|no_emotion Can you do push-ups ?, in...","[question, inform, question, question, inform]","[no_emotion, no_emotion, surprise, no_emotion,..."
2,"[Can you study with the radio on ?, No , I lis...",[2 1 2 1 1],[0 0 0 0 0],[question|no_emotion Can you study with the ra...,"[question, inform, question, inform]","[no_emotion, no_emotion, no_emotion, no_emotion]"
3,"[Are you all right ?, I will be all right soon...",[2 1 1 1],[0 0 0 0],"[question|no_emotion Are you all right ?, info...","[question, inform, inform]","[no_emotion, no_emotion, no_emotion]"
4,"[Hey John , nice skates . Are they new ?, Yeah...",[2 1 2 1 1 2 1 3 4],[0 0 0 0 0 6 0 6 0],"[question|no_emotion Hey John , nice skates . ...","[question, inform, question, inform, inform, q...","[no_emotion, no_emotion, no_emotion, no_emotio..."
...,...,...,...,...,...,...
13113,"[Frank ’ s getting married , do you believe th...",[2 2 1 2 1 2 1],[0 6 0 0 0 0 0],[question|no_emotion Frank ’ s getting married...,"[question, question, inform, question]","[no_emotion, surprise, no_emotion, no_emotion]"
13114,"[OK . Come back into the classroom , class ., ...",[1 2 1 1 1 2 1],[0 0 0 5 0 0 0],[inform|no_emotion OK . Come back into the cla...,"[inform, question, inform, inform, inform]","[no_emotion, no_emotion, no_emotion, sadness, ..."
13115,"[Do you have any hobbies ? Yes , I like coll...",[2 1 2 1 2 1 1],[0 4 4 0 6 0 0],[question|no_emotion Do you have any hobbies ?...,"[question, inform, question, inform, question]","[no_emotion, happiness, happiness, no_emotion,..."
13116,"[Jenny , whats wrong with you ? Why do you kee...",[2 1 1],[0 0 0],"[question|no_emotion Jenny , whats wrong with ...","[question, inform, inform]","[no_emotion, no_emotion, no_emotion]"


In [21]:
# NOTE(review): `new_df` is not assigned in any cell visible in this notebook —
# confirm where it is created; as written this cell fails on Restart & Run All.
new_df.head()

Unnamed: 0,dialog,act,emotion,turns,act_label,emotion_label
0,"[Say , Jim , how about going for a few beers a...",[3 4 2 2 2 3 4 1 3 4],[0 0 0 0 0 0 4 4 4 4],"[directive|no_emotion Say , Jim , how about go...","[directive, commissive, question, question, qu...","[no_emotion, no_emotion, no_emotion, no_emotio..."
1,"[Can you do push-ups ?, Of course I can . Its ...",[2 1 2 2 1 1],[0 0 6 0 0 0],"[question|no_emotion Can you do push-ups ?, in...","[question, inform, question, question, inform]","[no_emotion, no_emotion, surprise, no_emotion,..."
2,"[Can you study with the radio on ?, No , I lis...",[2 1 2 1 1],[0 0 0 0 0],[question|no_emotion Can you study with the ra...,"[question, inform, question, inform]","[no_emotion, no_emotion, no_emotion, no_emotion]"
3,"[Are you all right ?, I will be all right soon...",[2 1 1 1],[0 0 0 0],"[question|no_emotion Are you all right ?, info...","[question, inform, inform]","[no_emotion, no_emotion, no_emotion]"
4,"[Hey John , nice skates . Are they new ?, Yeah...",[2 1 2 1 1 2 1 3 4],[0 0 0 0 0 6 0 6 0],"[question|no_emotion Hey John , nice skates . ...","[question, inform, question, inform, inform, q...","[no_emotion, no_emotion, no_emotion, no_emotio..."


In [22]:
new_df.iloc[0]["dialog"]

['Say , Jim , how about going for a few beers after dinner ?',
 'You know that is tempting but is really not good for our fitness .',
 'What do you mean ? It will help us to relax .',
 'Do you really think so ? I dont . It will just make us fat and act silly . Remember last time ?',
 'I guess you are right.But what shall we do ? I dont feel like sitting at home .',
 'I suggest a walk over to the gym where we can play singsong and meet some of our friends .',
 'Thats a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them .',
 'Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too .',
 'Good.Let  s go now .   All right .']

 # Prepare for Classification


In [23]:
def get_label(act_label, emotion_label, label_type):
    """Select the training label for a single utterance.

    Args:
        act_label: Name of the utterance's dialog act.
        emotion_label: Name of the utterance's emotion.
        label_type: ``"act"`` or ``"emotion"`` to use that label on its own;
            any other value selects the combined label.

    Returns:
        ``act_label``, ``emotion_label``, or the two joined as
        ``"<act_label>|<emotion_label>"``.
    """
    if label_type == "act":
        return act_label
    if label_type == "emotion":
        return emotion_label
    return f"{act_label}|{emotion_label}"

# Which label the classifier is trained on: "act", "emotion", or any other
# value (here "combo") for the combined "<act>|<emotion>" label.
label_type = "combo"

# One {"label", "text"} record per utterance of every dialog.
# NOTE(review): `new_df` is not assigned in any cell visible in this notebook —
# confirm where it comes from.
examples = []

for _, dialog_row in new_df.iterrows():
    for position, utterance in enumerate(dialog_row["dialog"]):
        examples.append({
            "label": get_label(
                dialog_row["act_label"][position],
                dialog_row["emotion_label"][position],
                label_type,
            ),
            "text": utterance,
        })


In [24]:
classification = pd.DataFrame.from_records(examples)

In [25]:
classification

Unnamed: 0,label,text
0,directive|no_emotion,"Say , Jim , how about going for a few beers af..."
1,commissive|no_emotion,You know that is tempting but is really not go...
2,question|no_emotion,What do you mean ? It will help us to relax .
3,question|no_emotion,Do you really think so ? I dont . It will just...
4,question|no_emotion,I guess you are right.But what shall we do ? I...
...,...,...
90005,question|surprise,are you kidding ? Can you afford it ? Do you t...
90006,directive|no_emotion,"never mind that , Ill take care of it . Are yo..."
90007,commissive|no_emotion,"yeah , I think so ."
90008,inform|happiness,ok . Ill make the arrangements . It will be gr...


In [26]:
labels = classification[["label"]].sort_values("label").drop_duplicates().reset_index(drop=True)

In [27]:
# Forward map: label string -> integer id, taken from the index of the sorted,
# de-duplicated label frame. `label2id` and `labels_map` name the same dict.
labels_map = dict(zip(labels.label, labels.index))
label2id = labels_map
# Inverse map: integer id -> label string.
id2label = {label_id: name for name, label_id in labels_map.items()}

In [28]:
labels_map

{'commissive|anger': 0,
 'commissive|disgust': 1,
 'commissive|fear': 2,
 'commissive|happiness': 3,
 'commissive|no_emotion': 4,
 'commissive|sadness': 5,
 'commissive|surprise': 6,
 'directive|anger': 7,
 'directive|disgust': 8,
 'directive|fear': 9,
 'directive|happiness': 10,
 'directive|no_emotion': 11,
 'directive|sadness': 12,
 'directive|surprise': 13,
 'inform|anger': 14,
 'inform|disgust': 15,
 'inform|fear': 16,
 'inform|happiness': 17,
 'inform|no_emotion': 18,
 'inform|sadness': 19,
 'inform|surprise': 20,
 'question|anger': 21,
 'question|disgust': 22,
 'question|fear': 23,
 'question|happiness': 24,
 'question|no_emotion': 25,
 'question|sadness': 26,
 'question|surprise': 27}

In [29]:
num_labels = len(labels)

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
from datasets import Dataset, DatasetDict

In [32]:
# Hold out 20% of the utterances for evaluation. A fixed random_state makes the
# split, and therefore the reported metrics, reproducible across runs.
train_df, val_df = train_test_split(classification, test_size=0.2, random_state=42)

# Encode the string labels as integer ids on fresh frames. The previous
# `frame["label"].replace(..., inplace=True)` modified a temporary Series taken
# from a slice of `classification`; under pandas Copy-on-Write that change is
# never written back to the frame. `astype` fails loudly if any label is
# missing from `labels_map` (unmapped values would otherwise become NaN).
train_df = train_df.assign(label=train_df["label"].map(labels_map).astype("int64"))
val_df = val_df.assign(label=val_df["label"].map(labels_map).astype("int64"))

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

In [33]:
dataset = DatasetDict({"train": train_dataset, "test": val_dataset})

In [34]:
from transformers import AutoModelForSequenceClassification

# Load distilroberta-base with a sequence-classification head sized to the
# number of distinct labels. The head weights are newly initialised (see the
# warning printed below), so the model must be fine-tuned before use.
# Note: this rebinds `model`, which earlier held the spaCy pipeline.
model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=num_labels)

# Attach the id <-> label-name mappings to the model config.
model.config.id2label = id2label
model.config.label2id = label2id

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
from transformers import TrainingArguments

output_dir = f"../../models/distilroberta-daily_dialog-{label_type}"

In [36]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [37]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is calculated directly with NumPy instead of going through a
    metric object obtained from ``datasets.load_metric``. That loader is
    deprecated and needs to fetch a metric script, so the function no longer
    depends on it or on any module-level metric variable.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along its last axis; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        predictions equal to their reference label, as a Python float.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the highest score for each example.
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [38]:
from transformers import TrainingArguments, Trainer

# Training configuration: checkpoints and outputs go to `output_dir`, the
# evaluation set is scored at the end of every epoch, and training runs for
# 6 epochs. Every other hyperparameter keeps its TrainingArguments default.
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    num_train_epochs=6
)

In [39]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded with ``padding="max_length"``, so every
    example in the output has the same length.

    Args:
        examples: A batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

Map:   0%|          | 0/72008 [00:00<?, ? examples/s]

Map:   0%|          | 0/18002 [00:00<?, ? examples/s]

In [40]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.3683,1.344033,0.565604
2,1.2747,1.339678,0.57738
3,1.1707,1.312579,0.579158
4,1.0484,1.364442,0.56827


In [None]:
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
tokenizer.save_vocabulary(output_dir)

In [None]:
from transformers import pipeline
classifier = pipeline("text-classification", model=output_dir, return_all_scores=True)

In [None]:
classifier.model.config.id2label = id2label
classifier.model.config.label2id = labels_map

In [None]:
classifier.model.config

In [None]:
from transformers import pipeline

classifier = pipeline("text-classification", model=output_dir, return_all_scores=True)
classifier.model.config.id2label = id2label
classifier.model.config.label2id = label2id

def classify(text):
    """Return the label the classifier scores highest for ``text``.

    The module-level ``classifier`` pipeline is called on ``text``; the first
    element of its result is treated as a list of ``{"label", "score"}``
    entries, and the label of the entry with the largest score is returned.
    """
    scored_labels = classifier(text)[0]
    best_entry = max(scored_labels, key=lambda entry: entry["score"])
    return best_entry["label"]

In [None]:
classify("What's going on?")

In [None]:
classify("I don't think so.")

In [None]:
classify("You think so? I don't know, really.")

In [None]:
classify("Do as I say.")

In [None]:
classify("Tell that guy to shut up")

In [None]:
classify("I'm sick of seeing you here")

In [None]:
classify("Take the book and read it")

In [None]:
classify("Do you think it's okay?")

In [None]:
classify("It's tempting but it's not good for our fitness")

In [None]:
classify("Sometimes I think life is not worth living.")

In [None]:
classify("Find another person.")

In [None]:
classify("Re-train on this dataset")

In [None]:
classify("Do this")

In [None]:
classify("Get this from google")

In [None]:
classify('why not go again to celebrate out one-year anniversary ? We can go to the same beach , stay in the same hotel and enjoy a dinner in the same restaurant .')

In [None]:
classify("Why bother")

In [None]:
classify("Who's the author of this article")

In [None]:
classify("Where was this")