In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path
from typing import List, Dict, Union, Tuple

import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM
)
from datasets import Dataset, DatasetDict, list_metrics, load_metric, load_from_disk


In [6]:
# Checkpoint name handed to AutoTokenizer.from_pretrained later in the notebook.
base_model = "microsoft/DialoGPT-small"
# Directory that load_data() scans for the *.csv splits.
data_dir = "../data/empatheticdialogues"
# Presumably where the fine-tuned model is written; not used by the visible cells.
output_dir = "../models/empathetic-DialoGPT-small"
# Presumably the per-device training batch size -- TODO confirm against the Trainer setup.
batch_size = 2
# Logs share the model output directory.
logging_dir = output_dir
# Presumably forwarded to TrainingArguments -- TODO confirm.
prediction_loss_only = True

In [68]:
def find_data_filepaths(data_dir):
    """Return the CSV files that sit directly inside ``data_dir``.

    Args:
        data_dir: Directory to search (``str`` or ``Path``). Sub-directories
            are not searched.

    Returns:
        A list of ``Path`` objects, one per ``*.csv`` file, sorted by path.
    """
    # ``Path.glob`` yields entries in filesystem order, which differs between
    # machines; sorting keeps the split order stable from run to run.
    return sorted(Path(data_dir).glob("*.csv"))


def load_data(data_dir):
    """Load every CSV found in ``data_dir`` into a DataFrame.

    Args:
        data_dir: Directory passed on to ``find_data_filepaths``.

    Returns:
        A dict mapping each file's stem (file name without ``.csv``) to the
        DataFrame parsed from it. Files are decoded as UTF-8 and rows that
        pandas cannot parse are skipped rather than raising.
    """
    return {
        csv_path.stem: pd.read_csv(csv_path, encoding="utf-8", on_bad_lines='skip')
        for csv_path in find_data_filepaths(data_dir)
    }


def create_datasets(data, eos_token="<|endofsentence|>"):
    """Build one text example per conversation for every split in ``data``.

    Args:
        data: Mapping of split name to a DataFrame that has at least the
            ``conv_id`` and ``utterance`` columns.
        eos_token: Separator placed between consecutive utterances of the same
            conversation.
            NOTE(review): the token ids recorded in this notebook show the
            default string being split into several sub-word ids by the
            DialoGPT tokenizer instead of mapping to a single id; passing
            ``tokenizer.eos_token`` may be what is wanted -- confirm.

    Returns:
        A dict mapping split name to a ``Dataset`` with a single ``"text"``
        column holding one row per ``conv_id``, in order of first appearance.
        Conversations that happen to share identical text are kept as
        separate rows.
    """
    datasets = {}
    for name, df in data.items():
        # ``str.join`` raises TypeError on NaN, so rows without an id or an
        # utterance are dropped before the turns are concatenated.
        turns = df[["conv_id", "utterance"]].dropna()
        # Aggregate once per conversation instead of broadcasting the joined
        # text to every row and de-duplicating afterwards; ``sort=False`` keeps
        # the conversations in the order they appear in the frame.
        conversations = (
            turns["utterance"]
            .astype(str)
            .groupby(turns["conv_id"], sort=False)
            .agg(lambda utterances: eos_token.join(utterances))
        )
        datasets[name] = Dataset.from_dict({"text": conversations.tolist()})
    return datasets

In [69]:
# Read every CSV split found in data_dir, then preview the "test" frame.
data = load_data(data_dir)
data["test"]

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:0,1,guilty,I felt guilty when I was driving home one nigh...,0,Yeah about 10 years ago I had a horrifying exp...,2|2|5_5|5|5,
1,hit:0_conv:0,3,guilty,I felt guilty when I was driving home one nigh...,0,No I wasn't hit. It turned out they were drunk...,2|2|5_5|5|5,
2,hit:0_conv:0,5,guilty,I felt guilty when I was driving home one nigh...,0,I don't know I was new to driving and hadn't e...,2|2|5_5|5|5,
3,hit:34_conv:69,1,caring,My mother stopped by my house one day and said...,45,Well_comma_ can you tell me about your experie...,5|5|5_3|2|2,
4,hit:34_conv:69,3,caring,My mother stopped by my house one day and said...,45,Oh my goodness_comma_ that's very scary! I hop...,5|5|5_3|2|2,
...,...,...,...,...,...,...,...,...
5696,hit:12413_conv:24826,3,grateful,I'm glad that life is being good to me,437,Glad you think so as well!,5|5|5_4|3|4,
5697,hit:12416_conv:24832,1,disgusted,I saw a huge cockroach outside my house today....,481,I saw a huge cockroach outside my house today!,5|5|5_4|3|4,
5698,hit:12416_conv:24832,3,disgusted,I saw a huge cockroach outside my house today....,481,Not yet since it's the weekend. We live in Tex...,5|5|5_4|3|4,
5699,hit:12423_conv:24847,1,anxious,I have a big test on Monday. I am so nervous_c...,481,I have a big test on Monday_comma_ I am so ner...,5|5|5_5|5|5,


In [72]:
# Collapse each split into conversation-level text rows and inspect the
# "test" split: first its (rows, columns) shape, then its summary repr.
datasets = create_datasets(data)
print(datasets["test"].shape)
datasets["test"]

(2541, 1)


Dataset({
    features: ['text'],
    num_rows: 2541
})

In [122]:
def preprocess_function(tokenizer, text_column="text", max_length=256):
    """Create the tokenising callback that is handed to ``Dataset.map``.

    Args:
        tokenizer: Callable tokenizer; it is invoked with a list of strings
            plus ``padding``, ``truncation`` and ``max_length`` keyword
            arguments and must return a mapping that contains ``"input_ids"``.
        text_column: Name of the column that holds the raw text. Only this
            column is tokenised.
        max_length: Every example is padded, and truncated if necessary, to
            exactly this many tokens.

    Returns:
        A function that takes one example (``{text_column: str}``) or one batch
        (``{text_column: list of str}``), stores the token ids under
        ``"input_ids"`` and returns the same mapping.
    """
    def _tokenize(examples):
        raw_text = examples[text_column]
        # Accept both the per-example form used by ``Dataset.map`` by default
        # and the list form it passes when called with ``batched=True``.
        is_single = isinstance(raw_text, str)
        texts = [raw_text] if is_single else list(raw_text)
        # The corpus spells commas as the literal marker "_comma_".
        sanitized_text = [text.replace("_comma_", ",") for text in texts]
        tokenized = tokenizer(
            sanitized_text,
            padding="max_length",
            # Without truncation, texts longer than ``max_length`` keep their
            # full length and the resulting rows are ragged.
            truncation=True,
            max_length=max_length,
        )
        input_ids = tokenized["input_ids"]
        examples["input_ids"] = input_ids[0] if is_single else input_ids
        return examples
    return _tokenize


def preprocess_datasets(datasets, tokenizer, text_column="text", max_length=256):
    """Tokenise every split and expose its ``input_ids`` as torch tensors.

    Args:
        datasets: Mapping of split name to a dataset object providing
            ``map``, ``column_names`` and ``set_format``.
        tokenizer: Tokenizer forwarded to ``preprocess_function``.
        text_column: Name of the raw-text column, forwarded to
            ``preprocess_function``.
        max_length: Target sequence length, forwarded to
            ``preprocess_function``.

    Returns:
        A dict mapping each split name to its mapped dataset. The original
        columns are removed and the output format is set to torch with
        ``"input_ids"`` as the only exposed column.
    """
    # The callback depends only on the arguments, so build it once rather than
    # once per split.
    tokenize = preprocess_function(tokenizer, text_column, max_length)
    preprocessed_datasets = {}

    for name, dataset in datasets.items():
        ds = dataset.map(
            tokenize,
            # ``remove_columns`` is documented as a str or a list of names;
            # ``column_names`` is a plain list, unlike a ``dict_keys`` view.
            remove_columns=dataset.column_names,
        )
        ds.set_format(type="torch", columns=["input_ids"])
        preprocessed_datasets[name] = ds

    return preprocessed_datasets
        
    
    

In [115]:
# Load the tokenizer that belongs to base_model and register "[PAD]" as its
# padding token so that padding="max_length" can be used during tokenisation.
# NOTE(review): the recorded return value of 1 indicates one new token was
# added to the vocabulary; a model trained with this tokenizer presumably needs
# resize_token_embeddings(len(tokenizer)) -- confirm where the model is loaded.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [123]:
# Tokenise and pad every split; Dataset.map prints one progress bar per split.
preprocessed_datasets = preprocess_datasets(datasets, tokenizer)

  0%|          | 0/2541 [00:00<?, ?ex/s]

  0%|          | 0/2763 [00:00<?, ?ex/s]

  0%|          | 0/17839 [00:00<?, ?ex/s]

In [124]:
# Display each split's remaining features and row count.
preprocessed_datasets

{'test': Dataset({
     features: ['input_ids'],
     num_rows: 2541
 }),
 'valid': Dataset({
     features: ['input_ids'],
     num_rows: 2763
 }),
 'train': Dataset({
     features: ['input_ids'],
     num_rows: 17839
 })}

In [125]:
# Inspect the token ids of the first training example. Index the row first and
# the column second: selecting the column first materialises "input_ids" for
# every row just to read one of them.
preprocessed_datasets["train"][0]["input_ids"]

tensor([   40,  3505,  1016,   284,   766,   262, 26056,   351,   616,  1266,
         1545,    13,   632,   373,   262,   717,   640,   356,  1683,  3377,
          640,  3436,  1978,    13,  4900,   612,   373,   257,  1256,   286,
          661,    11,   356,  2936,   588,   262,   691,   661,   287,   262,
          995, 29847,    91,   437,  1659, 34086,   594,    91,    29, 16973,
          428,   257,  1545,   345,   547,   287,  1842,   351,    11,   393,
          655,   257,  1266,  1545,    30,    27,    91,   437,  1659, 34086,
          594,    91,    29,  1212,   373,   257,  1266,  1545,    13,   314,
         2051,   607, 29847,    91,   437,  1659, 34086,   594,    91,    29,
         8496,   468,   673,  3750,    30,    27,    91,   437,  1659, 34086,
          594,    91,    29,  1135,   645,  2392,  1561, 29847,    91,   437,
         1659, 34086,   594,    91,    29,  5812,   373,   428,  1223,   326,
         3022,   780,   286,   281,  4578,    30, 50257, 50257, 

In [126]:
# Decode the first training example back to text to check the separators and
# padding. The row is indexed before the column so that only one example is
# loaded instead of the whole "input_ids" column.
tokenizer.decode(preprocessed_datasets["train"][0]["input_ids"])

'I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.<|endofsentence|>Was this a friend you were in love with, or just a best friend?<|endofsentence|>This was a best friend. I miss her.<|endofsentence|>Where has she gone?<|endofsentence|>We no longer talk.<|endofsentence|>Oh was this something that happened because of an argument?[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][

In [120]:
# Wrap the plain {split name: Dataset} dict in a DatasetDict and display it.
DatasetDict(preprocessed_datasets)

DatasetDict({
    test: Dataset({
        features: ['input_ids'],
        num_rows: 2541
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 2763
    })
    train: Dataset({
        features: ['input_ids'],
        num_rows: 17839
    })
})

In [127]:
from src.dataset import load_and_preprocess_datasets

In [129]:
# Re-create the tokenizer (with the "[PAD]" padding token) and run the helper
# imported from src.dataset on data_dir.
# NOTE(review): presumably this helper wraps the load -> create -> preprocess
# steps defined in the cells above -- confirm against src/dataset.py.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

preprocessed_datasets = load_and_preprocess_datasets(data_dir, tokenizer)

  0%|          | 0/2541 [00:00<?, ?ex/s]

  0%|          | 0/2763 [00:00<?, ?ex/s]

  0%|          | 0/17839 [00:00<?, ?ex/s]