In [24]:
import regex as re
import os
import time
import numpy as np
import pandas as pd
import random
import tqdm
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [2]:
from torchfly.transformers import GPT2SimpleLM, UnifiedTokenizer

from utils import safe_clean_text

In [3]:
def extract_data(df_dialogs):
    """Group the utterances of a dialog table by their "B2" value and tokenize them.

    Relies on two names from the notebook namespace: `safe_clean_text` (applied
    to every prefixed utterance) and `tokenizer` (its `encode` method produces
    the token ids).

    Args:
        df_dialogs: DataFrame with one utterance per row. The columns read are
            "B2" (grouping key; presumably the conversation id - TODO confirm),
            "B4" (0 selects the "A:" prefix, any other value selects "B:") and
            "Unit" (the utterance text).

    Returns:
        dict mapping each "B2" value to {"text": [...], "token_ids": [...]}.
        Both lists are in row order and have one entry per utterance; every
        text ends with three newlines.
    """
    data = {}

    for i in tqdm.trange(len(df_dialogs)):
        # Read from the argument. The previous version indexed the global `df`
        # here, which silently ignored whatever frame the caller passed in.
        line = df_dialogs.iloc[i]

        dialog = data.setdefault(line["B2"], {"text": [], "token_ids": []})

        # The two speakers differ only in the prefix put in front of the turn.
        speaker = "A:" if line["B4"] == 0 else "B:"
        text = safe_clean_text(speaker + line["Unit"].strip())
        text += "\n\n\n"

        dialog["text"].append(text)
        dialog["token_ids"].append(tokenizer.encode(text))

    return data

In [4]:
# Notebook-level tokenizer; extract_data() reads this global when it encodes each utterance.
tokenizer = UnifiedTokenizer()

In [5]:
# Load the dialog table (one utterance per row); extract_data() reads its
# "B2", "B4" and "Unit" columns. The path is relative to the working directory.
df = pd.read_csv("dialog_data/persuasionforgood/data/FullData/full_dialog.csv")

In [6]:
# Tokenize every dialog, then discard the grouping keys and keep only the
# per-dialog {"text", "token_ids"} records as a list.
data = list(extract_data(df).values())

100%|██████████| 20932/20932 [00:05<00:00, 3582.42it/s]


In [7]:
# Hold out a fixed number of dialogs for validation. The permutation comes from
# a locally seeded generator so that Restart & Run All reproduces the same
# split every time, and the global NumPy random state is left untouched.
SPLIT_SEED = 0
VAL_SIZE = 100

indices = np.random.RandomState(SPLIT_SEED).permutation(len(data))
train_data = [data[idx] for idx in indices[VAL_SIZE:]]
val_data = [data[idx] for idx in indices[:VAL_SIZE]]

In [31]:
# Persist both splits as indented JSON, one file per split.
for split_path, split_records in (("train.json", train_data), ("val.json", val_data)):
    with open(split_path, "w") as f:
        json.dump(split_records, f, indent=4)

In [8]:
# import huggingface transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, WarmupLinearSchedule

In [14]:
class DialogFragmentSampler:
    """Cut a random, turn-aligned fragment out of a dialog that is too long.

    A fragment always starts on an even turn index and ends on an odd turn
    index, so it is made of complete (even, odd) turn pairs.
    """

    def __init__(self, max_len=1024):
        """
        Args:
            max_len: token budget. Dialogs with fewer tokens than this are
                returned untouched; sampled fragments hold strictly fewer
                tokens than this.
        """
        self.max_tokens_len = max_len

    def __call__(self, dialog):
        """Return `dialog` itself if it fits the budget, else a random fragment.

        Args:
            dialog: dict with keys "text" and "token_ids", two parallel lists
                with one entry per turn; each "token_ids" entry is a sequence
                whose length is that turn's token count.

        Returns:
            The same `dialog` object when its total token count is below the
            budget. Otherwise a new dict with only the keys "text" and
            "token_ids", sliced to the chosen turns.

        Raises:
            ValueError: if the dialog is over budget but no even-start /
                odd-end span fits in the budget (for example a single turn
                pair that is already too long).
        """
        lengths = np.array([len(item) for item in dialog['token_ids']], dtype=np.int64)

        # if the entire dialog is smaller than the max len
        if lengths.sum() < self.max_tokens_len:
            return dialog

        # offsets[i] is the number of tokens before turn i, so the turns
        # start..end (inclusive) hold offsets[end + 1] - offsets[start] tokens.
        # Counting the start turn itself is what keeps the fragment within the
        # budget; the earlier arithmetic left it out.
        offsets = np.concatenate(([0], lengths.cumsum()))
        total = offsets[-1]

        candidates = []
        for start in range(0, len(lengths), 2):
            # Only start where the rest of the dialog still reaches the budget.
            # The remainder shrinks as `start` grows, so we can stop at the
            # first failure. Turn 0 always passes here, so an over-budget
            # dialog is never left without a start because of this check.
            if total - offsets[start] < self.max_tokens_len:
                break
            end = self._last_fitting_end(offsets, start)
            if end is not None:
                candidates.append((start, end))

        if not candidates:
            raise ValueError(
                "no fragment starting on an even turn and ending on an odd turn "
                "fits in fewer than %d tokens" % self.max_tokens_len
            )

        # randomly choose one
        start_turn, end_turn = random.choice(candidates)

        return {
            "text": dialog['text'][start_turn:end_turn + 1],
            "token_ids": dialog['token_ids'][start_turn:end_turn + 1],
        }

    def _last_fitting_end(self, offsets, start):
        """Return the largest odd turn index after `start` that keeps the span under budget, or None."""
        last_turn = len(offsets) - 2
        last_odd = last_turn if last_turn % 2 == 1 else last_turn - 1
        for end in range(last_odd, start, -2):
            if offsets[end + 1] - offsets[start] < self.max_tokens_len:
                return end
        return None

In [15]:
# Sampler with the default token budget (max_len=1024).
sampler = DialogFragmentSampler()

In [21]:
# NOTE(review): `example` is not assigned in any cell visible here; it presumably
# survives from a deleted or out-of-order cell. Define it before this call so the
# notebook runs on a fresh kernel - TODO confirm which dialog was intended.
sampler(example)

{'text': ['A:Good evening, I would hope you are well. I would also like for you to donate to a great charity called Save the Children.\n\n\n',
  'B:I might be interested, please explain what Save the Children does?\n\n\n',
  'A:Save the Children program halp children in many different ways. One way is to help them to get nutritional foods to eat.\n\n\n',
  'B:Are you the leader/facilitator of this program?\n\n\n',
  "A:I am not but I truly think this organization is well worth donating to. They are committed to children in providing health and nutrition programs that save children's lives and ensure they grow up healthy.\n\n\n",
  'B:How did you hear about this program?\n\n\n',
  'A:I heard about this program through friends and family who have volunteered with this program. Have you ever volunteered with any organization?\n\n\n',
  'B:Yes I volunteered with a horse rescue ranch when I was in high school, we helped to save, retrain, and rehabilitate injured and neglected animals\n\n\n'

In [11]:
# Scratch check: print a sample utterance whose literal holds a backslash
# followed by an apostrophe; the printed text keeps that backslash.
print('A:I agree - certainly important for everyone, but often its the kids who can\\\'t speak for themselves that have the hardest time accessing the things they need. Between working with immediate disaster relief (like post hurricane Harvey) or longer term issues - like improving access to clean water in Cambodia, and they are on the "Wise Giving" award from Better Business Bureau - which measures how well funds get to the charitable work (versus expensive staff, etc)')

A:I agree - certainly important for everyone, but often its the kids who can\'t speak for themselves that have the hardest time accessing the things they need. Between working with immediate disaster relief (like post hurricane Harvey) or longer term issues - like improving access to clean water in Cambodia, and they are on the "Wise Giving" award from Better Business Bureau - which measures how well funds get to the charitable work (versus expensive staff, etc)


In [12]:
# Keep the sample utterance (backslash + apostrophe included) in `text` for inspection.
text = 'A:I agree - certainly important for everyone, but often its the kids who can\\\'t speak for themselves that have the hardest time accessing the things they need. Between working with immediate disaster relief (like post hurricane Harvey) or longer term issues - like improving access to clean water in Cambodia, and they are on the "Wise Giving" award from Better Business Bureau - which measures how well funds get to the charitable work (versus expensive staff, etc)'

In [16]:
"¡ª"

'¡ª'

In [17]:
# Sample utterance containing the two-character sequence '¡ª' (presumably a
# mis-decoded dash - TODO confirm). This assignment replaces the previous `text`.
text = "A:In the U.S. and around the world, Save the Children does whatever it takes ¡ª every day and in times of crisis ¡ª to give children a healthy start in life, the opportunity to learn and protection from harm.  When crisis strikes and children are most vulnerable, they are always among the first to respond and the last to leave. They ensure children's unique needs are met and their voices are heard."

In [18]:
# Echo the current value of `text` as the cell output.
text

"A:In the U.S. and around the world, Save the Children does whatever it takes ¡ª every day and in times of crisis ¡ª to give children a healthy start in life, the opportunity to learn and protection from harm.  When crisis strikes and children are most vulnerable, they are always among the first to respond and the last to leave. They ensure children's unique needs are met and their voices are heard."

"A:In the U.S. and around the world, Save the Children does whatever it takes  every day and in times of crisis  to give children a healthy start in life, the opportunity to learn and protection from harm.  When crisis strikes and children are most vulnerable, they are always among the first to respond and the last to leave. They ensure children's unique needs are met and their voices are heard."