In [1]:
from pathlib import Path
import pandas as pd
import json

In [3]:
# Task names; each one gets its own sub-directory of per-task output files.
tasks = ["NLU", "POL", "NLG"]

# Start/end separator tokens that remove_ss_token() strips from a text.
special_sep_tokens = [
    "[eoaa]", "[eoau]", "[eoda]", "[eodp]", "[eodu]",
    "[soaa]", "[soau]", "[soda]", "[sodp]", "[sodu]",
]

# NOTE(review): both directories are machine-specific absolute paths; adjust
# them before running this notebook on another machine.
# basedir: root under which each ablation cell creates its output directory.
basedir = Path('/home/jitianbo/Workspace/driver_simulator_kvret/data/data_for_clm_ablation/')
# csvdir: directory holding one "<mode>.csv" input file per split.
csvdir = Path('/home/jitianbo/Workspace/driver_simulator_kvret/data/data_with_dp/')

modes = ["train", "test", "dev"]

# Load every split once; the ablation cells look the frames up by mode name.
csv_data = {mode: pd.read_csv(csvdir / f"{mode}.csv") for mode in modes}



def remove_ss_token(txt):
    """Return ``txt`` without any token listed in ``special_sep_tokens``.

    The text is split on whitespace, tokens that exactly match an entry of
    the module-level ``special_sep_tokens`` list are dropped, and the
    remaining tokens are joined back together with single spaces.
    """
    kept = filter(lambda tok: tok not in special_sep_tokens, txt.split())
    return " ".join(kept)

def remove_dup_space(txt):
    """Collapse every run of whitespace in ``txt`` into a single space.

    Leading and trailing whitespace is removed as well, so an all-whitespace
    or empty input yields the empty string.
    """
    trimmed = txt.strip()
    return " ".join(trimmed.split())

# Ablation: remove history (去除history)

In [16]:
# Ablation "history": the dialogue history is left out of the model input,
# so every source line is "<profile> <input>".
savebasedir = basedir.joinpath("history")
# parents=True so the cell also works when basedir does not exist yet.
savebasedir.mkdir(parents=True, exist_ok=True)

sp_token_fpath = savebasedir.joinpath("additional_special_tokens.json")

# Token list written next to the data files as additional_special_tokens.json.
sp_tokens_to_add = {
    "additional_special_tokens": [
        "[sodp]", "[eodp]", # driver profile
        "[soau]", "[eoau]", # assistant utterance
        "[soaa]", "[eoaa]", # assistant action
        "[soda]", "[eoda]", # driver action
        "[sodu]", "[eodu]", # driver utterance
        "[address]", "[agenda]", "[date]", "[distance]", "[event]", "[friday]",
        "[greeting]", "[location]", "[monday]", "[party]", "[poi]", "[poi_type]",
        "[room]", "[saturday]", "[sunday]", "[thursday]", "[time]", "[today]", "[traffic_info]",
        "[tuesday]", "[weather_attribute]", "[wednesday]",
    ]
}

with sp_token_fpath.open('w') as f:
    json.dump(sp_tokens_to_add,f,indent=2)


for mode in modes:
    # Lines for the combined files of this split ...
    txt_lines = []
    sour_lines = []
    targ_lines = []
    # ... and the same three kinds of lines bucketed per task.
    task_txt_lines = {e:[] for e in tasks}
    task_sour_lines = {e:[] for e in tasks}
    task_targ_lines = {e:[] for e in tasks}
    pd_data = csv_data[mode]
    for idx,row in pd_data.iterrows():
        cur_task = row['task']
        # Rows labelled DST are filed under the POL task.
        if cur_task == 'DST':
            cur_task = 'POL'
        # NLU rows of the first turn (turn_id == 0) are skipped.
        if cur_task == 'NLU' and row['turn_id'] == 0:
            continue
        input_ = row['input']
        profile = row['profile']
        target = row['target']

        # The un-ablated source would be f"{profile} {history} {input_}";
        # this ablation omits the history part.
        sour_line = f"{profile} {input_}"
        targ_line = f"{target}"
        txt_line = f"{sour_line} {targ_line}"

        txt_line = remove_dup_space(txt_line)
        sour_line = remove_dup_space(sour_line)
        targ_line = remove_dup_space(targ_line)
        # Dropping empty samples is currently disabled:
        # if sour_line == '[soda] [eoda]':
        #     continue

        txt_lines.append(txt_line)
        sour_lines.append(sour_line)
        targ_lines.append(targ_line)

        task_txt_lines[cur_task].append(txt_line)
        task_sour_lines[cur_task].append(sour_line)
        task_targ_lines[cur_task].append(targ_line)

    # Combined files for the split: <mode>.txt / <mode>.source / <mode>.target
    txt_path = savebasedir.joinpath(f"{mode}.txt")
    with txt_path.open('w') as f:
        f.writelines( [f"{e}\n" for e in txt_lines])

    sour_path = savebasedir.joinpath(f"{mode}.source")
    with sour_path.open('w') as f:
        f.writelines( [f"{e}\n" for e in sour_lines])

    targ_path = savebasedir.joinpath(f"{mode}.target")
    with targ_path.open('w') as f:
        f.writelines( [f"{e}\n" for e in targ_lines])

    # Per-task files: <task>/<mode>-<task>.{txt,source,target}
    for task in tasks:
        task_savebasedir = savebasedir.joinpath(task)
        task_savebasedir.mkdir(exist_ok=True)

        cur_task_txt_lines = task_txt_lines[task]
        cur_task_sour_lines = task_sour_lines[task]
        # Fixed: this used to read task_sour_lines, which wrote the source
        # lines into every per-task .target file.
        cur_task_targ_lines = task_targ_lines[task]

        task_txt_path = task_savebasedir.joinpath(f"{mode}-{task}.txt")
        with task_txt_path.open('w') as f:
            f.writelines( [f"{e}\n" for e in cur_task_txt_lines])

        task_sour_path = task_savebasedir.joinpath(f"{mode}-{task}.source")
        with task_sour_path.open('w') as f:
            f.writelines( [f"{e}\n" for e in cur_task_sour_lines])

        task_targ_path = task_savebasedir.joinpath(f"{mode}-{task}.target")
        with task_targ_path.open('w') as f:
            f.writelines( [f"{e}\n" for e in cur_task_targ_lines])

# Ablation: remove profile (去除profile)

In [17]:
# Ablation "profile": the driver profile is left out of the model input,
# so every source line is "<history> <input>".
savebasedir = basedir.joinpath("profile")
# parents=True so the cell also works when basedir does not exist yet.
savebasedir.mkdir(parents=True, exist_ok=True)

sp_token_fpath = savebasedir.joinpath("additional_special_tokens.json")

# Token list written next to the data files as additional_special_tokens.json.
# NOTE(review): the original comment here said "no driver profile", yet
# [sodp]/[eodp] are still in the list. The list is kept unchanged; confirm
# whether those two tokens should be dropped for this ablation.
sp_tokens_to_add = {
    "additional_special_tokens": [
        "[sodp]", "[eodp]", # driver profile
        "[soau]", "[eoau]", # assistant utterance
        "[soaa]", "[eoaa]", # assistant action
        "[soda]", "[eoda]", # driver action
        "[sodu]", "[eodu]", # driver utterance
        "[address]", "[agenda]", "[date]", "[distance]", "[event]", "[friday]",
        "[greeting]", "[location]", "[monday]", "[party]", "[poi]", "[poi_type]",
        "[room]", "[saturday]", "[sunday]", "[thursday]", "[time]", "[today]", "[traffic_info]",
        "[tuesday]", "[weather_attribute]", "[wednesday]",
    ]
}

with sp_token_fpath.open('w') as f:
    json.dump(sp_tokens_to_add,f,indent=2)

for mode in modes:
    # Lines for the combined files of this split ...
    txt_lines = []
    sour_lines = []
    targ_lines = []
    # ... and the same three kinds of lines bucketed per task.
    task_txt_lines = {e:[] for e in tasks}
    task_sour_lines = {e:[] for e in tasks}
    task_targ_lines = {e:[] for e in tasks}
    pd_data = csv_data[mode]
    for idx,row in pd_data.iterrows():
        cur_task = row['task']
        # Rows labelled DST are filed under the POL task.
        if cur_task == 'DST':
            cur_task = 'POL'
        # NLU rows of the first turn (turn_id == 0) are skipped.
        if cur_task == 'NLU' and row['turn_id'] == 0:
            continue
        history = row['history']
        # A missing history cell is read as NaN; treat it as empty text.
        if pd.isna(history):
            history = ""
        input_ = row['input']
        target = row['target']

        # The un-ablated source would be f"{profile} {history} {input_}";
        # this ablation omits the profile part.
        sour_line = f"{history} {input_}"
        targ_line = f"{target}"
        txt_line = f"{sour_line} {targ_line}"

        txt_line = remove_dup_space(txt_line)
        sour_line = remove_dup_space(sour_line)
        targ_line = remove_dup_space(targ_line)

        # Dropping empty samples is currently disabled:
        # if sour_line == '[soau] [eoau] [soaa] [eoaa]' and mode == "train":
        # if sour_line == '[soau] [eoau] [soaa] [eoaa]':
            # continue

        txt_lines.append(txt_line)
        sour_lines.append(sour_line)
        targ_lines.append(targ_line)

        task_txt_lines[cur_task].append(txt_line)
        task_sour_lines[cur_task].append(sour_line)
        task_targ_lines[cur_task].append(targ_line)

    # Combined files for the split: <mode>.txt / <mode>.source / <mode>.target
    txt_path = savebasedir.joinpath(f"{mode}.txt")
    with txt_path.open('w') as f:
        f.writelines( [f"{e}\n" for e in txt_lines])

    sour_path = savebasedir.joinpath(f"{mode}.source")
    with sour_path.open('w') as f:
        f.writelines( [f"{e}\n" for e in sour_lines])

    targ_path = savebasedir.joinpath(f"{mode}.target")
    with targ_path.open('w') as f:
        f.writelines( [f"{e}\n" for e in targ_lines])

    # Per-task files: <task>/<mode>-<task>.{txt,source,target}
    for task in tasks:
        task_savebasedir = savebasedir.joinpath(task)
        task_savebasedir.mkdir(exist_ok=True)

        cur_task_txt_lines = task_txt_lines[task]
        cur_task_sour_lines = task_sour_lines[task]
        # Fixed: this used to read task_sour_lines, which wrote the source
        # lines into every per-task .target file.
        cur_task_targ_lines = task_targ_lines[task]

        task_txt_path = task_savebasedir.joinpath(f"{mode}-{task}.txt")
        with task_txt_path.open('w') as f:
            f.writelines( [f"{e}\n" for e in cur_task_txt_lines])

        task_sour_path = task_savebasedir.joinpath(f"{mode}-{task}.source")
        with task_sour_path.open('w') as f:
            f.writelines( [f"{e}\n" for e in cur_task_sour_lines])

        task_targ_path = task_savebasedir.joinpath(f"{mode}-{task}.target")
        with task_targ_path.open('w') as f:
            f.writelines( [f"{e}\n" for e in cur_task_targ_lines])

# Ablation: remove both history and profile (去除history和profile)

In [18]:
# Ablation "history-profile": both the dialogue history and the driver profile
# are left out of the model input, so every source line is just "<input>".
savebasedir = basedir.joinpath("history-profile")
# savebasedir = basedir.joinpath("history-profile-2")
# parents=True so the cell also works when basedir does not exist yet.
savebasedir.mkdir(parents=True, exist_ok=True)

sp_token_fpath = savebasedir.joinpath("additional_special_tokens.json")

# Token list written next to the data files as additional_special_tokens.json.
# NOTE(review): the original comment here said "no driver profile", yet
# [sodp]/[eodp] are still in the list. The list is kept unchanged; confirm
# whether those two tokens should be dropped for this ablation.
sp_tokens_to_add = {
    "additional_special_tokens": [
        "[sodp]", "[eodp]", # driver profile
        "[soau]", "[eoau]", # assistant utterance
        "[soaa]", "[eoaa]", # assistant action
        "[soda]", "[eoda]", # driver action
        "[sodu]", "[eodu]", # driver utterance
        "[address]", "[agenda]", "[date]", "[distance]", "[event]", "[friday]",
        "[greeting]", "[location]", "[monday]", "[party]", "[poi]", "[poi_type]",
        "[room]", "[saturday]", "[sunday]", "[thursday]", "[time]", "[today]", "[traffic_info]",
        "[tuesday]", "[weather_attribute]", "[wednesday]",
    ]
}

with sp_token_fpath.open('w') as f:
    json.dump(sp_tokens_to_add,f,indent=2)


for mode in modes:
    # Lines for the combined files of this split ...
    txt_lines = []
    sour_lines = []
    targ_lines = []
    # ... and the same three kinds of lines bucketed per task.
    task_txt_lines = {e:[] for e in tasks}
    task_sour_lines = {e:[] for e in tasks}
    task_targ_lines = {e:[] for e in tasks}
    pd_data = csv_data[mode]
    for idx,row in pd_data.iterrows():
        cur_task = row['task']
        # Rows labelled DST are filed under the POL task.
        if cur_task == 'DST':
            cur_task = 'POL'
        # NLU rows of the first turn (turn_id == 0) are skipped.
        if cur_task == 'NLU' and row['turn_id'] == 0:
            continue
        input_ = row['input']
        target = row['target']

        # The un-ablated source would be f"{profile} {history} {input_}";
        # this ablation keeps only the input part.
        sour_line = f"{input_}"
        targ_line = f"{target}"
        txt_line = f"{sour_line} {targ_line}"

        txt_line = remove_dup_space(txt_line)
        sour_line = remove_dup_space(sour_line)
        targ_line = remove_dup_space(targ_line)

        # Dropping empty samples is currently disabled:
        # if sour_line == '[soau] [eoau] [soaa] [eoaa]' and mode == "train":
        # if sour_line == '[soau] [eoau] [soaa] [eoaa]':
        #     continue

        txt_lines.append(txt_line)
        sour_lines.append(sour_line)
        targ_lines.append(targ_line)

        task_txt_lines[cur_task].append(txt_line)
        task_sour_lines[cur_task].append(sour_line)
        task_targ_lines[cur_task].append(targ_line)

    # Combined files for the split: <mode>.txt / <mode>.source / <mode>.target
    txt_path = savebasedir.joinpath(f"{mode}.txt")
    with txt_path.open('w') as f:
        f.writelines( [f"{e}\n" for e in txt_lines])

    sour_path = savebasedir.joinpath(f"{mode}.source")
    with sour_path.open('w') as f:
        f.writelines( [f"{e}\n" for e in sour_lines])

    targ_path = savebasedir.joinpath(f"{mode}.target")
    with targ_path.open('w') as f:
        f.writelines( [f"{e}\n" for e in targ_lines])

    # Per-task files: <task>/<mode>-<task>.{txt,source,target}
    for task in tasks:
        task_savebasedir = savebasedir.joinpath(task)
        task_savebasedir.mkdir(exist_ok=True)

        cur_task_txt_lines = task_txt_lines[task]
        cur_task_sour_lines = task_sour_lines[task]
        # Fixed: this used to read task_sour_lines, which wrote the source
        # lines into every per-task .target file.
        cur_task_targ_lines = task_targ_lines[task]

        task_txt_path = task_savebasedir.joinpath(f"{mode}-{task}.txt")
        with task_txt_path.open('w') as f:
            f.writelines( [f"{e}\n" for e in cur_task_txt_lines])

        task_sour_path = task_savebasedir.joinpath(f"{mode}-{task}.source")
        with task_sour_path.open('w') as f:
            f.writelines( [f"{e}\n" for e in cur_task_sour_lines])

        task_targ_path = task_savebasedir.joinpath(f"{mode}-{task}.target")
        with task_targ_path.open('w') as f:
            f.writelines( [f"{e}\n" for e in cur_task_targ_lines])