# Swb

In [1]:
import os
import re
import numpy as np
import difflib

import tb

import utils_general
import utils_trees
import utils_swb

print(tb.PTB_base_dir)

/home/grads/m/mariateleki/disfluency/treebank_3


# Train, Dev, Test Splits

In [2]:
mrg_path = os.path.join(tb.PTB_base_dir, "parsed", "mrg", "swbd")

# get train_files
train_files_2 = [os.path.join(mrg_path, "2", file) for file in os.listdir(os.path.join(mrg_path, "2")) if "sw2" in file]
train_files_3 = [os.path.join(mrg_path, "3", file) for file in os.listdir(os.path.join(mrg_path, "3")) if "sw3" in file]
train_files = train_files_2 + train_files_3

# get dev_files
dev_files = [os.path.join(mrg_path, "4", file) for file in os.listdir(os.path.join(mrg_path, "4")) if "sw40" in file or "sw41" in file]

# get test_files
test_files = [os.path.join(mrg_path, "4", file) for file in os.listdir(os.path.join(mrg_path, "4")) if not ("sw40" in file or "sw41" in file)]

# test that the number of files is correct
print("-----TESTS-----\n")
print("total number of swb files:", len(train_files + dev_files + test_files))
print("number of files in 4 dir:", len([file for file in os.listdir(os.path.join(mrg_path, "4"))]))
print("number of dev files:", len(dev_files))
print("number of test files:", len(test_files), "\n")
print("preview of train_files:", train_files[0:5], "\n")
print("preview of dev_files:", dev_files[0:5], "\n")
print("preview of test_files:", test_files[0:5], "\n")

-----TESTS-----

total number of swb files: 650
number of files in 4 dir: 154
number of dev files: 63
number of test files: 91 

preview of train_files: ['/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/2/sw2717.mrg', '/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/2/sw2877.mrg', '/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/2/sw2383.mrg', '/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/2/sw2111.mrg', '/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/2/sw2640.mrg'] 

preview of dev_files: ['/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/4/sw4099.mrg', '/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/4/sw4013.mrg', '/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/4/sw4108.mrg', '/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/4/sw4154.mrg', '/home/grads/m/mariateleki/disfluency/treebank_3/parsed/mrg/swbd/4/sw4026.mrg'] 

preview of test_fil

In [3]:
# copies over the trees and the trees with tags
tagged_train_files, tagged_dev_files, tagged_test_files = utils_swb.get_trees_and_trees_tagged_files(train_files, dev_files, test_files)

# Create Fluent & Disfluent Versions

In [4]:
print("\n-----print a single file with its fluent and disfluent version -----\n")

EXAMPLE_FILE = "./swb/test/trees-tagged/sw4363.mrg" # "./train/trees-tagged/sw2005.mrg" "./train/trees-tagged/sw2008.mrg" "test/trees-tagged/sw4311.mrg"

disfluent_text = utils_trees.get_clean_transcript_from_tree_file(filepath=EXAMPLE_FILE, get_disfluent=True)
fluent_text = utils_trees.get_clean_transcript_from_tree_file(filepath=EXAMPLE_FILE, get_disfluent=False)

# uncomment this line to print the original tree file
# print(utils_general.read_file(EXAMPLE_FILE), "\n\n")

print("DISFLUENT <SEP> COUNT:", disfluent_text.count("<SEP>"))
print("FLUENT <SEP> COUNT:", fluent_text.count("<SEP>"), "\n\n")

print("DISFLUENT:", disfluent_text, "\n\n")
print("FLUENT:", fluent_text, "\n\n")


-----print a single file with its fluent and disfluent version -----

DISFLUENT <SEP> COUNT: 31
FLUENT <SEP> COUNT: 31 


DISFLUENT: Wh- what's your uh. Uh. Are you into the Cowboys? <SEP> Oh God no. Uh no. I'm not. I haven't been. I grew up in Dallas but I'm still not a Cowboys' man. I like Philadelphia Eagles. Oh yeah? <SEP> Yeah. Well that that's all right. That's my favorite team. So. <SEP> Uh now that Randall's coming back next season I hope they'll be do a little bit better. They didn't do too bad last season. Yeah. But I hope they can do better. How about yourself? Well I don't know. I kind of go back and forth uh depending on who's really hot and who's not. I guess I'm kind of a fair weather fan in a lot of respects. <SEP> Uh-huh. But uh I guess if I have uh my druthers I'd probably go for the Seattle Seahawks. Oh tha-. I kind of like them. <SEP> Yeah. That's another one of my favorites. Yeah. They're kind of. They're not they're not my favorite totally. But I do like them. I 

In [5]:
utils_swb.get_fluent_and_disfluent_files(tagged_train_files, tagged_dev_files, tagged_test_files)

# Reproducibility Check
Makes sure that the output of these 3 files (selected randomly) is the same.

In [6]:
random_file_1_path = os.path.join(".", "swb/test/fluent/sw4312.txt")
random_file_2_path = os.path.join(".", "swb/test/disfluent/sw4785.txt")
random_file_3_path = os.path.join(".", "swb/dev/trees-tagged/sw4137.mrg") 

# reads the output of 3 files
random_file_1 = utils_general.read_file(random_file_1_path)
random_file_2 = utils_general.read_file(random_file_2_path)
random_file_3 = utils_general.read_file(random_file_3_path)

# re-runs the writing files out & transformations
train_files, dev_files, test_files = utils_swb.get_trees_and_trees_tagged_files(train_files, dev_files, test_files)
utils_swb.get_fluent_and_disfluent_files(train_files, dev_files, test_files)

run2_random_file_1 = utils_general.read_file(random_file_1_path)
run2_random_file_2 = utils_general.read_file(random_file_2_path)
run2_random_file_3 = utils_general.read_file(random_file_3_path)

# ensures that the new files are the same as the old files
print(list(difflib.unified_diff(random_file_1.split(" "), run2_random_file_1.split(" "))))
print(list(difflib.unified_diff(random_file_2.split(" "), run2_random_file_2.split(" "))))
print(list(difflib.unified_diff(random_file_3.split(" "), run2_random_file_3.split(" "))))

[]
[]
[]


# Write `./model_files`

In [7]:
utils_general.create_and_or_clear_this_dir(os.path.join(".", "swb", "model_files"))

fluency_dict = {"fluent": False, "disfluent": True}

for split in ["train", "dev", "test"]:
    
    input_dir = os.path.join(".", "swb", split)
    output_dir = os.path.join(".", "swb", "model_files", split)
    
    utils_general.just_create_this_dir(input_dir)
    utils_general.create_and_or_clear_this_dir(output_dir)
    
    for fluency, fluency_bool in fluency_dict.items():
        # write it out to the csv
        current_input_dir = os.path.join(input_dir, fluency)
        current_output_path = os.path.join(output_dir, fluency + ".txt")
        for file in [os.path.join(current_input_dir,f) for f in os.listdir(os.path.join(current_input_dir)) if (f.endswith(".txt") and not f.endswith("-checkpoint.txt"))]:
            new_file_name = file.split("/")[-1].replace(".mrg",".txt")
            text = ""
            text = utils_general.read_file(os.path.join(".", "swb", split, fluency, new_file_name))
            with open(current_output_path, mode="a") as f:
                f.write(new_file_name + ":" + text + "\n")

# Tests

In [8]:
print("\n----- <SEP> alignment tests to ensure there are the same number of <SEP>s in each file-----\n")

for split in ["train", "dev", "test"]:  # "train", "dev", "test"
    print(split)
    fluent_text = utils_general.read_file(os.path.join(".", "swb", "model_files", split, "fluent.txt"))
    disfluent_text = utils_general.read_file(os.path.join(".", "swb", "model_files", split, "disfluent.txt"))
    
    print(fluent_text.count("<SEP>"))
    print(disfluent_text.count("<SEP>"))
    print()

    
fluent_text = utils_general.read_file(os.path.join(".", "swb", "model_files", "train", "fluent.txt"))
disfluent_text = utils_general.read_file(os.path.join(".", "swb", "model_files", "train", "disfluent.txt"))

n_files_different = 0
for line_fluent, line_disfluent in zip(fluent_text.split("\n"), disfluent_text.split("\n")):
    if line_fluent.count("<SEP>") != line_disfluent.count("<SEP>"):
        n_files_different += 1
        print(line_fluent.count("<SEP>"), ":", line_fluent, "\n\n", line_disfluent.count("<SEP>"), ":", line_disfluent)
        for diff in difflib.unified_diff(line_fluent.split("."), line_disfluent.split(".")):
            print(diff)
print("n_files_different = ", n_files_different)


----- <SEP> alignment tests to ensure there are the same number of <SEP>s in each file-----

train
13898
13898

dev
1184
1184

test
1496
1496

n_files_different =  0
