# Wikipedia

In [1]:
import pandas as pd
import regex as re
import numpy as np
import os
import difflib

import utils_general
import utils_transformations

# Path to the tab-separated input file. It can be overridden through the
# WIKI_SPLIT_TSV environment variable so the notebook is not tied to a single
# machine's filesystem layout; the default is the original location.
f = os.environ.get("WIKI_SPLIT_TSV", "/data2/maria/wiki-split/test.tsv")

# Read without a header row, so columns are addressed by position (0, 1, ...).
df = pd.read_csv(f, sep="\t", header=None)

# Root output directory for every file this notebook writes.
WIKIPEDIA_DIR = os.path.join(".", "wikipedia")
# NOTE(review): presumably creates the directory, or empties it when it already
# exists -- confirm in utils_general before pointing this at a directory that
# holds anything else.
utils_general.create_and_or_clear_this_dir(WIKIPEDIA_DIR)

# Shared, explicitly seeded generator for the random transformations.
rng = np.random.default_rng(seed=0)

def _normalize_sentence(sentence: str) -> str:
    """Re-attach tokenised punctuation and quotes in ``sentence``.

    Applies a fixed, ordered series of substitutions and returns the cleaned,
    whitespace-collapsed and stripped string. The order matters: later patterns
    rely on the spacing produced by earlier ones.
    """
    # Shift punctuation: drop whitespace before each mark and leave exactly one
    # space after it.
    # NOTE(review): this also inserts a space inside tokens that contain one of
    # these marks, e.g. "3.5" becomes "3. 5".
    punctuation_spacing = (
        (r"\s*,\s*", ", "),
        (r"\s*\.\s*", ". "),
        (r"\s*\;\s*", "; "),
        (r"\s*\:\s*", ": "),
        (r"\s*\!\s*", "! "),
        (r"\s*\?\s*", "? "),
    )
    for pattern, replacement in punctuation_spacing:
        sentence = re.sub(pattern, replacement, sentence)

    # Special case: re-attach a split negation clitic ("do n't" -> "don't").
    sentence = sentence.replace(" n't", "n't")

    quote_and_spacing_fixes = (
        # Collapse doubled single quotes, with or without whitespace between.
        (r"''", "'"),
        (r"'\s*'", "'"),
        # Pull padded quotes and parentheses tight against their content.
        (r"' ([^']+?) '", r" '\1'"),
        # NOTE(review): the character class excludes "'" rather than ")";
        # confirm this is intended and not carried over from the quote pattern.
        (r"\( ([^']+?) \)", r"(\1)"),
        # Remove whitespace between a word character and a following quote.
        (r"\b\s'", "'"),
        # Restore ellipses that the full-stop spacing rule above split apart.
        (r"\.\s*\.\s*\.", "..."),
        # Collapse any run of whitespace into a single space.
        (r"\s+", " "),
    )
    for pattern, replacement in quote_and_spacing_fixes:
        sentence = re.sub(pattern, replacement, sentence)

    return sentence.strip()


def write_wikipedia_files_out(seed: int = 0) -> None:
    """Write each cleaned sentence and all of its transformed variants to disk.

    For every row of ``df`` the first column is cleaned and written to
    ``WIKIPEDIA_DIR/<i>.txt``. For every transformation name and every ``n`` in
    0..10 the transformed text is written to
    ``WIKIPEDIA_DIR/<transformation>/<n>_<i>.txt``.

    Combined transformations are chained: each step receives the output of the
    previous step, in the order listed in ``pipelines`` below.

    Parameters
    ----------
    seed : int, default 0
        Seed for the random generator handed to the transformation helpers. A
        fresh generator is created on every call, so repeated calls with the
        same seed start from the same random state.
    """
    # A per-call generator (instead of the shared module-level one) keeps a
    # re-run from continuing the random stream where the previous run stopped.
    rng = np.random.default_rng(seed=seed)

    repeats = utils_transformations.get_repeats_text
    interjections = utils_transformations.get_interjections_text
    false_starts = utils_transformations.get_false_starts_text

    # Transformation name -> ordered steps. Every step is called as
    # step(n, text, rng); n is passed through unchanged (presumably the amount
    # of disfluency to inject -- confirm in utils_transformations).
    pipelines = {
        "repeats": (repeats,),
        "interjections": (interjections,),
        "false-starts": (false_starts,),
        "repeats-and-false-starts": (repeats, false_starts),
        "repeats-and-interjections": (repeats, interjections),
        "interjections-and-false-starts": (interjections, false_starts),
        "all-3": (interjections, false_starts, repeats),
    }

    # Create every output directory once up front rather than once per sentence.
    transformation_dirs = {}
    for name in pipelines:
        transformation_dirs[name] = os.path.join(WIKIPEDIA_DIR, name)
        utils_general.just_create_this_dir(transformation_dirs[name])

    for i, raw_sentence in enumerate(df.iloc[:, 0]):
        sentence = _normalize_sentence(raw_sentence)
        utils_general.write_file(str(i) + ".txt", WIKIPEDIA_DIR, sentence)

        for name, steps in pipelines.items():
            for n in range(11):
                new_filename = str(n) + "_" + str(i) + ".txt"

                # Feed each step the previous step's output so that combined
                # transformations actually stack.
                transcript_text = sentence
                for step in steps:
                    transcript_text = step(n, transcript_text, rng)

                # Write the transformed text, not the clean sentence.
                utils_general.write_file(new_filename, transformation_dirs[name], transcript_text)
                
# Generate the cleaned sentence files and the per-transformation files on disk.
write_wikipedia_files_out()

In [2]:
# Preview the first 30 cleaned sentences, each followed by a blank line.
# The per-transformation files can be inspected the same way: they are stored as
# ./wikipedia/<transformation>/<n>_<index>.txt (for example "repeats",
# "interjections", "false-starts" or "all-3" with n = 3).
for sentence_idx in range(30):
    sentence_path = os.path.join(".", "wikipedia", f"{sentence_idx}.txt")
    print(utils_general.read_file(sentence_path), "\n")

'Bandolier - Budgie', a free iTunes app for iPad, iPhone and iPod touch, released in December 2011, tells the story of the making of Bandolier in the band's own words - including an extensive audio interview with Burke Shelley. 

'Eden Black' was grown from seed in the late 1980s by Stephen Morley, under his conditions it produces pitchers that are almost completley black. 

'Wilson should extend his stint on The Voice to renew public interest in the band; given that they're pulling out all the stops, they deserve all the acclaim that surrounded them for their first two albums. 

'New York Mining Disaster 1941' was the second EP released by the Bee Gees in 1967 on the Spin Records, like their first EP, it was released only in Australia. 

'ADAPTOGENS: Herbs for Strength, Stamina, and Stress Relief,' Healing Arts Press, 2007 - contains a detailed monograph on Schisandra chinensis as well as highlights health benefits. 

'Aerodynamic' is an instrumental song by Daft Punk that is particul

## Reproducibility Checks

Re-runs the file generation and checks that three arbitrarily chosen output files have identical contents before and after the re-run.

In [3]:
# Three arbitrarily chosen output files used as a spot check.
random_file_1_path = os.path.join(".", "wikipedia", "repeats-and-false-starts","8_1.txt")
random_file_2_path = os.path.join(".", "wikipedia", "interjections","0_3908.txt")
random_file_3_path = os.path.join(".", "wikipedia", "all-3","0_432.txt")
sampled_paths = (random_file_1_path, random_file_2_path, random_file_3_path)

# Snapshot the contents of the 3 files before the re-run.
random_file_1, random_file_2, random_file_3 = (
    utils_general.read_file(path) for path in sampled_paths
)

# Restart the shared generator from its seed so that a re-run which draws from
# `rng` begins in the same state as the first run; without this the generator
# would simply continue from wherever the previous run left it.
rng = np.random.default_rng(seed=0)

# Re-run the writing of files & transformations, then read the same files again.
write_wikipedia_files_out()
run2_random_file_1, run2_random_file_2, run2_random_file_3 = (
    utils_general.read_file(path) for path in sampled_paths
)

# Word-level diffs between the two runs; an empty list means identical content.
diffs = [
    list(difflib.unified_diff(before.split(" "), after.split(" ")))
    for before, after in zip(
        (random_file_1, random_file_2, random_file_3),
        (run2_random_file_1, run2_random_file_2, run2_random_file_3),
    )
]
for diff in diffs:
    print(diff)

# Fail loudly instead of relying on someone reading the printed lists.
assert not any(diffs), "re-running write_wikipedia_files_out() changed the output files"

[]
[]
[]
