In [132]:
import os
import json
import csv

In [133]:
DATA_PATH = os.path.join(os.path.curdir, "data")
SAVE_PATH = os.path.join(os.path.curdir, "data", "training_formatted_data")

os.makedirs(SAVE_PATH, exist_ok=True)

In [134]:
def find_text_files_in_dir(path):
    text_files = []
    sub_directories = []
    for item_name in os.listdir(DATA_PATH):
        item_path = os.path.join(DATA_PATH, item_name)
        if item_name.endswith(".txt") and os.path.isfile(item_path):
            text_files.append(item_path)
        elif os.path.isdir(item_path):
            sub_directories.append(item_path)

    for sub_directory in sub_directories:
        for item_name in os.listdir(sub_directory):
            item_path = os.path.join(sub_directory, item_name)
            if item_name.endswith(".txt") and os.path.isfile(item_path):
                text_files.append(item_path)

    return text_files

In [135]:
text_file_paths = find_text_files_in_dir(DATA_PATH)

In [136]:
print(text_file_paths[0])
print(len(text_file_paths))

./data/saints_chapters/7_Keep_Up_Good_Courage.txt
176


In [137]:
formatted_data = []

In [138]:
for text_file_path in text_file_paths:
    with open(text_file_path, "r") as text_file:
        text = text_file.read()
        formatted_data.append({"text": text})

print(len(formatted_data))

176


In [139]:
# FORMAT SCRAPED DATA
DC_DATA = os.path.join(DATA_PATH, "dc", "scrape_dc.json")
TG_DATA = os.path.join(DATA_PATH, "topical_guide", "scrape_tg.json")
CONF_DATA = os.path.join(DATA_PATH, "conference_data", "conference_talks.json")

def format_json_entry(entry):
    cleaned_text = ""

    for key, value in entry.items():
        cleaned_text += value + "\n"

    return cleaned_text

with open(DC_DATA, "r") as dc_file:
    dc_data = json.load(dc_file)
    for entry in dc_data.values():
        del entry["description"]
        formatted_data.append({"text": format_json_entry(entry)})

with open(CONF_DATA, "r") as conf_file:
    conf_data = json.load(conf_file)
    for entry in conf_data.values():
        formatted_data.append({"text": format_json_entry(entry)})

print(len(formatted_data))

4051


In [140]:
TRAIN_SET_PATH = os.path.join(SAVE_PATH, "train.json")
VALIDATION_SET_PATH = os.path.join(SAVE_PATH, "validation.json")

In [141]:
# Randomly shuffle the data
import random

random.shuffle(formatted_data)

In [142]:
# Split the data into train and validation sets
train_validation_split_index = int(len(formatted_data) * 0.95)
train_set = formatted_data[: train_validation_split_index]
print(len(train_set))
validation_set = formatted_data[train_validation_split_index:]
print(len(validation_set))

3848
203


In [143]:
print(train_set[0])
print(validation_set[0])

{'text': 'When Nicodemus came to Jesus early in the Savior’s ministry, he spoke for all of us when he said, “Rabbi, we know that thou art a teacher come from God.”1\nChrist was, of course, much more than a teacher. He was the very Son of God, the Holy One of the eternal gospel plan, the Savior and Redeemer of the world.\nBut Nicodemus was starting about the way you and I started, the way any child or young student or new convert begins—by recognizing and responding to a thrilling teacher who touches the innermost feelings of our heart.\nIn recent months President Gordon B. Hinckley has called on us to hold our people close to the Church, especially the newly converted member. In issuing this call President Hinckley has reminded that we all need at least three things to remain firmly in the faith—a friend, a responsibility, and “[nourishing] by the good word of God.”2\nInspired instruction in the home and in the Church helps provide this crucial element of nourishing by the good word of

In [144]:
with open(os.path.join(TRAIN_SET_PATH), "w+") as train_file:
    json.dump(train_set, train_file)

In [145]:
with open(os.path.join(VALIDATION_SET_PATH), "w+") as validation_file:
    json.dump(validation_set, validation_file)