### Split the json file to Train and Test

In [3]:
import json
from collections import defaultdict
import os

### Extraction Text

In [27]:
# split data into train and test
def read_in_data(file_name):
    test_data = None
    train = defaultdict()
    with open(file_name, 'rb') as f:
        test_data = json.load(f)
    # with a minimum of 5 examples
    percent_5 = max(int(len(test_data['Examples'][0]['Input']) * 0.05),5)
    
    if len(test_data['Examples'][0]['Input']) < 5:
        if len(test_data['Examples'][0]['Output']) > 5:
            inputstr = test_data['Examples'][0]['Input'][0]['Value']
            ending_idx = test_data['Examples'][0]['Output'][5]['End']
            
            train['InputTable'] = test_data['Examples'][0]['Input'][0]
            train['InputTable']['Value'] = inputstr[:ending_idx+10]

            train['OutputTable'] = test_data['Examples'][0]['Output'][0: 5]
            
            train['TestingTable'] = test_data['Examples'][0]['Input'][0]
            train['TestingTable']['Value'] = inputstr[ending_idx :]
            train['TestingTable']['Start'] = ending_idx
            
            train['TestAnswer'] = test_data['Examples'][0]['Output'][5:]

    else:
        train['InputTable'] = test_data['Examples'][0]['Input'][0: percent_5]
        train['OutputTable'] = test_data['Examples'][0]['Output'][0: percent_5]
        train['TestingTable'] = test_data['Examples'][0]['Input'][percent_5:]
        train['TestAnswer'] = test_data['Examples'][0]['Output'][percent_5:]

    return train

In [28]:

folder_path = "prose/Extraction.Text"

for sub_folder in os.listdir(folder_path):
    # Iterate through all files in the folder
    if sub_folder != '.DS_Store':

        sub_folder_path = os.path.join(folder_path, sub_folder)
        for filename in os.listdir(sub_folder_path):
            # Join the folder path with the filename to get the full file path
            if filename not in ['Split','output.json','input.txt','meta.json']:
                file_path = os.path.join(sub_folder_path, filename)
                print(file_path)
                split_data = read_in_data(file_path)
                
                # create path for output file
                new_folder_path = sub_folder_path+ '/Split'
                if not os.path.exists(new_folder_path):
                    os.makedirs(new_folder_path)
                output_file_name = os.path.join(new_folder_path, filename)

                # write the output file
                with open(output_file_name, "w") as file: 
                    json.dump(split_data, file)



prose/Extraction.Text/Log.000003/DateStruct.Date.spec.json
75
prose/Extraction.Text/Log.000003/root.DateStruct.spec.json
38999
prose/Extraction.Text/Log.000003/root.Date.spec.json
38999
prose/Extraction.Text/Log.000002/root.Definition.spec.json
509
prose/Extraction.Text/Log.000002/State.StateFlag.spec.json
23
prose/Extraction.Text/Log.000002/Type.TypeFlag.spec.json
20
prose/Extraction.Text/Log.000002/root.BaseAddress.spec.json
509
prose/Extraction.Text/Log.000002/root.StateFlag.spec.json
509
prose/Extraction.Text/Log.000002/Protect.ProtectDefinition.spec.json
21
prose/Extraction.Text/Log.000002/BaseAddressStruct.RegionSize.spec.json
157
prose/Extraction.Text/Log.000002/root.State.spec.json
509
prose/Extraction.Text/Log.000002/BaseAddressStruct.StateFlag.spec.json
157
prose/Extraction.Text/Log.000002/BaseAddressStruct.StateDefinition.spec.json
157
prose/Extraction.Text/Log.000002/root.StateDefinition.spec.json
509
prose/Extraction.Text/Log.000002/root.ProtectFlag.spec.json
509
prose/Ext

In [25]:
f = 'prose/Extraction.Text/Log.000003/root.DateStruct.spec.json'
read_in_data(f)

38999
477
{'Start': 467, 'End': 38999, 'Value': '\n2\t2024-10-04 10:31:40.373857\t3918\t6054\t0\t114ac440-4dd8-3fde-d584-66a9ee4a57c3\n2\t2024-10-04 06:30:06.768312\t3918\t6593\t0\tc119a9dd-69a1-f271-f050-9d312cdc3212\n2\t2024-10-04 09:36:59.525243\t3918\t6471\t0\t28c5b719-c5bb-8d6e-d9c3-9f082f76e0a4\n2\t2024-10-04 02:15:55.400608\t3918\t7389\t0\t2866864d-b686-5eca-3d87-b9120ea39ace\n2\t2024-10-04 12:24:30.149072\t3918\t5567\t0\teafbaa0a-e51c-12ec-6d31-3a7b720771ff\n2\t2024-10-04 08:58:23.827869\t3918\t9316\t0\tac551b96-0252-edab-1e0d-8758a4451683\n2\t2024-10-04 09:38:09.220490\t3918\t5236\t0\t4a3829f2-a50c-bd4c-ab7b-b19d2f3fd90a\n2\t2024-10-04 06:36:28.412339\t3918\t3022\t0\t47605ce8-0010-6633-1bb8-6c26c515afbc\n2\t2024-10-04 05:06:00.912619\t3918\t1859\t0\t1c1278e6-8a8c-fa71-ae6f-520623e86434\n2\t2024-10-04 08:11:37.712176\t3918\t1711\t0\tbc749bc4-4783-9118-a709-cc33336d3e72\n2\t2024-10-04 11:33:37.203626\t3918\t3871\t0\t14e8cb28-d790-da45-60fd-375647e61b11\n2\t2024-10-04 12:21:38.97

### Transformation Text

In [1]:
def read_in_data(file_name,sub_folder):
    test_data = None
    train = defaultdict()
    input_list = []
    output_list = []
    with open(file_name, 'rb') as f:
        test_data = json.load(f)
    
    
    for d in test_data['Examples']:
        if d['Input'][0]:
            input_list.append([d['Input'][0].replace(",", "-")])
        else:
            input_list.append([d['Input'][0]])
        output_list.append([d['Output']])
    
    percent_5 = max(int(len(input_list) * 0.05),5)
    train["InputTable"] = input_list[0:percent_5]
    train["NumSamples"] = len(train['InputTable'])
    train["TestName"] = sub_folder
    train["TestingTable"] = input_list
    train["OutputTable"] = output_list[0:percent_5]
    train["TestAnswer"] = output_list
    
    return train

In [2]:

folder_path = "prose/Transformation.Text"

for sub_folder in os.listdir(folder_path):
    # Iterate through all files in the folder
    if sub_folder != '.DS_Store':

        sub_folder_path = os.path.join(folder_path, sub_folder)
        for filename in os.listdir(sub_folder_path):
            # Join the folder path with the filename to get the full file path
            if filename not in ['Split','meta.json']:
                file_path = os.path.join(sub_folder_path, filename)
                output_path = "foofah/Transformation.Text/" + sub_folder + ".txt"
                print(sub_folder)
                split_data = read_in_data(file_path,sub_folder)
                

                # write the output file
                with open(output_path, "w", encoding='utf-8') as file: 
                    json.dump(split_data, file, ensure_ascii=False)



NameError: name 'os' is not defined

In [60]:
# .encode("utf-8")
file_name = "prose/Transformation.Text/Address.000002/spec.json"
train = read_in_data(file_name)

In [61]:
print(type(train['InputTable'][0][0]))

<class 'str'>


In [62]:
print(train['InputTable'][0][0])

Aysu Fatma Ahmed 492 24th Place NW-Edison-AK-(896) 388-9065-000-93-6876-38891


In [63]:
file_name = "foofah/foofah/exp0_2_1.txt"
with open(file_name, 'rb') as f:
        test_data = json.load(f)
print(type(test_data['InputTable'][0][0]))

<class 'str'>


In [69]:
file_name = "foofah/Transformation.Text/Address.000002.txt"
with open(file_name, 'rb') as f:
        trans = json.load(f)
print(type(trans['InputTable'][0][0]))

<class 'str'>


In [42]:
raw_data = [map(str, x) for x in test_data['InputTable']]
print(raw_data)

[<map object at 0x109e48400>, <map object at 0x109e49b70>]


In [45]:
raw_data = [map(str, x) for x in train['InputTable']]
print(raw_data)

[<map object at 0x109e49ff0>, <map object at 0x109e49d80>, <map object at 0x109e4b910>, <map object at 0x109e49d50>, <map object at 0x109e49450>]


In [48]:
raw_data = [map(str, x) for x in trans['InputTable']]
print(raw_data)

[<map object at 0x109e4bdf0>, <map object at 0x109e49cf0>, <map object at 0x109e494e0>, <map object at 0x109e4b700>, <map object at 0x109e49720>]


In [19]:
# split data into train and test
def split_data(file_name):
    test_data = None
    train = defaultdict()
    test = defaultdict()
    with open(file_name, 'rb') as f:
        test_data = json.load(f)
    train['InputTable'] = test_data['Input'][:5]
    train['OutputTable'] = test_data['Output'][:5]
    test['TestingTable'] = test_data['Input'][:5]
    test['TestAnswer'] = test_data['Output'][:5]

    
    with open(file_name[:-5] + '_train.json', "w") as file:   
            json.dump(train, file)
    with open(file_name[:-5] + '_test.json', "w") as file:   
            json.dump(test, file)

In [20]:
split_data('executive/executive_exp05.json')

In [99]:
path = 'foofah/Transformation.Text/Address.000002.txt'
with open(path, 'rb') as f:
    data = json.load(f)
o = transform_dataset(data['TestingTable'])
o

[['492 24th Place NW'],
 ['967 03th Place SE'],
 ['880 81th Place SE'],
 ['256 32th Place NE'],
 ['526 03th Place NE']]

In [95]:
data['TestingTable']

[['Aysu Fatma Ahmed 492 24th Place NW - Edison-AK-(896) 388-9065-000-93-6876-38891'],
 ['Fiamma Greco 967 03th Place SE - Long Beach-OK-(129) 734-1247-000-61-4879-03719'],
 ['Muralixxxxx Hasmik Drakou 880 81th Place SE - Brasília-WY-(467) 817-7191-000-47-5407-17293'],
 ['Joyikuttyxxxxx Charles Arjun 256 32th Place NE - Kiev-AR-(666) 929-0960-000-14-1069-41230'],
 ['Elise Vadeboncoeur 526 03th Place NE - Curitiba-OH-(993) 525-5024-000-84-6131-03719']]

In [98]:
def transform_dataset(input_dataset):
    output_dataset = []
    for data in input_dataset:
        address = data[0].split('-')[1]
        output_dataset.append([address])
    return output_dataset

In [2]:
import json
from collections import defaultdict

In [676]:
name = '48_5'

In [677]:

file_name = "foofah/foofah/exp0_"+ name+".txt" 
with open(file_name, 'rb') as f:
        test_data = json.load(f)

In [678]:
test_data['OutputTable']

[['Company A', 'Address 1', 'City', 'State', 'Zip', 'Phone', ''],
 ['Company B', 'Address 1', 'City', 'State', 'Zip', '', ''],
 ['Company C', 'Address 1', 'City', 'State', 'Zip', 'Phone', 'Website'],
 ['Company D', '770 Oak Lane', 'Grand Forks', 'ND', '58201', '', ''],
 ['Company E', '833 South Street', 'Bowie', 'MD', '20715', '', '']]

In [669]:
len(test_data['TestAnswer'][0])

13

In [674]:
output = test_data['TestAnswer'][1]
output = [output[1:]]
output

[['C', 'B', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'B', 'B']]

In [628]:
print(len(test_data["OutputTable"][0]),len(test_data["TestAnswer"][0]))

5 3


In [629]:
output = []
for o in  test_data['TestAnswer']:
    output.append(o[0:len(test_data["OutputTable"][0])])
output

[['Bob', 'Dan', 'Michael']]

In [475]:
test_data['TestAnswer']

[['Pants', 'Yellow'],
 ['Pants', 'Blue'],
 ['Pants', 'Green'],
 ['Pants', 'Orange'],
 ['Pants', 'Red'],
 ['Scarf', 'Green'],
 ['Scarf', 'Orange'],
 ['Scarf', 'Red'],
 ['Scarf', 'Yellow'],
 ['Scarf', 'Blue'],
 ['Glove', 'Blue'],
 ['Glove', 'Purple'],
 ['Glove', 'Red'],
 ['Glove', 'Yellow'],
 ['Glove', 'Green'],
 ['Shoes', 'Black'],
 ['Shoes', 'White'],
 ['Shoes', 'Grey'],
 ['Shoes', 'Red'],
 ['Shoes', 'Yellow']]

In [402]:
temp = []
dic = {}
temp_dic ={}
for i in range(len(test_data['OutputTable'])):
    data = test_data['InputTable']
    outputdata = test_data['OutputTable']
    temp_dic["Input"] = data[1]
    temp_dic["Output"] = outputdata[i]
    temp.append(temp_dic)
    temp_dic = {}

# for i in range(len(test_data['TestingTable'])):
#     data = test_data['TestingTable']
#     outputdata = test_data['TestAnswer']
#     temp_dic["Input"] = data[i]
#     temp_dic["Output"] = outputdata[i]
#     temp.append(temp_dic)
#     temp_dic = {}
dic["Examples"] = temp

IndexError: list index out of range

In [381]:
dic

{}

In [377]:
path = "prose/foofah/exp0_"+ name + ".json"
with open(path, "w") as file:   
            json.dump(dic, file)

In [675]:
out_path = "../output/prose/foofah/exp0_" + name + ".json"
with open(out_path, "w") as file:   
            json.dump(output, file)