In [1]:
import os 
import json 
import pandas as pd 
from collections import Counter

In [3]:
def display_attr_stat(train_df, aug_df, to_add=False):
    """Print and return per-attribute sample counts for train vs. augmented data.

    Rows follow the attributes found in ``train_df['attribute']``, ordered by
    descending train count (ties keep their first-seen order). Attributes that
    occur only in ``aug_df`` do not get a row.

    Args:
        train_df: Frame with an 'attribute' column.
        aug_df: Frame with an 'attribute' column.
        to_add: When true, append a 'Total Count' column (train + aug).

    Returns:
        DataFrame with columns 'Attr Name', 'Train Count', 'Aug Count' and,
        if requested, 'Total Count'.
    """
    # most_common() ranks by count, highest first, with a stable tie order.
    ranked_train = Counter(train_df['attribute']).most_common()
    aug_lookup = Counter(aug_df['attribute'])

    names = [name for name, _ in ranked_train]
    train_counts = [count for _, count in ranked_train]
    # Attributes absent from the augmented data are reported as 0.
    aug_counts = [aug_lookup.get(name, 0) for name in names]

    attr_df = pd.DataFrame()
    attr_df['Attr Name'] = names
    attr_df['Train Count'] = train_counts
    attr_df['Aug Count'] = aug_counts
    if to_add:
        attr_df['Total Count'] = [t + a for t, a in zip(train_counts, aug_counts)]

    print(attr_df)
    return attr_df


In [4]:
# NOTE: Aug df
# Read the filtered augmentation CSV, then shuffle its rows reproducibly.
filter_dir = os.path.join('filter', 'sel_aug')
filename = 'filter_aug.csv'
aug_df = pd.read_csv(os.path.join(filter_dir, filename))
# frac=1 keeps every row (only the order changes); the fixed seed makes the
# shuffle repeatable across runs.
aug_df = aug_df.sample(frac=1, random_state=37)


In [2]:
# NOTE: Train data
# train.json is parsed line by line: each non-empty line is one JSON object.
data_path = '../data/FairytaleQA/train.json'
train_data = []
# Explicit UTF-8 keeps the read independent of the platform's locale default
# encoding, which open() would otherwise use.
with open(data_path, 'r', encoding='utf-8') as infile:
    for line in infile:
        # Skip blank lines (e.g. a trailing newline) instead of letting
        # json.loads raise on an empty string.
        if not line.strip():
            continue
        train_data.append(json.loads(line))
train_df = pd.DataFrame(train_data)

In [7]:
# Build the per-attribute count table (train vs. augmented). to_add=True also
# adds the 'Total Count' column; display_attr_stat prints the table itself.
attr_df = display_attr_stat(train_df, aug_df, to_add=True)

             Attr Name  Train Count  Aug Count  Total Count
0               action         2694          0         2694
1  causal relationship         2368          0         2368
2            character          962       2804         3766
3              feeling          824       2169         2993
4   outcome resolution          811       1717         2528
5              setting          523       1505         2028
6           prediction          366        833         1199


In [11]:
# Train count of the 'action' attribute, selected with a single .loc lookup
# (row mask + column label) rather than chained indexing.
attr_df.loc[attr_df['Attr Name'] == 'action', 'Train Count'].iloc[0]

2694

# Visualize data

In [23]:
def display_cooccurence(df, col1, col2):
    """Print the co-occurrence table of two columns plus per-category totals.

    The table is ``pd.crosstab(df[col1], df[col2])``: values of ``col1`` form
    the rows and values of ``col2`` form the columns. When one of the two
    columns is 'ex_or_im' or 'local_or_sum', the overall count of each of its
    categories is printed below the table ('ex_or_im' takes precedence when
    both are given).

    Args:
        df: Frame containing both columns.
        col1: Name of the column used for the table rows.
        col2: Name of the column used for the table columns.

    Returns:
        None. Everything is written to stdout.
    """
    co_mat = pd.crosstab(df[col1], df[col2])
    print(co_mat)

    # (flag column, ((printed label, category value), ...)), in priority order.
    summaries = (
        ('ex_or_im', (('Explicit', 'explicit'), ('Implicit', 'implicit'))),
        ('local_or_sum', (('Local', 'local'), ('Summary', 'summary'))),
    )
    for flag_col, labels in summaries:
        if flag_col not in (col1, col2):
            continue
        # The flag's categories are the table columns when it was passed as
        # col2 and the table rows when it was passed as col1, so the totals
        # are taken along the matching axis.
        if col2 == flag_col:
            totals = co_mat.sum(axis=0)
        else:
            totals = co_mat.sum(axis=1)
        print('\nSum')
        for label, category in labels:
            # A category that never occurs has no entry in the table: report 0.
            print('{:s} {:d}'.format(label, int(totals.get(category, 0))))
        break

In [11]:
def json_to_df(json_file):
    """Load a file holding one JSON object per line into a DataFrame.

    Args:
        json_file: Path of the file to read.

    Returns:
        DataFrame with one row per non-empty line of the file.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    records = []
    # Explicit UTF-8 keeps the read independent of the platform's locale
    # default encoding, which open() would otherwise use.
    with open(json_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            # Blank lines (e.g. a trailing newline) carry no record; skipping
            # them avoids a JSONDecodeError on an empty string.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

In [12]:
# Directory that the cells below join with each dataset filename.
main_dir = '../data/FairytaleQA/'

In [14]:
# train.json: cross-tabulate 'attribute' against 'ex_or_im'.
# NOTE: `filename`, `file_path` and `df` are rebound by each cell of this kind.
filename = 'train.json'
file_path = os.path.join(main_dir, filename)
print('Filename {:s}'.format(filename))
df = json_to_df(file_path)
display_cooccurence(df, 'attribute', 'ex_or_im')

Filename train.json
ex_or_im             explicit  implicit
attribute                              
action                   2392       302
causal relationship      1329      1039
character                 911        51
feeling                   350       474
outcome resolution        711       100
prediction                171       195
setting                   518         5

Sum
Explicit 6382
Implicit 2166


In [15]:
# prompt_aug_full_train.json: cross-tabulate 'attribute' against 'ex_or_im'.
filename = 'prompt_aug_full_train.json'
file_path = os.path.join(main_dir, filename)
print('Filename {:s}'.format(filename))
df = json_to_df(file_path)
display_cooccurence(df, 'attribute', 'ex_or_im')

Filename prompt_aug_full_train.json
ex_or_im             explicit  implicit
attribute                              
action                   3483       384
causal relationship      1903      1392
character                1341        59
feeling                   515       640
outcome resolution       1018       136
prediction                203       229
setting                   765         8

Sum
Explicit 9228
Implicit 2848


In [16]:
# prompt_aug_selective_train.json: cross-tabulate 'attribute' against 'ex_or_im'.
filename = 'prompt_aug_selective_train.json'
file_path = os.path.join(main_dir, filename)
print('Filename {:s}'.format(filename))
df = json_to_df(file_path)
display_cooccurence(df, 'attribute', 'ex_or_im')

Filename prompt_aug_selective_train.json
ex_or_im             explicit  implicit
attribute                              
action                   2392       302
causal relationship      1329      1039
character                1341        59
feeling                   515       640
outcome resolution       1018       136
prediction                203       229
setting                   765         8

Sum
Explicit 7563
Implicit 2413


In [17]:
# prompt_aug_control_count.json: cross-tabulate 'attribute' against 'ex_or_im'.
filename = 'prompt_aug_control_count.json'
file_path = os.path.join(main_dir, filename)
print('Filename {:s}'.format(filename))
df = json_to_df(file_path)
display_cooccurence(df, 'attribute', 'ex_or_im')

Filename prompt_aug_control_count.json
ex_or_im             explicit  implicit
attribute                              
action                   2392       302
causal relationship      1329      1039
character                1139        60
feeling                   517       682
outcome resolution       1052       147
prediction                585       614
setting                  1188        11

Sum
Explicit 8202
Implicit 2855


In [24]:
# train.json: cross-tabulate 'attribute' against 'local_or_sum'.
filename = 'train.json'
file_path = os.path.join(main_dir, filename)
print('Filename {:s}'.format(filename))
df = json_to_df(file_path)
display_cooccurence(df, 'attribute', 'local_or_sum')

Filename train.json
local_or_sum         local  summary
attribute                          
action                2574      120
causal relationship   2057      311
character              910       52
feeling                768       56
outcome resolution     766       45
prediction             170      196
setting                511       12

Sum
Local 7756
Summary 792


In [25]:
# prompt_aug_full_train.json: cross-tabulate 'attribute' against 'local_or_sum'.
filename = 'prompt_aug_full_train.json'
file_path = os.path.join(main_dir, filename)
print('Filename {:s}'.format(filename))
df = json_to_df(file_path)
display_cooccurence(df, 'attribute', 'local_or_sum')

Filename prompt_aug_full_train.json
local_or_sum         local  summary
attribute                          
action                3717      150
causal relationship   2878      417
character             1328       72
feeling               1087       68
outcome resolution    1091       63
prediction             201      231
setting                754       19

Sum
Local 11056
Summary 1020


In [26]:
# prompt_aug_control_count.json: cross-tabulate 'attribute' against 'local_or_sum'.
filename = 'prompt_aug_control_count.json'
file_path = os.path.join(main_dir, filename)
print('Filename {:s}'.format(filename))
df = json_to_df(file_path)
display_cooccurence(df, 'attribute', 'local_or_sum')

Filename prompt_aug_control_count.json
local_or_sum         local  summary
attribute                          
action                2574      120
causal relationship   2057      311
character             1140       59
feeling               1117       82
outcome resolution    1135       64
prediction             555      644
setting               1176       23

Sum
Local 9754
Summary 1303


In [27]:
# prompt_aug_control_count_rb.json: cross-tabulate 'attribute' against 'local_or_sum'.
filename = 'prompt_aug_control_count_rb.json'
file_path = os.path.join(main_dir, filename)
print('Filename {:s}'.format(filename))
df = json_to_df(file_path)
display_cooccurence(df, 'attribute', 'local_or_sum')

Filename prompt_aug_control_count_rb.json
local_or_sum         local  summary
attribute                          
action                1141       58
causal relationship   1033      166
character             1140       59
feeling               1117       82
outcome resolution    1135       64
prediction             555      644
setting               1176       23

Sum
Local 7297
Summary 1096
