In [1]:
import random
import pandas as pd
from collections import Counter
from sklearn.utils import shuffle

random.seed(37)

In [2]:
# Import data
train_df = pd.read_csv('../../train_val_split_csv/train.csv')
codex_attr_df = pd.read_csv('codex_augment_attr.csv')
print('Codex Attr Df Head')
codex_attr_df.head()

Codex Attr Df Head


Unnamed: 0,pair_id,attribute1,source_title,cor_section,answer,prompt,question,predicted attribute
0,23ce8aec4b32591b,causal relationship,king-kojata,5,To go on a journey.,"Generate attriburte from story, answer and que...",Why did Milan leave his home?,The attribute is: causal relationship
1,23ce8aec4b32591b,setting,king-kojata,5,on his journey,"Generate attriburte from story, answer and que...",Where did Milan see a lake as smooth as glass ...,The attribute is: setting
2,23ce8aec4b32591b,action,king-kojata,5,thirty little white garments,"Generate attriburte from story, answer and que...",What did Milan notice lying on the grass?,The attribute is: action
3,23ce8aec4b32591b,outcome resolution,king-kojata,5,They disappeared.,"Generate attriburte from story, answer and que...",What happened when the ducks turned into maidens?,The attribute is: outcome resolution
4,23ce8aec4b32591b,outcome resolution,king-kojata,5,He will not be able to find the magic castle.,"Generate attriburte from story, answer and que...",What will happen if Milan does not find the th...,The attribute is: prediction


In [3]:
def preprocess_codex_df(codex_df):
    '''
    1. Extract attribute name from predicted attribute
    2. Change column names -> attribute1-org_attr and predicted attribute - attribute1
    '''
    prefix = 'The attribute is: '
    attrs = codex_df['predicted attribute'].apply(lambda strmine: strmine.split(prefix)[1])
    codex_df.drop(columns=['predicted attribute'], inplace=True)
    codex_df.rename(columns={'attribute1':'codex_attribute'})
    codex_df['attribute1'] = attrs

In [4]:
preprocess_codex_df(codex_attr_df) # Inplace changes
codex_attr_df.head()

Unnamed: 0,pair_id,attribute1,source_title,cor_section,answer,prompt,question
0,23ce8aec4b32591b,causal relationship,king-kojata,5,To go on a journey.,"Generate attriburte from story, answer and que...",Why did Milan leave his home?
1,23ce8aec4b32591b,setting,king-kojata,5,on his journey,"Generate attriburte from story, answer and que...",Where did Milan see a lake as smooth as glass ...
2,23ce8aec4b32591b,action,king-kojata,5,thirty little white garments,"Generate attriburte from story, answer and que...",What did Milan notice lying on the grass?
3,23ce8aec4b32591b,outcome resolution,king-kojata,5,They disappeared.,"Generate attriburte from story, answer and que...",What happened when the ducks turned into maidens?
4,23ce8aec4b32591b,prediction,king-kojata,5,He will not be able to find the magic castle.,"Generate attriburte from story, answer and que...",What will happen if Milan does not find the th...


In [5]:
def get_attr_stats(train_df, codex_df, save_name):
    print('Original Trainset size', len(train_df))
    print('Augmented Trainset size', len(codex_df))
    org_count = dict(sorted(dict(Counter(train_df['attribute1'])).items()))
    codex_count = dict(sorted(dict(Counter(codex_df['attribute1'])).items()))
    attr_list = list(org_count.keys())
    org_attr_count = list(org_count.values())
    codex_attr_count = list(codex_count.values())
    total_attr_count = [org+codex for org, codex in zip(org_attr_count, codex_attr_count)]
    org_attr_sum, codex_attr_sum = sum(org_attr_count), sum(codex_attr_count)
    org_attr_per = [ctr/org_attr_sum for ctr in org_attr_count]
    codex_attr_per = [ctr/codex_attr_sum for ctr in codex_attr_count]
    attr_comp_df = pd.DataFrame()
    attr_comp_df['Attribute'] = attr_list
    attr_comp_df['Original Count'] = org_attr_count
    attr_comp_df['Codex Count'] = codex_attr_count
    attr_comp_df['Total Count'] = total_attr_count
    attr_comp_df['Original Percent'] = org_attr_per
    attr_comp_df['Codex Percent'] = codex_attr_per
    if save_name is not None:
        attr_comp_df.to_csv(save_name, index=False)
    return attr_comp_df

In [6]:
attr_comp_df = get_attr_stats(train_df, codex_attr_df, 'org_and_codex_attr_stats.csv')
attr_comp_df

Original Trainset size 6005
Augmented Trainset size 91433


Unnamed: 0,Attribute,Original Count,Codex Count,Total Count,Original Percent,Codex Percent
0,action,1921,31352,33273,0.3199,0.342896
1,causal relationship,1673,27893,29566,0.278601,0.305065
2,character,666,7497,8163,0.110908,0.081994
3,feeling,574,4923,5497,0.095587,0.053843
4,outcome resolution,594,4894,5488,0.098918,0.053526
5,prediction,225,5221,5446,0.037469,0.057102
6,setting,352,9653,10005,0.058618,0.105575


In [7]:
# NOTE: Balanced Augment (Upto least total-count)
min_tot_count = min(attr_comp_df['Total Count'])
min_index = attr_comp_df['Total Count'].tolist().index(min_tot_count)
min_attr_name = attr_comp_df.loc[min_index, 'Attribute']
print('Minimum Attribute "%s": %d'%(min_attr_name, min_tot_count))

# Shuffle codex data
allowed_count_lst = [min_tot_count-org for org in attr_comp_df['Original Count']] 
allowed_count_dict = {atr_name:allowed_count for atr_name, allowed_count in zip(attr_comp_df['Attribute'].tolist(), allowed_count_lst)}
print(allowed_count_dict)

# # Start appending
# for i, row in codex_attr_df.iterrows():
#     print(i, row)
all_rows = []
for i in range(len(codex_attr_df)):
    row_vals = codex_attr_df.loc[i].values.tolist()
    all_rows.append(row_vals)
# Shuffle all_rows
random.shuffle(all_rows)
    
append_rows = []
for row_list in all_rows:
    if allowed_count_dict[row_list[1]] > 0:
        append_rows.append(row_list)
        allowed_count_dict[row_list[1]] -= 1

assert len(append_rows) == sum(allowed_count_lst)
print(len(append_rows))

append_df = pd.DataFrame(append_rows, columns=codex_attr_df.columns)

# Augment Dataset
codex_aug_train_df = train_df.append(pd.DataFrame(append_df,
                            columns=train_df.columns),
                            ignore_index=True)
codex_aug_train_df

Minimum Attribute "prediction": 5446
{'action': 3525, 'causal relationship': 3773, 'character': 4780, 'feeling': 4872, 'outcome resolution': 4852, 'prediction': 5221, 'setting': 5094}
32117


  codex_aug_train_df = train_df.append(pd.DataFrame(append_df,


Unnamed: 0,pair_id,source_title,cor_section,answer,question,local_or_sum,attribute1,attribute2,ex_or_im
0,23ce8aec4b32591b,king-kojata,5,a horse,What did the Prince tell his father to give him?,local,action,,explicit
1,f3cfe23c09215e97,black-sheep,5,comical,How did the ram look without fur and because h...,local,character,,explicit
2,7de1192ea017582b,the-boyhood-of-cuchulain,2,alarmed,How did Dectera secretly feel about Setanta's ...,local,feeling,,explicit
3,606c0713383559a6,the-toad-woman,89,astonished,How did the Toad-Woman feel when she saw her c...,summary,feeling,,explicit
4,6a9173711794569b,sheem-the-forsaken-boy,24,strove to catch Sheem,What did Owasso do when he saw Sheem?,local,action,,explicit
...,...,...,...,...,...,...,...,...,...
38117,6010b699fa525cc7,flax,8,It will be worn out before it had half finishe...,What will the paper do with its journey?,,prediction,,
38118,b8acaf6247ba5748,sagacious-monkey-and-boar,2,"He will be killed then roasted, stewed, and ea...",What will happen to the monkey?,,prediction,,
38119,a25d08011f3d55c8,the-enchanted-moccasins,18,He would be taken home.,What will happen when Ko-ko put on the enchant...,,prediction,,
38120,c6bb382ee8c9560d,the-red-swan,25,He will be sent to jail.,What will happen to the foolish fellow if he k...,,prediction,,


In [10]:
new_attr_df = get_attr_stats(train_df, codex_aug_train_df, None)
new_attr_df

Original Trainset size 6005
Augmented Trainset size 38122


Unnamed: 0,Attribute,Original Count,Codex Count,Total Count,Original Percent,Codex Percent
0,action,1921,5446,7367,0.3199,0.142857
1,causal relationship,1673,5446,7119,0.278601,0.142857
2,character,666,5446,6112,0.110908,0.142857
3,feeling,574,5446,6020,0.095587,0.142857
4,outcome resolution,594,5446,6040,0.098918,0.142857
5,prediction,225,5446,5671,0.037469,0.142857
6,setting,352,5446,5798,0.058618,0.142857


In [12]:
codex_aug_train_df.to_csv('codex_attr_balanced_augment.csv', index=False)

# Balanced Codex Only + Balanced (Exact Match Augment)

In [17]:
# NOTE: Balanced Augment - 2 (Upto least codex-count)
min_aug_count = min(attr_comp_df['Codex Count'])
min_index = attr_comp_df['Codex Count'].tolist().index(min_aug_count)
min_attr_name = attr_comp_df.loc[min_index, 'Attribute']
print('Minimum Attribute "%s": %d'%(min_attr_name, min_aug_count))

# Shuffle codex data
allowed_count_dict = {atr_name:min_aug_count for atr_name in attr_comp_df['Attribute'].tolist()}
print(allowed_count_dict)

# # Start appending
# for i, row in codex_attr_df.iterrows():
#     print(i, row)
all_rows = []
for i in range(len(codex_attr_df)):
    row_vals = codex_attr_df.loc[i].values.tolist()
    all_rows.append(row_vals)
# Shuffle all_rows
random.shuffle(all_rows)
    
append_rows = []
for row_list in all_rows:
    if allowed_count_dict[row_list[1]] > 0:
        append_rows.append(row_list)
        allowed_count_dict[row_list[1]] -= 1

print(len(append_rows))

append_df_2 = pd.DataFrame(append_rows, columns=codex_attr_df.columns)

# # Augment Dataset
# codex_aug_train_df_2 = train_df.append(pd.DataFrame(append_df_2,
#                             columns=train_df.columns),
#                             ignore_index=True)
# codex_aug_train_df_2

Minimum Attribute "outcome resolution": 4894
{'action': 4894, 'causal relationship': 4894, 'character': 4894, 'feeling': 4894, 'outcome resolution': 4894, 'prediction': 4894, 'setting': 4894}
34258


In [27]:
# Load Sel Match Data
sel_ex_match_df = pd.read_csv('../Sel_Exact_Match_Augment_Train.csv')
sel_ex_match_df

Unnamed: 0,pair_id,source_title,cor_section,answer,question,local_or_sum,attribute1,attribute2,ex_or_im
0,23ce8aec4b32591b,king-kojata,5,a horse,What did the Prince tell his father to give him?,local,action,,explicit
1,f3cfe23c09215e97,black-sheep,5,comical,How did the ram look without fur and because h...,local,character,,explicit
2,7de1192ea017582b,the-boyhood-of-cuchulain,2,alarmed,How did Dectera secretly feel about Setanta's ...,local,feeling,,explicit
3,606c0713383559a6,the-toad-woman,89,astonished,How did the Toad-Woman feel when she saw her c...,summary,feeling,,explicit
4,6a9173711794569b,sheem-the-forsaken-boy,24,strove to catch Sheem,What did Owasso do when he saw Sheem?,local,action,,explicit
...,...,...,...,...,...,...,...,...,...
12574,04aa82f8be4d53dc,the-dragon-princess,"11, 12",pleased,How did the emperor feel when he saw the gems?,summary,prediction,feeling,explicit
12575,91f18b64e3745d3b,the-enchanted-moccasins,8,angry,How did the younger brother feel after he came...,local,prediction,feeling,implicit
12576,f4847f15729e5a29,murmur-goose-egg,1927,the devil,Who beat Murmur?,summary,prediction,character,explicit
12577,7f934010c2f55e6f,the-white-cat,48,joy and astonishment,How did the King and his courtiers feel after ...,local,prediction,action,explicit


In [28]:
# Append SEL Ex Match with codex data
sel_ex_match_df['aug_type'] = ['org' for _ in range(len(sel_ex_match_df))]
append_df_2['aug_type'] = ['codex' for _ in range(len(append_df_2))]
codex_aug_train_df_2 = sel_ex_match_df.append(pd.DataFrame(append_df_2,
                            columns=sel_ex_match_df.columns),
                            ignore_index=True)

  codex_aug_train_df_2 = sel_ex_match_df.append(pd.DataFrame(append_df_2,


In [29]:
new_attr_df_2 = get_attr_stats(train_df, codex_aug_train_df_2, None)
new_attr_df_2

Original Trainset size 6005
Augmented Trainset size 46837


Unnamed: 0,Attribute,Original Count,Codex Count,Total Count,Original Percent,Codex Percent
0,action,1921,6815,8736,0.3199,0.145505
1,causal relationship,1673,6567,8240,0.278601,0.14021
2,character,666,6691,7357,0.110908,0.142857
3,feeling,574,6691,7265,0.095587,0.142857
4,outcome resolution,594,6691,7285,0.098918,0.142857
5,prediction,225,6691,6916,0.037469,0.142857
6,setting,352,6691,7043,0.058618,0.142857


In [30]:
codex_aug_train_df_2.to_csv('codexattremmatch_augment.csv', index=False)