In [None]:
import numpy as np
import pandas as pd
import openai
from copy import deepcopy as cc
from tqdm import tqdm

In [None]:
import evaluate
import os  # only needed for the API-key lookup at the end of this cell

# Automatic evaluation metrics from Hugging Face `evaluate`; they are loaded
# once here and reused by the evaluation cells further down.
bert_score = evaluate.load("bertscore")
bleu_score = evaluate.load("google_bleu")
bleurt = evaluate.load("bleurt", module_type="metric")

# Do not commit a real key to the notebook: read it from the environment.
# The original placeholder is kept as the fallback, so the cell behaves exactly
# as before when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "xxxx")

In [13]:
# Human-evaluation input sheet. Its AssignmentId column is used further down
# to select the evaluation rows of gpt3_dat.
human_eval_ip = pd.read_csv('./../Data/human_eval_dat.csv')

In [14]:
# AssignmentIds of the expert-selected examples. The approach-1 cell takes the
# first `no_of_exs` of these and uses the matching rows of gpt3_dat as the
# fixed in-context examples of the prompt.
expert_selected_AID = ['3AQF3RZ5596OVGWAPKGAIYKWIISF63', '3K9FOBBF2I7I2TCYAC5PFMB2AV3NL8', '37XITHEISXXAENAKQ6T7LNQPHKWCRP', '3ERET4BTVNXMTLQMO2AJIL33GJP9KQ', '3OB0CAO74IDASWXMQTZ24AG2XAVYHN',
                       '386PBUZZXGLMJ3LDVEDXMFUL1R9JL7', '3E7TUJ2EGDAEFFKCWLOWE88DWVX9DO', '3K9FOBBF2I7I2TCYAC5PFMB2AZGNLT', '3II4UPYCOKVK7E1YDNZI03Y26BMDQV', '36PW28KO40KXC48HKMDGGL1I2C8AEZ',
                       '39JEC7537VP27UR1ZQYHMKFS29PVCJ', '3CFJTT4SXUER97C592RNR9XKTVGI7S', '3JPSL1DZ5TN16ALUDLGZ68VRG36ANJ', '3OF2M9AATHC1ZC8ZS04IV95WVUAKZT', '30LB5CDZNDYZMM1VO7U0CPQNHV10ZY']

In [17]:
# Instruction line that heads every prompt, keyed by task number (as a string).
task_description = {
    '8': "Task Description: Can the 'State' be inferred from the 'Story' ?",
    '6': "Task Description: Make minimal revision to the 'Story' to make it consistent with the 'State'.",
    '7': "Task Description: Describe the change in participant state between 'Story 1' and 'Story 2'.",
}

# Load the dataset and derive, per row, the text blocks used to build prompts:
#   task_N        - a fully worked example (inputs followed by the gold answer)
#   task_N_query  - the same inputs with the answer left for the model to write
gpt3_dat = pd.read_pickle("./../Data/GPT3_roberta_repr.pkl").assign(
    task_6=lambda df: df.apply(
        lambda rec: f"Story: {rec['Story']}\nState: {rec['mod_assertion']}\nRevised Story: {rec['Mod_Story']}\n",
        axis=1,
    ),
    task_7=lambda df: df.apply(
        lambda rec: f"Story 1: {rec['Story']}\nStory 2: {rec['Mod_Story']}\nState 1: {rec['assertion']}\nState 2: {rec['mod_assertion']}\n",
        axis=1,
    ),
    task_6_query=lambda df: df.apply(
        lambda rec: f"Story: {rec['Story']}\nState: {rec['mod_assertion']}\nRevised Story: ",
        axis=1,
    ),
    task_7_query=lambda df: df.apply(
        lambda rec: f"Story 1: {rec['Story']}\nStory 2: {rec['Mod_Story']}",
        axis=1,
    ),
)

In [19]:
# val_len = val_dat.shape[0]
# val_sample = np.random.permutation(np.arange(val_len))[:200]
# np.save('./../Data/GPT3_val_sample.npy', val_sample)
# val_sample = np.load('./../Data/GPT3_val_sample.npy')

In [20]:
# Boolean mask of the rows that belong to the training split.
is_train = gpt3_dat.dat_type == 'Train_set'

# Complete training data
tr_dat = gpt3_dat.loc[is_train]

# Subset of training data, that are curated by expert (w/o noise)
tr_dat_curated = gpt3_dat.loc[is_train & (gpt3_dat.GPT3_subset == 1.0), :]

# Validation data - used for hyperparam selection
# val_dat = gpt3_dat.loc[gpt3_dat.dat_type == 'Val_set']
# val_dat = val_dat.iloc[val_sample]

# Evaluation/test data - used for reporting GPT3 performance:
# the rows whose AssignmentId also appears in the human-evaluation sheet.
val_dat = gpt3_dat.loc[gpt3_dat.AssignmentId.isin(human_eval_ip.AssignmentId)]

# Displayed as the cell output: sizes of the three splits.
tr_dat.shape, tr_dat_curated.shape, val_dat.shape

((8476, 17), (202, 17), (200, 17))

##### For the subsequent cells:
- task_no = 6 for story generation from counterfactual
- task_no = 7 for state change generation task

### Approach 1 - Expert curated prompt examples

In [None]:
# Approach 1: every query shares one fixed prompt made of the first
# `no_of_exs` expert-selected examples.
no_of_exs_list = [10]
approach = 1
task_no = 6  # 6 = story revision from a counterfactual state, 7 = state-change description

for no_of_exs in no_of_exs_list:
    # The prompt does not depend on the query row, so it is built once per setting.
    task_prompt = "\n".join(gpt3_dat.loc[gpt3_dat.AssignmentId.isin(expert_selected_AID[:no_of_exs]), f'task_{task_no}'].tolist())
    # Look the instruction up by task_no. It used to be hard-coded to task 6,
    # which put the wrong instruction in front of task-7 examples.
    task_prompt = task_description[f'{task_no}'] + '\n\n' + task_prompt

    final_dict_task = []
    for idx, row in tqdm(val_dat.iterrows(), total=len(val_dat)):
        df_temp = row.to_dict()
        gpt3_query = task_prompt + '\n' + df_temp[f'task_{task_no}_query']
        response = openai.Completion.create(model="text-davinci-002",
                                                prompt=gpt3_query,
                                                temperature=.9,
                                                max_tokens=100,
                                                top_p=1,
                                                frequency_penalty=0.5,
                                                presence_penalty=0.1)
        df_temp['GPT3_response'] = response['choices']
        # Store the exact prompt as well, as the approach-2 and approach-3 cells do.
        df_temp['GPT3_prompt'] = gpt3_query
        df_temp['GPT3_response_readable'] = response['choices'][0]['text'].strip()
        final_dict_task.append(df_temp)

    # One CSV per (approach, task, number of examples) setting.
    final_dict_task_copy = pd.DataFrame(final_dict_task)
    final_dict_task_copy.to_csv(f'./../gpt3_op/Val_approach_{approach}_task_{task_no}_{no_of_exs}.csv', index = False)

### Approach 2 - Randomly sampled in-context examples

In [None]:
# Approach 2: a fresh random set of in-context examples is drawn for every query.
no_of_exs_list = [10]
task_no = 6
approach = 2

for no_of_exs in no_of_exs_list:
    final_dict_task = []
    for _, query_row in tqdm(val_dat.iterrows(), total=val_dat.shape[0]):

        # Examples are drawn from the expert-curated training subset.
        # To draw from the entire training set instead, sample from `tr_dat`:
        # sampled_examples = tr_dat.sample(no_of_exs)[f'task_{task_no}'].tolist()
        sampled_examples = tr_dat_curated.sample(no_of_exs)[f'task_{task_no}'].tolist()

        # Prompt layout: instruction, blank line, worked examples, then the open query.
        record = query_row.to_dict()
        gpt3_query = (
            task_description[f'{task_no}']
            + '\n\n'
            + "\n".join(sampled_examples)
            + '\n'
            + record[f'task_{task_no}_query']
        )

        response = openai.Completion.create(
            model="text-davinci-002",
            prompt=gpt3_query,
            temperature=0.9,
            max_tokens=100,
            top_p=1,
            frequency_penalty=0.5,
            presence_penalty=0.1,
        )

        # Key order here fixes the column order of the saved CSV.
        record['GPT3_response'] = response['choices']
        record['GPT3_prompt'] = gpt3_query
        record['GPT3_response_readable'] = response['choices'][0]['text'].strip()
        final_dict_task.append(record)

    final_dict_task_copy = pd.DataFrame(final_dict_task)
    final_dict_task_copy.to_csv(f'./../gpt3_op/Val_approach_{approach}_task_{task_no}_{no_of_exs}.csv', index = False)

### Approach 3 (Nearest neighbor - RoBERTa similarity)

In [None]:
import torch
from torch import nn

# Row-wise cosine similarity. eps=0 disables the norm clamp, so an all-zero
# vector produces NaN instead of a clamped value.
cos = nn.CosineSimilarity(dim=1, eps=0)


def _nearest_k_by_mean_cosine(k, search_df, query_row, first_col, second_col):
    """Return positions of the k candidates most similar to the query.

    For each of the two representation columns, the query vector is compared
    (cosine) against every candidate vector in the same column; the two scores
    are averaged per candidate.

    Args:
        k: number of neighbours wanted. Values <= 0 give an empty result and
            values larger than the number of candidates return all of them.
        search_df: candidates; `search_df[col]` must be iterable over tensors.
        query_row: mapping from column name to the query tensor.
        first_col, second_col: names of the two representation columns.

    Returns:
        numpy array of positional indices into `search_df`, ordered by
        ascending similarity (the best match is last).

    Assumes every representation is a 1-D tensor of the same length --
    TODO confirm against the code that builds the *_Roberta_repr columns.
    """
    first_candidates = list(search_df[first_col])
    second_candidates = list(search_df[second_col])

    # torch.stack on an empty list raises, so an empty candidate set is answered here.
    if len(first_candidates) == 0:
        return torch.empty(0, dtype=torch.long).numpy()

    first_similarity = cos(query_row[first_col].unsqueeze(0), torch.stack(first_candidates))
    second_similarity = cos(query_row[second_col].unsqueeze(0), torch.stack(second_candidates))
    total_similarity = (first_similarity + second_similarity) / 2

    ranked = total_similarity.argsort()
    # Clamp k: the slice `[-k:]` with k == 0 would otherwise return everything.
    k = max(0, min(int(k), ranked.numel()))
    return ranked[ranked.numel() - k:].cpu().numpy()


def get_similarity_task6(k, search_df, query_row):
    """Nearest neighbours for task 6 (story revision).

    Similarity is the mean of the cosine similarities between the original
    stories and between the counterfactual (modified) states.
    Returns positional indices into `search_df`, best match last.
    """
    return _nearest_k_by_mean_cosine(
        k, search_df, query_row, 'story_Roberta_repr', 'mod_state_Roberta_repr'
    )


def get_similarity_task7(k, search_df, query_row):
    """Nearest neighbours for task 7 (state-change description).

    Similarity is the mean of the cosine similarities between the original
    stories and between the modified stories. The candidate side previously
    read 'mod_state_Roberta_repr', so the query's modified story was compared
    with the candidates' modified state; both sides now use
    'mod_story_Roberta_repr'.
    Returns positional indices into `search_df`, best match last.
    """
    return _nearest_k_by_mean_cosine(
        k, search_df, query_row, 'story_Roberta_repr', 'mod_story_Roberta_repr'
    )

In [None]:
# Approach 3: the in-context examples are the training rows most similar to
# the query (RoBERTa representations, cosine similarity).
no_of_exs_list = [10]
task_no = 6
approach = 3

# Choose the retrieval function once and fail loudly for an unsupported task.
# With the former pair of independent `if`s, any other task_no left
# nearest_k_instances_idx undefined (or stale from an earlier run).
if task_no == 6:
    get_nearest_k = get_similarity_task6
elif task_no == 7:
    get_nearest_k = get_similarity_task7
else:
    raise ValueError(f'Unsupported task_no {task_no!r}; expected 6 or 7')

for no_of_exs in no_of_exs_list:
    final_dict_task = []

    for idx, row in tqdm(val_dat.iterrows(), total=len(val_dat)):

        # Positional indices into tr_dat, hence .iloc below.
        nearest_k_instances_idx = get_nearest_k(k = no_of_exs, search_df=tr_dat, query_row=row)

        task_prompt = "\n".join(tr_dat.iloc[nearest_k_instances_idx][f'task_{task_no}'].tolist())
        task_prompt = task_description[f'{task_no}'] + '\n\n' + task_prompt

        df_temp = row.to_dict()
        gpt3_query = task_prompt + '\n' + df_temp[f'task_{task_no}_query']
        response = openai.Completion.create(model="text-davinci-002",
                                                prompt=gpt3_query,
                                                temperature=.9,
                                                max_tokens=100,
                                                top_p=1,
                                                frequency_penalty=0.5,
                                                presence_penalty=0.1)
        df_temp['GPT3_response'] = response['choices']
        df_temp['GPT3_prompt'] = gpt3_query
        df_temp['GPT3_response_readable'] = response['choices'][0]['text'].strip()
        final_dict_task.append(df_temp)

    # NOTE(review): this cell writes 'Hu_eval_approach_...' while the evaluation
    # cells read 'Val_approach_...' for approaches 1-3 -- confirm which prefix
    # is intended before relying on the evaluation of approach 3.
    final_dict_task_copy = pd.DataFrame(final_dict_task)
    final_dict_task_copy.to_csv(f'./../gpt3_op/Hu_eval_approach_{approach}_task_{task_no}_{no_of_exs}.csv', index = False)

-  Hyperparameter selection: automatic evaluation of
     - 3 prompting approaches (expert-curated, random, and most-similar in-context examples)
     -  different numbers of in-context examples

##### Eval of Task 6

In [None]:
# Automatic evaluation of task 6 (story revision): the generated story is
# scored against the gold revised story ('Mod_Story').
approaches = [1, 2, 3]
no_of_exs_list = [5, 10, 15]
# This cell scores against 'Mod_Story', i.e. the task-6 target. It was set to 7,
# which loaded the task-7 generations instead.
task_no = 6

for approach in approaches:
    print(f'\n{"--"*20}\nApproach number :: {approach}\n{"--"*20}\n')
    for no_of_exs in no_of_exs_list:
        print(f'\n>> number of examples in prompt :: {no_of_exs}')

        final_dict_task_copy = pd.read_csv(f'./../gpt3_op/Val_approach_{approach}_task_{task_no}_{no_of_exs}.csv')
        # str(x) guards against empty generations, which read_csv returns as NaN
        # (the task-7 evaluation cell does the same).
        final_dict_task_copy['GPT3_response_readable'] = final_dict_task_copy['GPT3_response_readable'].apply(lambda x: str(x).strip().replace('\n', ' '))

        bleu_3 = bleu_score.compute(predictions=final_dict_task_copy['GPT3_response_readable'], references = final_dict_task_copy['Mod_Story'], max_len = 4, min_len = 1)
        bert_s = bert_score.compute(predictions=final_dict_task_copy['GPT3_response_readable'], references=final_dict_task_copy['Mod_Story'], lang = 'en', rescale_with_baseline = True)
        bleurt_s = bleurt.compute(predictions=final_dict_task_copy['GPT3_response_readable'], references=final_dict_task_copy['Mod_Story'])

        # Printed as: mean BLEURT, mean BERTScore F1, Google-BLEU
        # (the score itself, as in the task-7 cell, not the whole result dict).
        print(np.mean(bleurt_s['scores']), np.mean(bert_s['f1']), bleu_3['google_bleu'])

##### Eval of Task 7

In [10]:
# Automatic evaluation of task 7 (state-change description). The generation is
# split into its 'State 1' and 'State 2' parts, each part is scored against its
# reference, and the two scores are averaged per metric.
approaches = [1, 2, 3]
no_of_exs_list = [5, 10, 15]
task_no = 7

for approach in approaches:
    print(f'\n{"--"*20}\nApproach number :: {approach}\n{"--"*20}\n')
    for no_of_exs in no_of_exs_list:
        print(f'\n>> number of examples in prompt :: {no_of_exs}')
        final_dict_task_copy = pd.read_csv(f'./../gpt3_op/Val_approach_{approach}_task_{task_no}_{no_of_exs}.csv')

        # Flatten each generation to a single line, then cut it at 'State 2:'.
        flat_response = final_dict_task_copy['GPT3_response_readable'].apply(lambda x: str(x).strip().replace('\n', ' '))
        final_dict_task_copy['GPT3_response_readable'] = flat_response
        final_dict_task_copy['GPT3_response_readable_state1'] = flat_response.apply(lambda x: x.split('State 2:')[0].replace('State 1:', '').strip())
        final_dict_task_copy['GPT3_response_readable_state2'] = flat_response.apply(lambda x: x.split('State 2:')[-1].strip())

        # (prediction column, reference column) for State 1, then State 2.
        state_pairs = (
            ('GPT3_response_readable_state1', 'assertion'),
            ('GPT3_response_readable_state2', 'mod_assertion'),
        )
        bleu_vals, bert_means, bleurt_means = [], [], []
        for pred_col, ref_col in state_pairs:
            preds = final_dict_task_copy[pred_col]
            refs = final_dict_task_copy[ref_col]
            bleu_vals.append(bleu_score.compute(predictions=preds, references=refs, max_len=4, min_len=1)['google_bleu'])
            bert_means.append(np.mean(bert_score.compute(predictions=preds, references=refs, lang='en', rescale_with_baseline=True)['f1']))
            bleurt_means.append(np.mean(bleurt.compute(predictions=preds, references=refs)['scores']))

        # Printed as: BLEURT, BERTScore F1, Google-BLEU (each the mean over the two states).
        print((bleurt_means[0] + bleurt_means[1])/2, (bert_means[0] + bert_means[1])/2, (bleu_vals[0] + bleu_vals[1])/2)


----------------------------------------
Approach number :: 1
----------------------------------------


>> number of examples in prompt :: 5
-0.7145838774275035 0.5299286359548568 0.10743231918681027

>> number of examples in prompt :: 10
-0.7674064121022821 0.5280463938601314 0.10857974253511266

>> number of examples in prompt :: 15
-0.7546867879386991 0.519381995536387 0.10727671918245689

----------------------------------------
Approach number :: 2
----------------------------------------


>> number of examples in prompt :: 5
-0.834946702774614 0.5143468851875515 0.09402387377308594

>> number of examples in prompt :: 10
-0.7845454420521856 0.5121545459702611 0.09883878876225816

>> number of examples in prompt :: 15
-0.786708257496357 0.5216857675660866 0.10466752868676693

----------------------------------------
Approach number :: 3
----------------------------------------


>> number of examples in prompt :: 5
-0.7491250667441636 0.5210992804542184 0.10971820557996129

>> n



-0.7900637151300907 0.5027628561481834 0.10533783783783784


- #### Generating RoBERTa representation for story/states for each pasta instance [Don't care/ignore]

In [None]:
from transformers import (
    RobertaModel,
    RobertaTokenizerFast)
import torch
from collections import OrderedDict

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')

# Fine-tuned checkpoint, loaded on CPU.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
raw_state = torch.load('./../model_checkpoint_TACL3/t_8_1_m_roberta-large_b_12_lr_5e-06_w_1e-06_s_0_epoch_4.pt', map_location=torch.device('cpu'))

# Keep only the encoder weights: drop every 'linear_op*' entry and strip the
# 'LM_base.' prefix so the keys match a plain RobertaModel.
roberta_chkpt = OrderedDict()
for param_name, weight in raw_state.items():
    if param_name.startswith('linear_op'):
        continue
    roberta_chkpt[param_name.replace('LM_base.', '')] = weight

roberta_mdl = RobertaModel.from_pretrained('roberta-large')
roberta_mdl.load_state_dict(roberta_chkpt)

In [None]:
# Tokenize each text field of the whole dataset, padding every field to its
# longest sequence.
story_tok = tokenizer.batch_encode_plus(gpt3_dat['Story'].tolist(), padding=True)
state_tok = tokenizer.batch_encode_plus(gpt3_dat['assertion'].tolist(), padding=True)
# The 'mod_assertion' encoding used to be assigned to mod_story_tok and was
# then overwritten by the next line; it now gets its own name.
mod_state_tok = tokenizer.batch_encode_plus(gpt3_dat['mod_assertion'].tolist(), padding=True)
mod_story_tok = tokenizer.batch_encode_plus(gpt3_dat['Mod_Story'].tolist(), padding=True)

In [None]:
# NOTE(review): `res` is not defined in any visible cell, so this line raises a
# NameError on a fresh kernel. It presumably refers to one of the tokenizer
# outputs (e.g. story_tok) -- confirm, or delete this cell.
res2 = roberta_mdl.forward(input_ids=torch.tensor(res['input_ids']), attention_mask=torch.tensor(res['attention_mask']))

In [None]:
# Encode every row of gpt3_dat in mini-batches and collect, per text field,
# the encoder's last hidden state at sequence position 0.
B = 12  # rows per forward pass
st, end = 0, gpt3_dat.shape[0]
ix_list = np.arange(st, end, B)

# Batch i covers rows st_ix[i] .. end_ix[i] - 1; the last batch may be shorter.
st_ix = ix_list
end_ix = np.append(ix_list[1:], end)

story_repr, state_repr, mod_state_repr, mod_story_repr = [], [], [], []

# Inference only: make sure dropout is disabled.
roberta_mdl.eval()

for s, e in tqdm(zip(st_ix, end_ix), total = len(st_ix)):
    rows_batch = gpt3_dat.iloc[s: e]

    story_tok = tokenizer.batch_encode_plus(rows_batch['Story'].tolist(), padding=True)
    state_tok = tokenizer.batch_encode_plus(rows_batch['assertion'].tolist(), padding=True)
    mod_state_tok = tokenizer.batch_encode_plus(rows_batch['mod_assertion'].tolist(), padding=True)
    mod_story_tok = tokenizer.batch_encode_plus(rows_batch['Mod_Story'].tolist(), padding=True)

    # Without no_grad every stored vector keeps its autograd graph alive, so
    # memory grows with each batch and the saved tensors require grad.
    with torch.no_grad():
        story_batch_repr = roberta_mdl(input_ids=torch.tensor(story_tok['input_ids']), attention_mask=torch.tensor(story_tok['attention_mask']))
        state_batch_repr = roberta_mdl(input_ids=torch.tensor(state_tok['input_ids']), attention_mask=torch.tensor(state_tok['attention_mask']))
        mod_state_batch_repr = roberta_mdl(input_ids=torch.tensor(mod_state_tok['input_ids']), attention_mask=torch.tensor(mod_state_tok['attention_mask']))
        mod_story_batch_repr = roberta_mdl(input_ids=torch.tensor(mod_story_tok['input_ids']), attention_mask=torch.tensor(mod_story_tok['attention_mask']))

    # extend() iterates over the batch dimension, adding one vector per row.
    story_repr.extend(story_batch_repr['last_hidden_state'][:, 0, :])
    state_repr.extend(state_batch_repr['last_hidden_state'][:, 0, :])
    mod_state_repr.extend(mod_state_batch_repr['last_hidden_state'][:, 0, :])
    mod_story_repr.extend(mod_story_batch_repr['last_hidden_state'][:, 0, :])