In [2]:
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast,
    get_linear_schedule_with_warmup, 
    BertTokenizerFast, RobertaTokenizerFast)
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch
import pdb
from tqdm import tqdm
import torch.nn as nn
import os
import gc
gc.disable()
import time
import pandas as pd
import evaluate
from pasta_dataset import create_train_test_dataset

bert_score = evaluate.load("bertscore")
bleu_score = evaluate.load("google_bleu")
bleurt = evaluate.load("bleurt", module_type="metric")
rouge = evaluate.load('rouge')

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint /home/sayontan/.cache/huggingface/metrics/bleurt/default/downloads/extracted/98dc9460806ce3f1e4bb720f895eb85c10b0ce49c567cc7c70c9b108906be5cd/bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


2023-12-06 06:13:42.816067: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-12-06 06:13:42.816138: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: context
2023-12-06 06:13:42.816150: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: context
2023-12-06 06:13:42.816316: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
2023-12-06 06:13:42.816369: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 450.80.2
2023-12-06 06:13:42.816714: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other opera

INFO:tensorflow:BLEURT initialized.


In [3]:
data_dict = create_train_test_dataset(root = './../data')
train_data, test_data, val_data = data_dict['tr_dat'], data_dict['te_dat'], data_dict['val_dat']

test_data.loc[:, 'story'] = test_data.apply(lambda x: " ".join([x[f'Input.line{i}'] for i in range(1, 6)]), axis = 1)
test_data.loc[:, 'mod_story'] = test_data.apply(lambda x: " ".join([x[f'Answer.mod_line{i}'] for i in range(1, 6)]), axis = 1)
train_data.shape, val_data.shape, test_data.shape

----
Train_data shape :: (8476, 20)
test_data shape :: (917, 20)
val_data shape :: (1350, 20)
\----


((8476, 20), (1350, 20), (917, 22))

### Task 8 - Human performance

In [4]:
def get_acc(df, aid_subset = None):
    df['correct_pred'] = df.apply(lambda x: int((str(x['target_op']).strip() == str(x['gen_op']).strip())), axis = 1)

    if aid_subset != None:
        df_subset = df.loc[df.AssignmentId.isin(aid_subset)]
        df2 = df_subset.groupby(['AssignmentId', 'story_type'])['correct_pred'].sum()
        acc = df_subset['correct_pred'].to_numpy().mean()
        cons_acc = sum(df2.to_numpy() == 2)/len(df2)
        print(f'For subset :: Accuracy :: {acc*100:.1f} Contrastive accuracy :: {cons_acc*100:.1f}')

    df2 = df.groupby(['AssignmentId', 'story_type'])['correct_pred'].sum()
    acc = df['correct_pred'].to_numpy().mean()
    cons_acc = sum(df2.to_numpy() == 2)/len(df2)
    print(f'Accuracy :: {acc*100:.1f} Contrastive accuracy :: {cons_acc*100:.1f}')
    return

### Human eval data - path list
- Task 8 (Story-State Inference)
  - human  : ```./../human_eval_data/mturk_op/MturkOP_Te_200_t8_1.csv``` (setting 1; default task setting)
  - human  : ```./../human_eval_data/mturk_op/MturkOP_Te_200_t8.csv``` (setting 0; w/o justification set indicator)
- Task 6
  - T5-large : ```./../human_eval_data/mturk_op/MturkOP_Te_full_t_6_m_t5-large_b_4_lr_0.0001_w_1e-06_s_0_epoch_4.csv```
  - T5-base : ```./../human_eval_data/mturk_op/MturkOP_Te_full_t_6_m_t5-base_b_12_lr_0.0001_w_1e-06_s_0_epoch_6.csv```
  - GPT3: ```./../human_eval_data/mturk_op/MturkOP_Te_200_t6_GPT3_app_3_exs_10.csv```
- Task 7
  - T5-large: ```./../human_eval_data/mturk_op/MturkOP_Te_full_t_7_m_t5-large_b_4_lr_0.0001_w_1e-06_s_0_epoch_4.csv```
  - T5-base: ```./../human_eval_data/mturk_op/MturkOP_Te_full_t_7_m_t5-base_b_10_lr_0.0001_w_1e-06_s_0_epoch_6.csv```
  - GPT3: ```./../human_eval_data/mturk_op/MturkOP_Te_200_t7_GPT3_app_1_exs_5.csv```

In [5]:
task_8_human_response = pd.read_csv('./../human_eval_data/mturk_op/MturkOP_Te_200_t8.csv')
req_cols = ['Input.AssignmentId',  'Input.Input_line1', 'Input.Input_line2', 'Input.Input_line3', 'Input.Input_line4', 'Input.Input_line5',
            'Input.assertion', 'Input.story_state_flag', 'WorkerId',
            'Answer.sb_entail_a.0', 'Answer.sb_entail_a.1', 'Answer.sb_entail_a.2', 'Answer.sb_entail_a.3', 'Answer.sb_entail_a.4']
task_8_human_response = task_8_human_response.loc[:, req_cols]
task_8_human_response.rename(columns={"Input.AssignmentId": "AssignmentId"}, inplace=True)

task_8_human_response['story_type'] = task_8_human_response['Input.story_state_flag'].apply(lambda x: 'story' if x.startswith('story') == True else 'mod_story')
task_8_human_response['state_type'] = task_8_human_response['Input.story_state_flag'].apply(lambda x: 'mod_state' if x.endswith('mod_state') == True else 'state')
task_8_human_response['gen_op'] = task_8_human_response.apply(lambda x: [i for i in range(5) if x[f'Answer.sb_entail_a.{i}'] == True][0] >= 3 , axis=1)
task_8_human_response['target_op'] = task_8_human_response['Input.story_state_flag'].apply(lambda x: x in ['mod_story_mod_state', 'story_state'])

task_8_human_response2 = task_8_human_response.groupby(['AssignmentId', 'story_type', 'state_type', 'target_op'])['gen_op'].sum()>1
task_8_human_response2 = task_8_human_response2.reset_index()
get_acc(task_8_human_response2, aid_subset = None)
get_acc(task_8_human_response2.loc[task_8_human_response2.story_type == 'story'], aid_subset = None)
get_acc(task_8_human_response2.loc[task_8_human_response2.story_type == 'mod_story'], aid_subset = None)

Accuracy :: 93.5 Contrastive accuracy :: 88.8
Accuracy :: 96.2 Contrastive accuracy :: 93.0
Accuracy :: 90.8 Contrastive accuracy :: 84.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['correct_pred'] = df.apply(lambda x: int((str(x['target_op']).strip() == str(x['gen_op']).strip())), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['correct_pred'] = df.apply(lambda x: int((str(x['target_op']).strip() == str(x['gen_op']).strip())), axis = 1)


### Task 6

- #### T5

In [6]:
# human_eval_task_6 = pd.read_csv('./../human_eval_data/mturk_op/MturkOP_Te_full_t_6_m_t5-large_b_4_lr_0.0001_w_1e-06_s_0_epoch_4.csv')
human_eval_task_6 = pd.read_csv('./../human_eval_data/mturk_op/MturkOP_Te_full_t_6_m_t5-base_b_12_lr_0.0001_w_1e-06_s_0_epoch_6.csv')
human_eval_task_6['story_logical'] = human_eval_task_6.apply(lambda x: [i for i in range(2) if x[f'Answer.sb_is_logical.{i}'] == True][0] == 1, axis=1)
human_eval_task_6['correct_implication'] = human_eval_task_6.apply(lambda x: [i for i in range(5) if x[f'Answer.sb_entail_a.{i}']==True ][0]>=3, axis=1)
human_eval_task_6['similarity'] = human_eval_task_6.apply(lambda x: [i for i in range(4) if x[f'Answer.sb_sim_sa.{i}']==True ][0], axis=1)

human_eval_task_6_grp = human_eval_task_6.groupby(['Input.AssignmentId']).agg({'story_logical': 'mean', 'correct_implication': 'mean'}).reset_index()

human_eval_task_6_grp['story_logical'] = human_eval_task_6_grp['story_logical'].apply(lambda x: int(x > 0.5))
human_eval_task_6_grp['correct_implication'] = human_eval_task_6_grp['correct_implication'].apply(lambda x: int(x > 0.5))
human_eval_task_6_grp['logical_correct_implication'] = human_eval_task_6_grp.apply(lambda x: x['story_logical']*x['correct_implication'], axis=1)

print(f'>> Correct Implication score :: {human_eval_task_6_grp["correct_implication"].mean()*100}')
print(f'>> story is lgoical:: {human_eval_task_6_grp["story_logical"].mean()*100}')
print(f'>> Both story is lgoical and implies the conterfactual state:: {human_eval_task_6_grp["logical_correct_implication"].mean()*100}')

# print(f'>> Similarity score:: {100 - human_eval_task_6["similarity"].mean()*100}')
print(f'>> Similarity score:: {((3 - human_eval_task_6["similarity"] )/3).mean()*100}')

>> Correct Implication score :: 41.0
>> story is lgoical:: 77.0
>> Both story is lgoical and implies the conterfactual state:: 34.0
>> Similarity score:: 91.38888888888887


- GPT3

In [7]:
human_eval_task_6 = pd.read_csv('./../human_eval_data/mturk_op/MturkOP_Te_200_t6_GPT3_app_3_exs_10.csv')

human_eval_task_6['story_logical'] = human_eval_task_6.apply(lambda x: [i for i in range(2) if x[f'Answer.sb_is_logical.{i}'] == True][0] == 0, axis=1)
human_eval_task_6['correct_implication'] = human_eval_task_6.apply(lambda x: [i for i in range(5) if x[f'Answer.sb_entail_a.{i}']==True ][0]<=1, axis=1)

human_eval_task_6['similarity'] = human_eval_task_6.apply(lambda x: [i for i in range(4) if x[f'Answer.sb_sim_sa.{i}']==True ][0], axis=1)

human_eval_task_6_grp = human_eval_task_6.groupby(['Input.AssignmentId']).agg({'story_logical': 'mean', 'correct_implication': 'mean', 'similarity': 'mean'}).reset_index()

human_eval_task_6_grp['story_logical'] = human_eval_task_6_grp['story_logical'].apply(lambda x: int(x > 0.5))
human_eval_task_6_grp['correct_implication'] = human_eval_task_6_grp['correct_implication'].apply(lambda x: int(x > 0.5))
human_eval_task_6_grp['logical_correct_implication'] = human_eval_task_6_grp.apply(lambda x: x['story_logical']*x['correct_implication'], axis=1)

# print(human_eval_task_6_grp['story_logical'].mean(), human_eval_task_6_grp['correct_implication'].mean(), human_eval_task_6_grp['logical_correct_implication'].mean())

correct_implication = np.append(human_eval_task_6_grp["correct_implication"].to_numpy(), np.zeros(200 - len(human_eval_task_6_grp["correct_implication"])))
logical_story = np.append(human_eval_task_6_grp["story_logical"].to_numpy(), np.zeros(200 - len(human_eval_task_6_grp["story_logical"])))
logical_and_corr_impl = np.append(human_eval_task_6_grp["logical_correct_implication"].to_numpy(), np.zeros(200 - len(human_eval_task_6_grp["logical_correct_implication"])))
similarity = np.append(human_eval_task_6["similarity"].to_numpy(), np.zeros(600 - len(human_eval_task_6["similarity"])))

print(f'>> Correct Implication score :: {correct_implication.mean()*100:3f}')
print(f'>> story is lgoical:: {logical_story.mean()*100:3f}')
print(f'>> Both story is lgoical and implies the conterfactual state:: {logical_and_corr_impl.mean()*100}')
print(f'>> Similarity score:: {((3-similarity)/3).mean()*100}')

>> Correct Implication score :: 50.000000
>> story is lgoical:: 86.000000
>> Both story is lgoical and implies the conterfactual state:: 48.5
>> Similarity score:: 86.33333333333333


### Task 7

- for both T5 and GPT3

In [10]:
# human_eval_task_7 = pd.read_csv('./../human_eval_data/mturk_op/MturkOP_Te_full_t_7_m_t5-base_b_10_lr_0.0001_w_1e-06_s_0_epoch_6.csv')
human_eval_task_7 = pd.read_csv('./../human_eval_data/mturk_op/MturkOP_Te_full_t_7_m_t5-large_b_4_lr_0.0001_w_1e-06_s_0_epoch_4.csv')
# human_eval_task_7 = pd.read_csv('./../human_eval_data/mturk_op/MturkOP_Te_200_t7_GPT3_app_1_exs_5.csv')

human_eval_task_7['state_is_attr'] = human_eval_task_7.apply(lambda x: [i for i in range(2) if x[f'Answer.ass1_is_attr.{i}'] ==True ][0], axis=1)
human_eval_task_7['state_not_in_story'] = human_eval_task_7['Answer.ass_in_story.0'].apply(lambda x: int(x == True))
human_eval_task_7['state_imp_by_storya'] = human_eval_task_7.apply(lambda x: [i for i in range(5) if x[f'Answer.sa_entail_a.{i}'] ==True ][0] >= 3, axis=1)
human_eval_task_7['state_imp_by_storyb'] = human_eval_task_7.apply(lambda x: [i for i in range(5) if x[f'Answer.sb_entail_a.{i}'] ==True ][0] >= 3, axis=1)

human_eval_task_7_state = human_eval_task_7.loc[human_eval_task_7['Input.state_type'] == 'state', :]
human_eval_task_7_mod_state = human_eval_task_7.loc[human_eval_task_7['Input.state_type'] == 'mod_state', :]

human_eval_task_7_state_grp = human_eval_task_7_state.groupby('Input.AssignmentId').agg({'state_is_attr': 'mean', 'state_not_in_story': 'mean', 'state_imp_by_storya': 'mean', 'state_imp_by_storyb': 'mean'}).reset_index()
human_eval_task_7_mod_state_grp = human_eval_task_7_mod_state.groupby('Input.AssignmentId').agg({'state_is_attr': 'mean', 'state_not_in_story': 'mean', 'state_imp_by_storya': 'mean', 'state_imp_by_storyb': 'mean'}).reset_index()

human_eval_task_7_state_grp['state_is_attr'] = human_eval_task_7_state_grp['state_is_attr'].apply(lambda x: int(x > 0.5))
human_eval_task_7_mod_state_grp['state_is_attr'] = human_eval_task_7_mod_state_grp['state_is_attr'].apply(lambda x: int(x > 0.5))

human_eval_task_7_state_grp['state_not_in_story'] = human_eval_task_7_state_grp['state_not_in_story'].apply(lambda x: int(x > 0.5))
human_eval_task_7_mod_state_grp['state_not_in_story'] = human_eval_task_7_mod_state_grp['state_not_in_story'].apply(lambda x: int(x > 0.5))

human_eval_task_7_state_grp['correct_state_change'] = human_eval_task_7_state_grp.apply(lambda x: np.max(x['state_imp_by_storya'] - x['state_imp_by_storyb'], 0), axis=1)
human_eval_task_7_mod_state_grp['correct_state_change'] = human_eval_task_7_mod_state_grp.apply(lambda x: np.max(x['state_imp_by_storyb'] - x['state_imp_by_storya'], 0), axis=1)

human_eval_task_7_state_grp['overall_score'] = human_eval_task_7_state_grp.apply(lambda x: x['state_is_attr']*x['state_not_in_story']*x['correct_state_change'], axis=1)
human_eval_task_7_mod_state_grp['overall_score'] = human_eval_task_7_mod_state_grp.apply(lambda x: x['state_is_attr']*x['state_not_in_story']*x['correct_state_change'], axis=1)

col = ['state_is_attr', 'correct_state_change', 'state_not_in_story', 'overall_score']
for c in col:
    print(f'\n>> {c}')
    # var = (human_eval_task_7_state_grp[c].to_numpy()*human_eval_task_7_mod_state_grp[c].to_numpy()).mean()
    var = (human_eval_task_7_state_grp[c].mean() + human_eval_task_7_mod_state_grp[c].mean())/2
    print(var*100)


>> state_is_attr
99.25

>> correct_state_change
58.74999999999999

>> state_not_in_story
97.0

>> overall_score
55.49999999999999
