<a href="https://colab.research.google.com/github/ShreyasheeSinha/Determining-Robustness-of-NLU-Models/blob/main/analysis_notebooks/significance_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

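Notebook with code to carry out significance testing: for each model, it compares how often predictions on paraphrased RTE test examples differ from the prediction on the original example against how often randomly drawn predictions (`nh_prediction`) differ, and runs a t-test on the two resampled distributions.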

In [None]:
# Mount Google Drive so that files under /content/gdrive/MyDrive (the
# prediction archive and the test set read below) are reachable from this
# Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Copy the model-prediction archive from the mounted Drive into the current
# working directory.
!cp /content/gdrive/MyDrive/paraphraser_work/model_predictions.tar.gz model_predictions.tar.gz

In [None]:
# Decompress the archive; the next cell unpacks the resulting
# model_predictions.tar.
! gzip -d model_predictions.tar.gz

In [None]:
# Unpack the tar archive; -v prints every extracted path, which is what
# produces the long listing in this cell's output.
! tar -xvf  'model_predictions.tar'

model_predictions/
model_predictions/MNLI_train_RTE_dev/
model_predictions/MNLI_train_RTE_dev/roberta-base/
model_predictions/MNLI_train_RTE_dev/roberta-base/roberta_base_2_class_rte_test_p_predictions.csv
model_predictions/MNLI_train_RTE_dev/roberta-base/roberta_base_2_class_hypo_only_rte_test_p_predictions.csv
model_predictions/MNLI_train_RTE_dev/roberta-base/roberta_base_3_class_rte_test_p_predictions.csv
model_predictions/MNLI_train_RTE_dev/roberta-base/roberta_base_3_class_hypo_only_rte_test_p_predictions.csv
model_predictions/MNLI_train_RTE_dev/roberta/
model_predictions/MNLI_train_RTE_dev/roberta/roberta-base/
model_predictions/MNLI_train_RTE_dev/roberta/roberta-base/roberta_base_3_class_rte_test_predictions.csv
model_predictions/MNLI_train_RTE_dev/roberta/roberta-base/roberta_base_3_class_rte_dev_predictions.csv
model_predictions/MNLI_train_RTE_dev/roberta/roberta-base/roberta_base_3_class_hypo_only_rte_dev_predictions.csv
model_predictions/MNLI_train_RTE_dev/roberta/roberta-ba

In [None]:
# Switch the working directory to the CBOW predictions folder of the extracted
# archive. NOTE(review): the cells visible below use absolute paths only, so
# this may not be required - confirm before removing.
%cd /content/model_predictions/MNLI_train_RTE_paraphrased_dev/cbow/

/content/model_predictions/MNLI_train_RTE_paraphrased_dev/cbow


In [None]:
import pandas as pd
import json
import numpy as np
from scipy import stats
from tqdm import tqdm
# np.random.seed(42)

In [None]:
def read_jsonl_file(path):
  """Read a JSON Lines file into a list of decoded records.

  Args:
    path: Path of the file to read. Every non-blank line must hold exactly one
      JSON value.

  Returns:
    A list with one decoded object per non-blank line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  results = []
  # Decode as UTF-8 explicitly so the result does not depend on the platform's
  # default locale encoding.
  with open(path, 'r', encoding='utf-8') as json_file:
    # Stream the file line by line instead of materialising every line first.
    for json_str in json_file:
      # Blank lines (for example a trailing newline at the end of the file)
      # carry no record, and json.loads would raise on them.
      if not json_str.strip():
        continue
      results.append(json.loads(json_str))
  return results

In [None]:
# Load the paraphrased RTE test set (one JSON record per line) from Drive into
# a DataFrame; this frame drives the row matching done by merge_data.
test_data = pd.DataFrame(read_jsonl_file("/content/gdrive/MyDrive/paraphraser_work/Final_datasets/rte_test_p.jsonl")) # Unannotated dataset: /content/gdrive/MyDrive/paraphraser_work/MTurk data/Generated_dataset/rte_test_paraphrased_p_d_or_h_d_sorted.jsonl
# Replace missing values with NaN. NOTE(review): presumably this normalises
# None (JSON null) paraphrase ids to NaN for the null checks in merge_data -
# confirm against the data.
test_data = test_data.fillna(value=np.nan)

In [None]:
# THIS IS KEY TO OUR WORK, DO NOT ALTER
# (The row-matching rules below are unchanged. Only the accumulation of the
# matched rows was updated, because DataFrame.append was removed in pandas 2.0.)
def merge_data(test_data, predictions):
  """Select, in test_data order, the prediction rows matching each test row.

  A prediction row matches a test row when "dataset" and "corpus_sent_id" are
  equal and each of "s1_para_id" / "s2_para_id" is either null on both sides
  or equal on both sides.

  Args:
    test_data: DataFrame with columns dataset, corpus_sent_id, s1_para_id and
      s2_para_id.
    predictions: DataFrame with the same four key columns plus whatever
      prediction columns the model produced.

  Returns:
    A DataFrame holding the matching rows of predictions, ordered by the test
    row they matched and keeping their original index labels. If test_data has
    no rows, an empty DataFrame without columns is returned.
  """
  # Collect the matched slices and concatenate once at the end; growing a
  # DataFrame row by row re-copies it on every iteration.
  matched_chunks = []

  for index, row in test_data.iterrows():
    dataset = row['dataset']
    corpus_sent_id = row['corpus_sent_id']
    s1_para_id = row['s1_para_id']
    s2_para_id = row['s2_para_id']
    prediction_rows = predictions[(predictions['dataset'] == dataset) & (predictions['corpus_sent_id'] == corpus_sent_id)]
    # NaN never compares equal to NaN, so null paraphrase ids must be matched
    # with isnull() rather than ==.
    if pd.isnull(s1_para_id) and pd.isnull(s2_para_id):
      prediction_rows = prediction_rows[(prediction_rows['s1_para_id'].isnull()) & (prediction_rows['s2_para_id'].isnull())]
    elif pd.isnull(s1_para_id):
      prediction_rows = prediction_rows[(prediction_rows['s1_para_id'].isnull()) & (prediction_rows['s2_para_id'] == s2_para_id)]
    elif pd.isnull(s2_para_id):
      prediction_rows = prediction_rows[(prediction_rows['s1_para_id'] == s1_para_id) & (prediction_rows['s2_para_id'].isnull())]
    else:
      prediction_rows = prediction_rows[(prediction_rows['s1_para_id'] == s1_para_id) & (prediction_rows['s2_para_id'] == s2_para_id)]
    matched_chunks.append(prediction_rows)

  # pd.concat raises on an empty list; keep the previous "empty frame" result.
  if not matched_chunks:
    return pd.DataFrame()
  return pd.concat(matched_chunks)

In [None]:
def is_significant_adam(df):
  """Tell whether nh_prediction flips more often than the model prediction.

  Rows are grouped by (dataset, corpus_sent_id). In a group, rows whose
  s1_para_id and s2_para_id are both null are the originals, and the
  "prediction" of the first one is the reference value. All other rows of the
  group are paraphrases; their "prediction" and their "nh_prediction" are each
  compared against the reference. Groups without an original row are skipped.

  Args:
    df: DataFrame with columns dataset, corpus_sent_id, s1_para_id,
      s2_para_id, prediction and nh_prediction.

  Returns:
    True when the percentage of paraphrase rows whose nh_prediction differs
    from the reference is strictly greater than the percentage whose
    prediction differs. False otherwise, including when no paraphrase row was
    counted at all (the percentages are undefined then; this case previously
    raised UnboundLocalError).
  """
  number_of_model_preds_different_from_ph = 0
  number_of_nh_preds_different_from_ph = 0
  total_model = 0
  total_nh = 0

  for _, group in df.groupby(["dataset", "corpus_sent_id"]):
    # Compute the "original row" mask once and reuse it for every selection.
    is_original = group['s1_para_id'].isnull() & group['s2_para_id'].isnull()
    original_preds = group[is_original]["prediction"].to_list()
    if not original_preds:
      continue
    ph_pred = original_preds[0]

    paraphrased = group[~is_original]
    paraphrased_model_preds = paraphrased["prediction"].to_list()
    paraphrased_nh_preds = paraphrased["nh_prediction"].to_list()

    total_nh += len(paraphrased_nh_preds)
    number_of_nh_preds_different_from_ph += sum(
        1 for pred in paraphrased_nh_preds if pred != ph_pred)

    total_model += len(paraphrased_model_preds)
    number_of_model_preds_different_from_ph += sum(
        1 for pred in paraphrased_model_preds if pred != ph_pred)

  # Both totals count the same paraphrase rows, so they are zero together.
  if total_model == 0 or total_nh == 0:
    return False

  percent_model_preds_change = number_of_model_preds_different_from_ph/total_model * 100
  percent_nh_preds_change = number_of_nh_preds_different_from_ph/total_nh * 100
  return percent_nh_preds_change > percent_model_preds_change

In [None]:
def is_significant_yash1(df):
  """Compute how often nh_prediction and prediction differ from the original.

  Rows are grouped by (dataset, corpus_sent_id). In a group, rows whose
  s1_para_id and s2_para_id are both null are the originals, and the
  "prediction" of the first one is the reference value. All other rows of the
  group are paraphrases; their "prediction" and their "nh_prediction" are each
  compared against the reference. Groups without an original row are skipped.

  Args:
    df: DataFrame with columns dataset, corpus_sent_id, s1_para_id,
      s2_para_id, prediction and nh_prediction.

  Returns:
    A tuple (percent_nh_preds_change, percent_model_preds_change): the
    percentage (0-100) of counted paraphrase rows whose nh_prediction,
    respectively prediction, differs from the reference. When no paraphrase
    row was counted both values are NaN (this case previously raised
    UnboundLocalError).
  """
  number_of_model_preds_different_from_ph = 0
  number_of_nh_preds_different_from_ph = 0
  total_model = 0
  total_nh = 0

  for _, group in df.groupby(["dataset", "corpus_sent_id"]):
    # Compute the "original row" mask once and reuse it for every selection.
    is_original = group['s1_para_id'].isnull() & group['s2_para_id'].isnull()
    original_preds = group[is_original]["prediction"].to_list()
    if not original_preds:
      continue
    ph_pred = original_preds[0]

    paraphrased = group[~is_original]
    paraphrased_model_preds = paraphrased["prediction"].to_list()
    paraphrased_nh_preds = paraphrased["nh_prediction"].to_list()

    total_nh += len(paraphrased_nh_preds)
    number_of_nh_preds_different_from_ph += sum(
        1 for pred in paraphrased_nh_preds if pred != ph_pred)

    total_model += len(paraphrased_model_preds)
    number_of_model_preds_different_from_ph += sum(
        1 for pred in paraphrased_model_preds if pred != ph_pred)

  # A change rate over zero rows is undefined; report NaN rather than a
  # made-up number so the caller can see that the sample was unusable.
  percent_model_preds_change = float('nan')
  percent_nh_preds_change = float('nan')
  if total_model > 0:
    percent_model_preds_change = number_of_model_preds_different_from_ph/total_model * 100
  if total_nh > 0:
    percent_nh_preds_change = number_of_nh_preds_different_from_ph/total_nh * 100
  return percent_nh_preds_change, percent_model_preds_change

In [None]:
# Yash 2

def yash_2(df, model_name, sample_frac=0.50, num_samples=1000, seed=0):
  """Resampling t-test with one fixed set of random nh predictions.

  A random 0/1 "nh_prediction" is drawn once for every row. The frame is then
  subsampled num_samples times; for each subsample is_significant_yash1 gives
  the nh and model change percentages. The two resulting lists are compared
  with an independent two-sample t-test (equal variances assumed), and the
  result is printed with the "yash2" tag.

  Args:
    df: Merged predictions frame; must have the columns that
      is_significant_yash1 reads apart from nh_prediction, which is added
      here. The caller's frame is not modified.
    model_name: Label printed next to the test result.
    sample_frac: Fraction of rows drawn for every subsample.
    num_samples: Number of subsamples to draw.
    seed: Value passed to np.random.seed before any random draw.

  Returns:
    The object returned by scipy.stats.ttest_ind for (nh, model).
  """
  # Work on a copy so the caller's frame does not silently gain (or have
  # overwritten) an nh_prediction column.
  df = df.copy()
  np.random.seed(seed)

  nh_options = [0, 1]
  # One np.random.choice call per row, exactly as before, so the sequence of
  # random draws for a given seed stays the same.
  df['nh_prediction'] = [np.random.choice(nh_options, 1)[0] for _ in range(len(df))]

  dist_nh_preds = []
  dist_model_preds = []
  for _ in tqdm(range(num_samples)):
    # No random_state is passed, so the sampling relies on the seed set above.
    sampled_df = df.sample(frac=sample_frac)
    nh_preds_change, model_preds_change = is_significant_yash1(sampled_df)
    dist_nh_preds.append(nh_preds_change)
    dist_model_preds.append(model_preds_change)

  # NOTE(review): the subsamples overlap, so the two lists are not independent
  # observations - confirm that the reported p-value is interpreted with that
  # in mind.
  result = stats.ttest_ind(dist_nh_preds, dist_model_preds, equal_var=True)
  print("yash2", model_name, result)
  return result

In [None]:
# Yash 3

def yash_3(df, model_name, sample_frac=0.50, num_samples=1000, seed=0):
  """Resampling t-test that redraws the random nh predictions every round.

  For each of num_samples rounds the NumPy global RNG is re-seeded (seed,
  seed + 1, ...), a fresh random 0/1 "nh_prediction" is drawn for every row,
  the frame is subsampled, and is_significant_yash1 gives the nh and model
  change percentages for that subsample. The two resulting lists are compared
  with an independent two-sample t-test (equal variances assumed), and the
  result is printed with the "yash3" tag.

  Args:
    df: Merged predictions frame; must have the columns that
      is_significant_yash1 reads apart from nh_prediction, which is added
      here. The caller's frame is not modified.
    model_name: Label printed next to the test result.
    sample_frac: Fraction of rows drawn for every subsample.
    num_samples: Number of rounds.
    seed: Seed used for the first round; round k uses seed + k.

  Returns:
    The object returned by scipy.stats.ttest_ind for (nh, model).
  """
  # Work on a copy so the caller's frame does not silently gain (or have
  # overwritten) an nh_prediction column.
  df = df.copy()
  nh_options = [0, 1]
  dist_nh_preds = []
  dist_model_preds = []

  for round_idx in tqdm(range(num_samples)):
    np.random.seed(seed + round_idx)
    # One np.random.choice call per row, exactly as before, so the sequence of
    # random draws for a given seed stays the same.
    df['nh_prediction'] = [np.random.choice(nh_options, 1)[0] for _ in range(len(df))]
    # No random_state is passed, so the sampling relies on the seed set above.
    sampled_df = df.sample(frac=sample_frac)
    nh_preds_change, model_preds_change = is_significant_yash1(sampled_df)
    dist_nh_preds.append(nh_preds_change)
    dist_model_preds.append(model_preds_change)

  # NOTE(review): the subsamples overlap, so the two lists are not independent
  # observations - confirm that the reported p-value is interpreted with that
  # in mind.
  result = stats.ttest_ind(dist_nh_preds, dist_model_preds, equal_var=True)
  print("yash3", model_name, result)
  return result

In [None]:
# Maps a model label (printed next to each test result) to the CSV holding
# that model's predictions on the paraphrased RTE test set.
# The first four files sit under /content/model_predictions/; the "gpt-3" and
# "roberta-mnli" files are expected directly under /content.
# NOTE(review): no cell visible here creates those two files - confirm how
# they get there.
model_name_path_dict = {
    "cbow": "/content/model_predictions/MNLI_train_RTE_paraphrased_dev/cbow/cbow_3_class_rte_test_p_predictions.csv",
    "bilstm": "/content/model_predictions/MNLI_train_RTE_paraphrased_dev/bilstm/bilstm_3_class_rte_test_p_predictions.csv",
    "roberta_mlp": "/content/model_predictions/MNLI_train_RTE_paraphrased_dev/roberta/roberta-large/roberta_large_3_class_rte_test_p_predictions.csv",
    "bert_mlp": "/content/model_predictions/MNLI_train_RTE_paraphrased_dev/bert/bert-large-uncased/bert_large_3_class_rte_test_p_predictions.csv",
    "gpt-3": "/content/rte_test_paraphrased_preds.csv",
    "roberta-mnli": "/content/roberta_mnli_3_class_rte_test_p_predictions.csv"
}

In [None]:
# For every model: load its prediction CSV, align the rows with test_data via
# merge_data, then run both resampling tests on the merged frame.
# In the saved output each yash_* call took roughly 24-25 minutes, and the run
# was interrupted during the yash_3 call for roberta_mlp, so results for the
# remaining models are not shown.
for key, value in model_name_path_dict.items():
  predictions = pd.read_csv(value)
  df = merge_data(test_data, predictions)
  yash_2(df, key)
  yash_3(df, key)

100%|██████████| 1000/1000 [23:51<00:00,  1.43s/it]


yash2 cbow Ttest_indResult(statistic=335.4300355450884, pvalue=0.0)


100%|██████████| 1000/1000 [25:22<00:00,  1.52s/it]


yash3 cbow Ttest_indResult(statistic=309.14293915031783, pvalue=0.0)


100%|██████████| 1000/1000 [23:38<00:00,  1.42s/it]


yash2 bilstm Ttest_indResult(statistic=303.56577027477425, pvalue=0.0)


100%|██████████| 1000/1000 [25:04<00:00,  1.50s/it]


yash3 bilstm Ttest_indResult(statistic=293.8636684879669, pvalue=0.0)


100%|██████████| 1000/1000 [23:54<00:00,  1.43s/it]


yash2 roberta_mlp Ttest_indResult(statistic=442.6747136339459, pvalue=0.0)


 76%|███████▌  | 755/1000 [18:31<06:00,  1.47s/it]


KeyboardInterrupt: ignored