In [None]:
import pandas as pd
import json
import itertools

In [None]:
# Fetch the repository whose data/RTE_data directory holds the RTE JSONL files
# referenced by RTE_DEV_PATH / RTE_TEST_PATH.
!git clone https://github.com/ShreyasheeSinha/Determining-Robustness-of-NLU-Models.git

Cloning into 'Determining-Robustness-of-NLU-Models'...
remote: Enumerating objects: 324, done.[K
remote: Counting objects: 100% (176/176), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 324 (delta 80), reused 122 (delta 56), pack-reused 148[K
Receiving objects: 100% (324/324), 1.90 MiB | 16.89 MiB/s, done.
Resolving deltas: 100% (160/160), done.


In [None]:
# CSV of paraphrases; the path points into Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# cell is visible in this notebook; confirm before a fresh run.
PATH = "/content/drive/MyDrive/paraphraser_work/MTurk data/Final dataset/all_grammatically_correct_thresholded_paraphrases.csv"
# RTE dev / test splits (JSON Lines) inside the repository cloned under /content.
RTE_DEV_PATH = "/content/Determining-Robustness-of-NLU-Models/data/RTE_data/RTE_dev.jsonl"
RTE_TEST_PATH = "/content/Determining-Robustness-of-NLU-Models/data/RTE_data/RTE_test.jsonl"

In [None]:
def read_jsonl_file(path):
  """Read a JSON Lines file into a list of decoded objects.

  Args:
    path: Location of a text file holding one JSON document per line.

  Returns:
    A list with one decoded value per non-blank line, in file order.
    An empty file yields an empty list.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  results = []
  # Decode as UTF-8 explicitly instead of relying on the platform default.
  with open(path, 'r', encoding='utf-8') as json_file:
    # Stream line by line rather than materialising the whole file first.
    for json_str in json_file:
      # Blank lines (e.g. a trailing newline at EOF) carry no record.
      if not json_str.strip():
        continue
      results.append(json.loads(json_str))
  return results

In [None]:
# Load both RTE splits: each JSONL record becomes one DataFrame row.
rte_dev_data = pd.DataFrame(read_jsonl_file(RTE_DEV_PATH))
rte_test_data = pd.DataFrame(read_jsonl_file(RTE_TEST_PATH))

In [None]:
# Load the paraphrase table. Later cells rely on its 'origin_id',
# 'paraphrase1'..'paraphrase3' and 'Unnamed: 0' columns.
paraphrased_data = pd.read_csv(PATH)

In [None]:
# Split the paraphrase table by RTE split, using the split name embedded in
# origin_id (e.g. "RTE2_test.column_s2_485").
# na=False: a row with a missing origin_id is assigned to neither split instead
# of producing an NA mask that boolean indexing rejects.
# .copy(): each split becomes an independent frame, so later edits to it
# neither touch paraphrased_data nor trigger SettingWithCopyWarning.
paraphrased_test_data = paraphrased_data[paraphrased_data['origin_id'].str.contains("test", na=False)].copy()
paraphrased_dev_data = paraphrased_data[paraphrased_data['origin_id'].str.contains("dev", na=False)].copy()

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# Reassigning instead of inplace=True avoids in-place mutation of a frame
# derived from paraphrased_data, and errors='ignore' keeps this cell safe to
# re-run once the column is already gone.
paraphrased_test_data = paraphrased_test_data.drop(columns=['Unnamed: 0'], errors='ignore')
paraphrased_dev_data = paraphrased_dev_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Inspect the loaded test split (columns used later: dataset, corpus_sent_id,
# sentence1, sentence2, gold_label, task).
rte_test_data

Unnamed: 0,dataset,corpus_sent_id,sentence2,gold_label,sentence1,task,id
0,RTE1_test.column,1066,The Nuttall and Gibson fires were caused by li...,True,"The biggest fires on Mount Graham , the lightn...",RC,1066_RTE1_test.column
1,RTE3_test.column,767,Mr Putin is a vehement critic of Mr Litvinenko .,False,There has been some speculation in the Russian...,SUM,767_RTE3_test.column
2,RTE1_test.column,39,Bill Clinton received a reported $ 10 million ...,True,Mr. Clinton received a hefty advance for the b...,PP,39_RTE1_test.column
3,RTE2_test.column,485,Insurgents have launched attacks on Iraq 's oi...,True,Insurgent attacks on pipelines have brought oi...,SUM,485_RTE2_test.column
4,RTE3_test.column,681,The court heard US appeals for the release of ...,True,They refused to appear in the World Court 10 y...,SUM,681_RTE3_test.column
...,...,...,...,...,...,...,...
2395,RTE2_test.column,578,Mathew Staver is a supporter of Miers .,False,"Mathew Staver , president of Liberty Counsel ,...",IE,578_RTE2_test.column
2396,RTE3_test.column,661,Kenya 's wildlife is managed in a profitable m...,False,"Leakey believed Kenya 's wildlife , which unde...",SUM,661_RTE3_test.column
2397,RTE3_test.column,214,French conscientious objectors are given speci...,False,Almost a century of French history has come to...,IR,214_RTE3_test.column
2398,RTE3_test.column,526,Earthquakes have the potential to generate a t...,True,The most recent underwater volcano eruption wa...,QA,526_RTE3_test.column


In [None]:
def get_paraphrases(data, key_prefix):
  """Collect the non-null paraphrases stored in the first row of `data`.

  Args:
    data: DataFrame with 'paraphrase1', 'paraphrase2' and 'paraphrase3'
      columns. Only its first row is consulted; further rows are ignored.
    key_prefix: String prepended to "_para1" / "_para2" / "_para3" to label
      each returned paraphrase.

  Returns:
    A list of (label, paraphrase) tuples in slot order, skipping slots whose
    value is null. An empty `data` yields an empty list.
  """
  if data.empty:
    return []

  first_row = data.iloc[0]
  labelled = (
      ("_para1", first_row['paraphrase1']),
      ("_para2", first_row['paraphrase2']),
      ("_para3", first_row['paraphrase3']),
  )
  return [
      (key_prefix + suffix, text)
      for suffix, text in labelled
      if not pd.isnull(text)
  ]

In [None]:
def generate_pairs(premise_list, hypothesis_list):
  """Return every (premise, hypothesis) combination as a list of 2-tuples.

  The result is the Cartesian product of the two inputs, ordered with the
  premise varying slowest. If either input is empty the result is empty.
  """
  return list(itertools.product(premise_list, hypothesis_list))

In [None]:
# Spot-check: look up the paraphrase row for one sentence2 ("_s2_") origin id.
paraphrased_test_data[paraphrased_test_data['origin_id'] == "RTE2_test.column_s2_485"]

Unnamed: 0,origin_id,sentence,paraphrase1,score1,range1,paraphrase2,score2,range2,paraphrase3,score3,range3
428,RTE2_test.column_s2_485,Insurgents have launched attacks on Iraq 's oi...,Insurgents have launched attacks on Iraq's oil...,2.0,100.0,Insurgents have launched attacks on the oil in...,2.0,86.0,,,


In [None]:
# Sanity check: 4 premises x 4 hypotheses should give all 16 combinations.
generate_pairs(["p", "p1", "p2", "p3"], ["h", "h1", "h2", "h3"])

[('p', 'h'),
 ('p', 'h1'),
 ('p', 'h2'),
 ('p', 'h3'),
 ('p1', 'h'),
 ('p1', 'h1'),
 ('p1', 'h2'),
 ('p1', 'h3'),
 ('p2', 'h'),
 ('p2', 'h1'),
 ('p2', 'h2'),
 ('p2', 'h3'),
 ('p3', 'h'),
 ('p3', 'h1'),
 ('p3', 'h2'),
 ('p3', 'h3')]

In [None]:
def generate_paraphrased_dataset(rte_data, paraphrased_data):
  """Expand each RTE example into every premise/hypothesis paraphrase pairing.

  For each row of `rte_data`, the original sentence1 and sentence2 are
  combined with whatever paraphrases `paraphrased_data` holds for them, and
  one output record is produced per (premise, hypothesis) combination,
  including the original/original pair.

  Args:
    rte_data: DataFrame with 'dataset', 'corpus_sent_id', 'sentence1',
      'sentence2', 'gold_label' and 'task' columns.
    paraphrased_data: DataFrame with an 'origin_id' column of the form
      "<dataset>_s1_<corpus_sent_id>" / "<dataset>_s2_<corpus_sent_id>", plus
      the paraphrase columns read by get_paraphrases.

  Returns:
    A DataFrame with one row per pair. 's1_para_id' / 's2_para_id' are set
    only when the corresponding sentence is a paraphrase, and 'silver_label'
    (a copy of the gold label) only when at least one side is a paraphrase.
  """
  para_suffixes = ("para1", "para2", "para3")
  records = []

  for _, example in rte_data.iterrows():
    dataset_name = example['dataset']
    sent_id = str(example['corpus_sent_id'])
    premise_origin = dataset_name + "_s1_" + sent_id
    hypothesis_origin = dataset_name + "_s2_" + sent_id

    premise_matches = paraphrased_data[paraphrased_data['origin_id'] == premise_origin]
    hypothesis_matches = paraphrased_data[paraphrased_data['origin_id'] == hypothesis_origin]

    # The original sentence always comes first, followed by its paraphrases.
    premise_variants = [("premise", example['sentence1'])]
    premise_variants += get_paraphrases(premise_matches, "premise")
    hypothesis_variants = [("hypothesis", example['sentence2'])]
    hypothesis_variants += get_paraphrases(hypothesis_matches, "hypothesis")

    for premise, hypothesis in generate_pairs(premise_variants, hypothesis_variants):
      premise_key, premise_text = premise
      hypothesis_key, hypothesis_text = hypothesis

      # Key order is kept as-is because it determines the output column order.
      record = {
          'dataset': dataset_name,
          'corpus_sent_id': example['corpus_sent_id'],
          'gold_label': example['gold_label'],
          'task': example['task'],
          'sentence1': premise_text,
          'sentence2': hypothesis_text,
      }

      if premise_key.endswith(para_suffixes):
        # The last six characters of the key are the "_paraN" suffix.
        record['s1_para_id'] = premise_origin + premise_key[-6:]
        record['silver_label'] = example['gold_label']

      if hypothesis_key.endswith(para_suffixes):
        record['s2_para_id'] = hypothesis_origin + hypothesis_key[-6:]
        record['silver_label'] = example['gold_label']

      records.append(record)

  return pd.DataFrame(records)

In [None]:
# Build the expanded test set: every original/paraphrase pairing per RTE example.
rte_test_paraphrased = generate_paraphrased_dataset(rte_test_data, paraphrased_test_data)

In [None]:
# Export the test set as CSV. to_csv is called without index=False, so the row
# index is written as an unnamed first column.
rte_test_paraphrased.to_csv("/content/drive/MyDrive/paraphraser_work/MTurk data/Generated_dataset/rte_test_paraphrased.csv")

In [None]:
# Build the expanded dev set: every original/paraphrase pairing per RTE example.
rte_dev_paraphrased = generate_paraphrased_dataset(rte_dev_data, paraphrased_dev_data)

In [None]:
# Export the dev set as CSV. to_csv is called without index=False, so the row
# index is written as an unnamed first column.
rte_dev_paraphrased.to_csv("/content/drive/MyDrive/paraphraser_work/MTurk data/Generated_dataset/rte_dev_paraphrased.csv")

In [None]:
def save_data(df, path):
    """Write `df` to `path` as JSON Lines, one record per row.

    Args:
        df: DataFrame to serialise.
        path: Destination file; it is opened in 'w' mode, so existing
            content is replaced.
    """
    with open(path, 'w') as out_file:
        jsonl_text = df.to_json(orient='records', lines=True)
        out_file.write(jsonl_text)

In [None]:
# Also export the dev set as JSON Lines.
save_data(rte_dev_paraphrased, "/content/drive/MyDrive/paraphraser_work/MTurk data/Generated_dataset/rte_dev_paraphrased.jsonl")

In [None]:
# Also export the test set as JSON Lines.
save_data(rte_test_paraphrased, "/content/drive/MyDrive/paraphraser_work/MTurk data/Generated_dataset/rte_test_paraphrased.jsonl")