In [33]:
import json
import pandas as pd
from collections import Counter

In [3]:
FILEPATH_BASE = 'QNLI/QNLI-Stress Test/'

FILEPATH_TRAIN = 'QNLI-Stress Test_train.json'
FILEPATH_TEST = 'QNLI-Stress Test_test.json'
FILEPATH_DEV = 'QNLI-Stress Test_dev.json'

with open(FILEPATH_BASE + FILEPATH_TRAIN, 'r') as file:
    train = json.load(file)

with open(FILEPATH_BASE + FILEPATH_DEV, 'r') as file:
    dev = json.load(file)

with open(FILEPATH_BASE + FILEPATH_TEST, 'r') as file:
    test = json.load(file)


print(len(train))
print(len(dev))
print(len(test))

6475
970
1691


In [9]:
train[0]

{'statement1': "'' Someone just came in and shot my daughter and husband , '' Flores ' wife frantically told 911 .",
 'statement2': 'Raul Flores , daughter , 9 , shot dead ; wire calls 911',
 'options': ' Entailment or neutral?',
 'answer': 'neutral',
 'type': 'Type_7',
 'statement1_sci_10E': "'' Someone just came in and shot my daughter and husband , '' Flores ' wife frantically told 9.1100000000E+02 .",
 'statement1_char': "'' Someone just came in and shot my daughter and husband , '' Flores ' wife frantically told 9 1 1 .",
 'statement1_sci_10E_char': "'' Someone just came in and shot my daughter and husband , '' Flores ' wife frantically told 9 . 1 1 0 0 0 0 0 0 0 0 E + 0 2 .",
 'statement2_sci_10E': 'Raul Flores , daughter , 9.0000000000E+00 , shot dead ; wire calls 9.0000000000E+0011',
 'statement2_char': 'Raul Flores , daughter , 9 , shot dead ; wire calls 9 1 1',
 'statement2_sci_10E_char': 'Raul Flores , daughter , 9 . 0 0 0 0 0 0 0 0 0 0 E + 0 0 , shot dead ; wire calls 9 . 0

# sanity check

In [17]:
def have_same_structure(dicts):
    if not dicts:
        return True  # An empty list is considered to have the same structure

    reference_keys = set(dicts[0].keys())

    for d in dicts[1:]:
        if set(d.keys()) != reference_keys:
            return False

    return True

In [20]:
print(have_same_structure(train))
print(have_same_structure(dev))
print(have_same_structure(test))

True
True
True


In [22]:
list(train[0].keys())

['statement1',
 'statement2',
 'options',
 'answer',
 'type',
 'statement1_sci_10E',
 'statement1_char',
 'statement1_sci_10E_char',
 'statement2_sci_10E',
 'statement2_char',
 'statement2_sci_10E_char',
 'statement1_mask',
 'statement2_mask',
 'EQUATE']

Okay so train/dev/test are all lists of dictionaries of the same keys.

# understand the fields

In [24]:
def get_unique_values(dicts, target_key):
    if not dicts:
        return True  # An empty list is considered to have the same structure

    freq_counts = Counter(item[target_key] for item in dicts)


    return freq_counts

In [32]:
print(get_unique_values(train, 'EQUATE'))
print(get_unique_values(dev, 'EQUATE'))
print(get_unique_values(test, 'EQUATE'))

Counter({'StressTest': 4619, 'NewsNLI': 968, 'AWPNLI': 722, 'RTE_Quant': 166})
Counter({'StressTest': 970})
Counter({'StressTest': 1691})


In [27]:
print(get_unique_values(train, 'options'))
print(get_unique_values(dev, 'options'))
print(get_unique_values(test, 'options'))

Counter({' Entailment or contradiction or neutral?': 4619, ' Entailment or neutral?': 1134, ' Entailment or contradiction?': 722})
Counter({' Entailment or contradiction or neutral?': 970})
Counter({' Entailment or contradiction or neutral?': 1691})


In [28]:
print(get_unique_values(train, 'answer'))
print(get_unique_values(dev, 'answer'))
print(get_unique_values(test, 'answer'))

Counter({'Entailment': 2461, 'neutral': 2112, 'contradiction': 1902})
Counter({'Entailment': 324, 'neutral': 323, 'contradiction': 323})
Counter({'contradiction': 564, 'neutral': 564, 'Entailment': 563})


In [29]:
print(get_unique_values(train, 'type'))
print(get_unique_values(dev, 'type'))
print(get_unique_values(test, 'type'))

Counter({'Type_7': 6475})
Counter({'Type_7': 970})
Counter({'Type_7': 1691})


Findings: 
* Only in the `train` set does the data come from different EQUATE subsets. `dev` and `test` all come from Stress dataset. 
* In `train` set, some of the questions can potentiallly be any of the 3 labels, some questions can only be one of the 2 labels. In `dev` and `test` all questions can have either of the 3 labels.

We don't need these 3 fields: `type`,`statement1_mask`, `statement2_mask`.   
* Because `type` are all the same (probably referring to the task number in NumGlue since QNLI is task 7 in NumGlue dataset).
* `statement1_mask`, `statement2_mask` are not relevant to our task. 

Relevant fields are: 
* (`statement1`, `statement2`) for original notation
 `options`,
* (`statement1_sci_10E`, `statement2_sci_10E`) for scientific notation
* (`statement1_char`, `statement2_char`) for character notation
* (`statement1_sci_10E_char`, `statement2_sci_10E_char`) for character-scientific notation
* `options` indicate the range of possible ansers (only have diff value in `train` set)
* `answer` to indicate weather its "Entailment", "Contradiction", or "Neutral"
* `EQUATE` to indicate which subset of EQUATE it comes from (only have diff value in `train` set)

# a closer look at data

In [35]:
df = pd.DataFrame(train)

cross_tab = pd.crosstab(index=df['options'], columns=df['EQUATE'])

cross_tab

EQUATE,AWPNLI,NewsNLI,RTE_Quant,StressTest
options,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Entailment or contradiction or neutral?,0,0,0,4619
Entailment or contradiction?,722,0,0,0
Entailment or neutral?,0,968,166,0


# is answers evenly distributed? 

In [45]:
def get_answer_percentage(dicts): 
    answer_counter = get_unique_values(dev, 'answer')
        
    answer_freq = {key: round(value / len(dev), 2) for key, value in answer_counter.items()}
    
    return answer_freq

In [48]:
get_answer_percentage(train) # wow it's so balenced, would it differ per source EQUATE? 

{'neutral': 0.33, 'Entailment': 0.33, 'contradiction': 0.33}

In [46]:
get_answer_percentage(dev)

{'neutral': 0.33, 'Entailment': 0.33, 'contradiction': 0.33}

In [47]:
get_answer_percentage(test)

{'neutral': 0.33, 'Entailment': 0.33, 'contradiction': 0.33}

In [None]:
train_stress = [if train['EQUATE'] == 'for record in train]
answer_dev = get_unique_values(dev, 'answer')
answer_test = get_unique_values(test, 'answer')

answer_freq_dev = {key: round(value / len(dev), 2) for key, value in answer_dev.items()}
answer_freq_test = {key: round(value / len(test), 2) for key, value in answer_test.items()}

print(answer_freq_dev)
print(answer_freq_test)

# end