In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

## Train Data

In [79]:
# Load the gold training annotations into `gold`.
# Forward slashes keep the path portable (a backslash-separated path only
# resolves on Windows), and the explicit UTF-8 encoding prevents the file from
# being decoded with the platform's default code page.
with open('data/raw/train.json', encoding='utf-8') as f:
    gold = json.load(f)

In [80]:
# Attach the premise text to every entry of `gold` (mutates the entries in place).
# For each entry, the section named by "Section_id" is read from the clinical-trial
# JSON file of "Primary_id" and its items are joined with single spaces into
# "primary_premise". 'Comparison' entries additionally get "secondary_premise",
# built the same way from the file of "Secondary_id".
for instance in gold.values():
    section = instance["Section_id"]

    # Explicit UTF-8 so the trial text is decoded the same way on every platform
    # (the TSV written later is UTF-8 as well).
    with open(f'data/raw/CT/{instance["Primary_id"]}.json', encoding='utf-8') as f:
        primary = json.load(f)
    instance["primary_premise"] = ' '.join(primary[section])

    if instance["Type"] == 'Comparison':
        with open(f'data/raw/CT/{instance["Secondary_id"]}.json', encoding='utf-8') as f:
            secondary = json.load(f)
        instance["secondary_premise"] = ' '.join(secondary[section])

In [81]:
# Export the entries of `gold` as a two-column TSV (sentence, label).
# The sentence concatenates Type, Section ID, Primary premise, (Secondary premise
# for 'Comparison' entries) and Statement, separated by " [SEP] ".
# The label is 0 for "Contradiction" and 1 for every other "Label" value.
with open('data/original/cnli/train.tsv', 'w', encoding='utf-8') as f:
    f.write('sentence\tlabel\n')

    # A tab or line break inside the text would split a row into extra columns
    # or extra lines, so those characters are flattened to plain spaces.
    flatten_whitespace = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})

    for instance in gold.values():
        label = 0 if instance["Label"] == "Contradiction" else 1

        fields = [
            f'Type : {instance["Type"]}',
            f'Section ID : {instance["Section_id"]}',
            f'Primary : {instance["primary_premise"]}',
        ]
        if instance["Type"] == 'Comparison':
            fields.append(f'Secondary : {instance["secondary_premise"]}')
        # The trailing space before the tab is part of the established row format.
        fields.append(f'Statement : {instance["Statement"]} ')

        sentence = ' [SEP] '.join(fields).translate(flatten_whitespace)
        f.write(f'{sentence}\t{label}\n')

In [82]:
# Draw a reproducible sample of 16 "Entailment" entries from `gold` and save
# their premises and statement as a JSON list.
random_seed = 42
np.random.seed(random_seed)

# Positions (in iteration order of `gold`) of the entries labelled "Entailment".
entailment_indices = [
    position
    for position, key in enumerate(gold)
    if gold[key]["Label"] == "Entailment"
]

# 16 distinct positions drawn without replacement.
random_indices = np.random.choice(entailment_indices, 16, replace=False)
selected_positions = set(random_indices.tolist())

# The examples keep the iteration order of `gold`, not the order of the draw.
# Entries that are not 'Comparison' get a fixed placeholder as secondary premise.
examples_lst = [
    {
        'primary_premise': entry["primary_premise"],
        'secondary_premise': (
            entry["secondary_premise"]
            if entry["Type"] == 'Comparison'
            else "No Secondary Premise."
        ),
        'statement': entry["Statement"],
    }
    for position, entry in enumerate(gold.values())
    if position in selected_positions
]

# Persist the sampled examples; the seed is part of the file name.
with open(f'data/original/cnli/examples-{random_seed}.json', 'w') as outfile:
    json.dump(examples_lst, outfile)

In [96]:
# Whitespace-token statistics over all entries of `gold`:
#   - total number of words (primary premise + secondary premise + statement),
#   - maximum words of a primary premise, a secondary premise and a statement,
#   - maximum combined words and maximum statement words, per type and per section.
# NOTE: entries that are not 'Comparison' are counted with the placeholder
# "No Secondary Premise." as their secondary premise, so its words are included
# in the totals and in the combined maxima.

max_premise_per_section = {}
max_premise_per_type = {}
max_statement_per_section = {}
max_statement_per_type = {}
max_primary_premise = 0
max_secondary_premise = 0
max_statement = 0
total_words = 0

for entry in gold.values():
    if entry["Type"] == 'Comparison':
        secondary_text = entry["secondary_premise"]
    else:
        secondary_text = "No Secondary Premise."

    # Count each text once and reuse the numbers below.
    n_primary = len(entry["primary_premise"].split())
    n_secondary = len(secondary_text.split())
    n_statement = len(entry["Statement"].split())
    n_combined = n_primary + n_secondary + n_statement

    total_words += n_combined
    max_primary_premise = max(max_primary_premise, n_primary)
    max_secondary_premise = max(max_secondary_premise, n_secondary)
    max_statement = max(max_statement, n_statement)

    type_id = entry["Type"]
    max_premise_per_type[type_id] = max(max_premise_per_type.get(type_id, 0), n_combined)
    max_statement_per_type[type_id] = max(max_statement_per_type.get(type_id, 0), n_statement)

    section_id = entry["Section_id"]
    max_premise_per_section[section_id] = max(max_premise_per_section.get(section_id, 0), n_combined)
    max_statement_per_section[section_id] = max(max_statement_per_section.get(section_id, 0), n_statement)

print(f"Total number of words in all premises and statements: {total_words}")
print(f"Maximum number of words in a primary premise: {max_primary_premise}")
print(f"Maximum number of words in a secondary premise: {max_secondary_premise}")
print(f"Maximum number of words in a statement: {max_statement}")
print(f"Maximum number of words in a premise and statement per type: {max_premise_per_type}")
print(f"Maximum number of words in a premise and statement per section: {max_premise_per_section}")
print(f"Maximum number of words in a statement per type: {max_statement_per_type}")
print(f"Maximum number of words in a statement per section: {max_statement_per_section}")

Total number of words in all premises and statements: 428597
Maximum number of words in a primary premise: 1547
Maximum number of words in a secondary premise: 1349
Maximum number of words in a statement: 65
Maximum number of words in a premise and statement per type: {'Comparison': 1736, 'Single': 1575}
Maximum number of words in a premise and statement per section: {'Intervention': 612, 'Eligibility': 1736, 'Adverse Events': 226, 'Results': 753}
Maximum number of words in a statement per type: {'Comparison': 52, 'Single': 65}
Maximum number of words in a statement per section: {'Intervention': 65, 'Eligibility': 52, 'Adverse Events': 48, 'Results': 45}


In [111]:
# Display one entry of `gold`: the 212th from the end, in the dict's iteration order.
list(gold.values())[-212]

{'Type': 'Single',
 'Section_id': 'Results',
 'Primary_id': 'NCT01808573',
 'Statement': 'the primary trial Patients receiving Neratinib Plus Capecitabine had a Mean (95% Confidence Interval)  Progression Free Survival more than 2 months longer than patients administered with Lapatinib Plus Capecitabine.',
 'Label': 'Entailment',
 'primary_premise': 'Outcome Measurement:    Centrally Assessed Progression Free Survival   Progression Free Survival (PFS), Measured in Months, for Randomized Subjects of the Central Assessment. The time interval from the date of randomization until the first date on which recurrence, progression (per Response Evaluation Criteria in Solid Tumors Criteria (RECIST) v1.1), or death due to any cause, is documented. For subjects without recurrence, progression or death, it is censored at the last valid tumor assessment. Progression is defined using Response Evaluation Criteria in Solid Tumors Criteria (RECIST v1.1), as a 20% increase in the sum of the longest diam

## Validation Data

In [71]:
# Load the gold validation annotations into `gold` (replaces any previous value).
# Forward slashes keep the path portable (a backslash-separated path only
# resolves on Windows), and the explicit UTF-8 encoding prevents the file from
# being decoded with the platform's default code page.
with open('data/raw/dev.json', encoding='utf-8') as f:
    gold = json.load(f)

In [72]:
# Attach the premise text to every entry of `gold` (mutates the entries in place).
# For each entry, the section named by "Section_id" is read from the clinical-trial
# JSON file of "Primary_id" and its items are joined with single spaces into
# "primary_premise". 'Comparison' entries additionally get "secondary_premise",
# built the same way from the file of "Secondary_id".
for instance in gold.values():
    section = instance["Section_id"]

    # Explicit UTF-8 so the trial text is decoded the same way on every platform
    # (the TSV written later is UTF-8 as well).
    with open(f'data/raw/CT/{instance["Primary_id"]}.json', encoding='utf-8') as f:
        primary = json.load(f)
    instance["primary_premise"] = ' '.join(primary[section])

    if instance["Type"] == 'Comparison':
        with open(f'data/raw/CT/{instance["Secondary_id"]}.json', encoding='utf-8') as f:
            secondary = json.load(f)
        instance["secondary_premise"] = ' '.join(secondary[section])

In [73]:

# Export the entries of `gold` as a two-column TSV (sentence, label).
# The sentence concatenates Type, Section ID, Primary premise, (Secondary premise
# for 'Comparison' entries) and Statement, separated by " [SEP] ".
# The label is 0 for "Contradiction" and 1 for every other "Label" value.
with open('data/original/cnli/dev.tsv', 'w', encoding='utf-8') as f:
    f.write('sentence\tlabel\n')

    # A tab or line break inside the text would split a row into extra columns
    # or extra lines, so those characters are flattened to plain spaces.
    flatten_whitespace = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})

    for instance in gold.values():
        label = 0 if instance["Label"] == "Contradiction" else 1

        fields = [
            f'Type : {instance["Type"]}',
            f'Section ID : {instance["Section_id"]}',
            f'Primary : {instance["primary_premise"]}',
        ]
        if instance["Type"] == 'Comparison':
            fields.append(f'Secondary : {instance["secondary_premise"]}')
        # The trailing space before the tab is part of the established row format.
        fields.append(f'Statement : {instance["Statement"]} ')

        sentence = ' [SEP] '.join(fields).translate(flatten_whitespace)
        f.write(f'{sentence}\t{label}\n')

In [74]:
# Number of entries currently held in `gold` (loaded from dev.json in the cells above).
len(gold)

200

## Test Data

In [24]:
# Load the test annotations into `gold` (replaces any previous value).
# Forward slashes keep the path portable (a backslash-separated path only
# resolves on Windows), and the explicit UTF-8 encoding prevents the file from
# being decoded with the platform's default code page.
with open('data/raw/test.json', encoding='utf-8') as f:
    gold = json.load(f)

In [25]:
# Attach the premise text to every entry of `gold` (mutates the entries in place).
# For each entry, the section named by "Section_id" is read from the clinical-trial
# JSON file of "Primary_id" and its items are joined with single spaces into
# "primary_premise". 'Comparison' entries additionally get "secondary_premise",
# built the same way from the file of "Secondary_id".
for instance in gold.values():
    section = instance["Section_id"]

    # Explicit UTF-8 so the trial text is decoded the same way on every platform
    # (the TSV written later is UTF-8 as well).
    with open(f'data/raw/CT/{instance["Primary_id"]}.json', encoding='utf-8') as f:
        primary = json.load(f)
    instance["primary_premise"] = ' '.join(primary[section])

    if instance["Type"] == 'Comparison':
        with open(f'data/raw/CT/{instance["Secondary_id"]}.json', encoding='utf-8') as f:
            secondary = json.load(f)
        instance["secondary_premise"] = ' '.join(secondary[section])

In [31]:
# Export the entries of `gold` as a two-column TSV (sentence, label).
# The sentence concatenates Type, Section ID, Primary premise, (Secondary premise
# for 'Comparison' entries) and Statement, separated by " [SEP] ".
# The label is 0 for "Contradiction" and 1 for every other "Label" value.
with open('data/original/cnli/test.tsv', 'w', encoding='utf-8') as f:
    f.write('sentence\tlabel\n')

    # A tab or line break inside the text would split a row into extra columns
    # or extra lines, so those characters are flattened to plain spaces.
    flatten_whitespace = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})

    for instance in gold.values():
        label = 0 if instance["Label"] == "Contradiction" else 1

        fields = [
            f'Type : {instance["Type"]}',
            f'Section ID : {instance["Section_id"]}',
            f'Primary : {instance["primary_premise"]}',
        ]
        if instance["Type"] == 'Comparison':
            fields.append(f'Secondary : {instance["secondary_premise"]}')
        # The trailing space before the tab is part of the established row format.
        fields.append(f'Statement : {instance["Statement"]} ')

        sentence = ' [SEP] '.join(fields).translate(flatten_whitespace)
        f.write(f'{sentence}\t{label}\n')

In [42]:
# Collect the premises of every entry of `gold` (no filtering by label) and save
# them as a JSON list. Entries that are not 'Comparison' get a fixed placeholder
# as secondary premise; statements are not included.
tst_lst = [
    {
        'primary_premise': entry["primary_premise"],
        'secondary_premise': (
            entry["secondary_premise"]
            if entry["Type"] == 'Comparison'
            else "No Secondary Premise."
        ),
    }
    for entry in gold.values()
]

# Write the collected premise pairs to disk.
with open('data/original/cnli/test_list.json', 'w') as outfile:
    json.dump(tst_lst, outfile)