In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [None]:
import pandas as pd
import spacy
from ast import literal_eval
import nltk
nltk.download('punkt')
from collections import Counter
from tqdm import tqdm
import re
import string
from typing import Tuple
from datasets import load_dataset

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Training-set years processed throughout the notebook (2018 through 2022 inclusive).
years = list(range(2018, 2023))

In [None]:
def read_data(file_path):
    """Load the CSV at ``file_path`` with ``pd.read_csv`` and return the DataFrame."""
    return pd.read_csv(file_path)

In [None]:
def get_cwb_artefacts_imbalance(train_type, years):
    """Measure the contradiction-word label imbalance of each yearly training set.

    For every year the matching training CSV is loaded, the rows whose
    ``has_negation`` flag is set are selected, and the number of ``"N"`` rows
    minus the number of ``"Y"`` rows among them is recorded. The per-year
    counts are printed as they are computed.

    Args:
        train_type: ``"before"`` reads ``coliee_train_{year}.csv``;
            ``"after"`` reads ``coliee_aug_train_{year}.csv``.
        years: Iterable of years to process.

    Returns:
        A tuple ``(coliee_data_dict, cw_aug_data_need_dict,
        contradiction_train_data_dict)`` of dicts keyed by year, holding the
        full frame, the N-minus-Y difference, and the negation-only subset.

    Raises:
        ValueError: If ``train_type`` is neither ``"before"`` nor ``"after"``.
    """
    path_templates = {
        "before": "/content/drive/MyDrive/data/task 4/train/coliee_train_{year}.csv",
        "after": "/content/drive/MyDrive/data/task 4/train/coliee_aug_train_{year}.csv",
    }
    # Validate up front: previously an unknown value left the path variable
    # unbound (or silently reused the path from the previous iteration).
    path_template = path_templates.get(train_type) if isinstance(train_type, str) else None
    if path_template is None:
        raise ValueError(
            f"train_type must be 'before' or 'after', got {train_type!r}"
        )

    contradiction_train_data_dict = {}
    cw_aug_data_need_dict = {}
    coliee_data_dict = {}
    for year in years:
        df = read_data(path_template.format(year=year))
        # The augmented files written by this notebook already carry an
        # 'instance_type' column; only tag frames that do not have one so the
        # adversarial labels are not overwritten with "original".
        if 'instance_type' not in df.columns:
            df['instance_type'] = "original"
        print(f"Year : {year}")

        contradiction_df = df[df['has_negation']==True]
        print(f"Number of total instances with contradiction : {contradiction_df.shape}")

        entailment_contradiction_df = contradiction_df[contradiction_df['labels']=="Y"]
        print(f"Number of entailment instances with contradiction : {entailment_contradiction_df.shape}")
        non_entailment_contradiction_df = contradiction_df[contradiction_df['labels']=="N"]
        print(f"Number of non-entailment instances with contradiction : {non_entailment_contradiction_df.shape}")

        # Positive when there are more "N" than "Y" rows containing a negation.
        diff = non_entailment_contradiction_df.shape[0] - entailment_contradiction_df.shape[0]
        print(f"Difference in contradiction instances between non-entailment and entailment labels: {diff}")
        coliee_data_dict[year] = df
        cw_aug_data_need_dict[year] = diff
        contradiction_train_data_dict[year] = contradiction_df

    return coliee_data_dict,cw_aug_data_need_dict, contradiction_train_data_dict


coliee_data_dict,cw_aug_data_need_dict, contradiction_train_data_dict = get_cwb_artefacts_imbalance("before", years)

Year : 2018
Number of total instances with contradiction : (288, 16)
Number of entailment instances with contradiction : (138, 16)
Number of non-entailment instances with contradiction : (150, 16)
Difference in contradiction instances between non-entailment and entailment labels: 12
Year : 2019
Number of total instances with contradiction : (324, 16)
Number of entailment instances with contradiction : (152, 16)
Number of non-entailment instances with contradiction : (172, 16)
Difference in contradiction instances between non-entailment and entailment labels: 20
Year : 2020
Number of total instances with contradiction : (365, 16)
Number of entailment instances with contradiction : (171, 16)
Number of non-entailment instances with contradiction : (194, 16)
Difference in contradiction instances between non-entailment and entailment labels: 23
Year : 2021
Number of total instances with contradiction : (418, 16)
Number of entailment instances with contradiction : (195, 16)
Number of non-ent

In [None]:
cw_aug_data_need_dict

{2018: 12, 2019: 20, 2020: 23, 2021: 28, 2022: 29}

In [None]:
def get_wob_artefacts_imbalance(train_type, years, overlap_threshold=50):
    """Measure the word-overlap label imbalance of each yearly training set.

    For every year the matching training CSV is loaded and
    ``percent_overlap = overlap / hyp_length * 100`` is added. Rows flagged
    ``is_word_overlap`` but neither ``has_negation`` nor
    ``is_subsequence_heuristic`` are kept, and among those with
    ``percent_overlap >= overlap_threshold`` the number of ``"Y"`` rows minus
    the number of ``"N"`` rows is recorded. Rows below the threshold are not
    counted. The per-year counts are printed as they are computed.

    Args:
        train_type: ``"before"`` reads ``coliee_train_{year}.csv``;
            ``"after"`` reads ``coliee_aug_train_{year}.csv``.
        years: Iterable of years to process.
        overlap_threshold: Minimum ``percent_overlap`` (in percent) for a row
            to be counted. Defaults to 50, the value previously hard-coded.

    Returns:
        Dict mapping each year to the Y-minus-N difference.

    Raises:
        ValueError: If ``train_type`` is neither ``"before"`` nor ``"after"``.
    """
    path_templates = {
        "before": "/content/drive/MyDrive/data/task 4/train/coliee_train_{year}.csv",
        "after": "/content/drive/MyDrive/data/task 4/train/coliee_aug_train_{year}.csv",
    }
    # Validate up front: previously an unknown value left the path variable
    # unbound (or silently reused the path from the previous iteration).
    path_template = path_templates.get(train_type) if isinstance(train_type, str) else None
    if path_template is None:
        raise ValueError(
            f"train_type must be 'before' or 'after', got {train_type!r}"
        )

    print(years)
    wo_aug_data_need_dict = {}
    for year in years:
        print(year)
        df = read_data(path_template.format(year=year))
        print(f"Year : {year}")

        df['percent_overlap'] = (df['overlap'] / df['hyp_length']) * 100

        # Isolate pure word-overlap cases: exclude rows that also trigger the
        # negation or subsequence flags.
        wob_df = df[(df['is_word_overlap']==True) & (df['has_negation']==False) & (df['is_subsequence_heuristic']==False)]
        print(f"Number of total instances with Word Overlap : {wob_df.shape}")

        at_or_above_threshold = wob_df['percent_overlap'] >= overlap_threshold

        entailment_wob_df = wob_df[(wob_df['labels']=="Y") & at_or_above_threshold]
        print(f"Number of entailment instances with Word Overlaps : {entailment_wob_df.shape}")
        non_entailment_wob_df = wob_df[(wob_df['labels']=="N") & at_or_above_threshold]
        print(f"Number of non-entailment instances with Word Overlaps : {non_entailment_wob_df.shape}")

        # Positive when there are more "Y" than "N" high-overlap rows.
        diff = entailment_wob_df.shape[0] - non_entailment_wob_df.shape[0]
        print(f"Difference in word overlap instances between entailment and non-entailment labels above {overlap_threshold}%: {diff}")
        wo_aug_data_need_dict[year] = diff

    return wo_aug_data_need_dict

wo_aug_data_need_dict = get_wob_artefacts_imbalance(train_type="before", years=years)

[2018, 2019, 2020, 2021, 2022]
2018
Year : 2018
Number of total instances with Word Overlap : (277, 16)
Number of entailment instances with Word Overlaps : (109, 16)
Number of non-entailment instances with Word Overlaps : (70, 16)
Difference in word overlap instances between entailment and non-entailment labels above 50%: 39
2019
Year : 2019
Number of total instances with Word Overlap : (298, 16)
Number of entailment instances with Word Overlaps : (117, 16)
Number of non-entailment instances with Word Overlaps : (79, 16)
Difference in word overlap instances between entailment and non-entailment labels above 50%: 38
2020
Year : 2020
Number of total instances with Word Overlap : (327, 16)
Number of entailment instances with Word Overlaps : (129, 16)
Number of non-entailment instances with Word Overlaps : (87, 16)
Difference in word overlap instances between entailment and non-entailment labels above 50%: 42
2021
Year : 2021
Number of total instances with Word Overlap : (384, 16)
Number o

In [None]:
wo_aug_data_need_dict

{2018: 39, 2019: 38, 2020: 42, 2021: 49, 2022: 39}

In [None]:
cw_aug_df = read_data("/content/drive/MyDrive/data/task 4/train/data_aug_contradiction_instances.csv")
wo_aug_df = read_data("/content/drive/MyDrive/data/task 4/train/data_aug_word_overlap_instances.csv")

In [None]:
# Keep only the raw columns of the augmentation sets.
# .copy() turns each column subset into an independent frame, so adding
# columns to it later does not raise pandas' SettingWithCopyWarning.
aug_base_columns = ['id','label','premise','hypothesis','labels']
cw_aug_df = cw_aug_df[aug_base_columns].copy()
wo_aug_df = wo_aug_df[aug_base_columns].copy()

In [None]:
cw_aug_df.columns

Index(['id', 'label', 'premise', 'hypothesis', 'labels'], dtype='object')

In [None]:
# prepare the aug_df with the extra columns needed for merging
# addition of features
# sentence length, number of word overlaps, is there word overlap, is there a negation word, is there a subsequence heuristic

# Negation cues recognised by extract_negation(); matching is exact and case-sensitive.
negation_words = [
    "not", "no", "n't", "none", "neither", "never", "nobody",
    "nothing", "nowhere", "hardly", "scarcely", "barely", "rarely", "seldom",
]


def extract_negation(tokens):
    """Return the tokens found in ``negation_words``, in input order, keeping repeats."""
    found = []
    for tok in tokens:
        if tok in negation_words:
            found.append(tok)
    return found

def calculate_overlap(premise, hypothesis):
    """
    Count the tokens shared by the premise and the hypothesis.

    Both sentences are lower-cased and tokenized with ``nltk.word_tokenize``.
    The result is the size of the multiset intersection of the two token
    lists: a token occurring a times in one sentence and b times in the other
    contributes min(a, b). No threshold is applied; the raw count is returned.
    """
    premise_counts = Counter(nltk.word_tokenize(premise.lower()))
    hypothesis_counts = Counter(nltk.word_tokenize(hypothesis.lower()))

    # Counter & Counter keeps each shared token with the smaller of its two counts.
    shared_counts = premise_counts & hypothesis_counts
    return sum(shared_counts.values())

def detect_word_overlap_bias(df):
    """
    Add word-overlap columns to ``df`` (modified in place) and return it.

    Reads the 'premise' and 'hypothesis' columns and writes:
      * 'overlap'         - result of calculate_overlap() for the row,
      * 'is_word_overlap' - True when 'overlap' is greater than zero.
    """

    def _row_overlap(row):
        return calculate_overlap(row['premise'], row['hypothesis'])

    # Per-row shared-token count
    df['overlap'] = df.apply(_row_overlap, axis=1)

    # Boolean flag: does the pair share at least one token?
    df['is_word_overlap'] = df['overlap'].apply(lambda count: count > 0)

    return df

def remove_punctuation(input_string):
    """Return ``input_string`` with every character of ``string.punctuation`` removed."""
    translator = str.maketrans('', '', string.punctuation)
    return input_string.translate(translator)

def detect_subsequence(premise: str, hypothesis: str) -> Tuple[str, bool]:
    """Check whether the hypothesis occurs verbatim inside the premise.

    Both texts are lower-cased and stripped of ASCII punctuation, then the
    hypothesis is searched for in the premise as a literal string delimited
    by word boundaries.

    Args:
        premise: Text to search in.
        hypothesis: Text to search for.

    Returns:
        ``(hypothesis, True)`` with the original, unnormalised hypothesis when
        it is found; ``("", False)`` otherwise, including when the hypothesis
        contains nothing but punctuation or whitespace.
    """
    premise_words = remove_punctuation(premise.lower())
    hypothesis_words = remove_punctuation(hypothesis.lower())
    # An empty needle would build the pattern r"\b\b", which matches any
    # premise containing a word and would wrongly flag the pair.
    if not hypothesis_words.strip():
        return ("", False)
    # Escape the needle so it is always matched literally, then add word
    # boundaries so it cannot match in the middle of a longer word.
    pattern = r"\b" + re.escape(hypothesis_words) + r"\b"
    if re.search(pattern, premise_words):
        return (hypothesis, True)
    return ("", False)

def add_features(df):
    """Return a copy of ``df`` extended with the bias-analysis feature columns.

    Reads the 'premise' and 'hypothesis' columns and adds:
      * 'hyp_tokens' / 'prem_tokens' - lower-cased ``nltk.word_tokenize`` tokens,
      * 'hyp_length' / 'prem_length' - number of tokens,
      * 'overlap' / 'is_word_overlap' - from detect_word_overlap_bias(),
      * 'negations' / 'has_negation' - negation tokens in the hypothesis and
        whether there is at least one,
      * 'detected_subsequence' / 'is_subsequence_heuristic' - the pair
        returned by detect_subsequence() for each row.

    The input frame is left unmodified; callers must use the returned frame.
    """
    # Work on a copy: the caller's frame is not mutated, and passing a slice
    # of another frame no longer triggers SettingWithCopyWarning.
    df = df.copy()
    df['hyp_tokens'] = df['hypothesis'].apply(lambda text: nltk.word_tokenize(text.lower()))
    df['hyp_length'] = df['hyp_tokens'].apply(len)
    df['prem_tokens'] = df['premise'].apply(lambda text: nltk.word_tokenize(text.lower()))
    df['prem_length'] = df['prem_tokens'].apply(len)
    df = detect_word_overlap_bias(df)
    df['negations'] = df['hyp_tokens'].apply(extract_negation)
    df['has_negation'] = df['negations'].apply(bool)
    # One (matched_text, flag) pair per row, split into the two columns.
    subsequence_results = [
        detect_subsequence(premise, hypothesis)
        for premise, hypothesis in zip(df['premise'], df['hypothesis'])
    ]
    df['detected_subsequence'] = [matched for matched, _ in subsequence_results]
    df['is_subsequence_heuristic'] = [flag for _, flag in subsequence_results]

    return df

In [None]:
cw_aug_df = add_features(cw_aug_df)
wo_aug_df = add_features(wo_aug_df)

In [None]:
cw_aug_df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic
0,CW-AD-H18-2-4,1,Article 702\n(1) If a manager has incurred ben...,"""When someone fixes a neighbor's fence affecte...",Y,"[``, when, someone, fixes, a, neighbor, 's, fe...",52,"[article, 702, (, 1, ), if, a, manager, has, i...",98,18,True,[not],True,,False
1,CW-AD-H18-23-I,1,Article 537\n(1) If one of the parties promise...,"""In a scenario where individual A transfers a ...",Y,"[``, in, a, scenario, where, individual, a, tr...",106,"[article, 537, (, 1, ), if, one, of, the, part...",120,32,True,[not],True,,False
2,CW-AD-H18-26-1,1,Article 255\nIf one of co-owners waives intere...,"""If one co-owner of a property, shared equally...",Y,"[``, if, one, co-owner, of, a, property, ,, sh...",38,"[article, 255, if, one, of, co-owners, waives,...",22,6,True,[no],True,,False
3,CW-AD-H19-11-3,1,"Article 177\nAcquisitions of, losses of and ch...","""If A purchases a registered building from B, ...",Y,"[``, if, a, purchases, a, registered, building...",52,"[article, 177, acquisitions, of, ,, losses, of...",54,16,True,[not],True,,False
4,CW-AD-H19-12-4,1,Article 343\nA thing that cannot be transferre...,"""If a commitment is established to create a pl...",Y,"[``, if, a, commitment, is, established, to, c...",45,"[article, 343, a, thing, that, can, not, be, t...",22,11,True,[not],True,,False
5,CW-AD-H20-23-5,1,Article 588\nIf any person has an obligation t...,A quasi-loan contract shall not become effecti...,Y,"[a, quasi-loan, contract, shall, not, become, ...",21,"[article, 588, if, any, person, has, an, oblig...",48,8,True,"[not, not]",True,,False
6,CW-AD-H21-19-A,1,Article 447\n(1) The guarantee obligation incl...,"""In instances where a contract cancellation re...",Y,"[``, in, instances, where, a, contract, cancel...",86,"[article, 447, (, 1, ), the, guarantee, obliga...",62,21,True,[not],True,,False
7,CW-AD-H22-15-U,1,Article 677\nA partner's creditor may not exer...,A debtor to a partnership is not able to set o...,Y,"[a, debtor, to, a, partnership, is, not, able,...",20,"[article, 677, a, partner, 's, creditor, may, ...",19,5,True,[not],True,,False
8,CW-AD-H22-21-4,1,"Article 492\nUpon tendering the performance, t...","""When the person obliged to pay a financial ob...",Y,"[``, when, the, person, obliged, to, pay, a, f...",42,"[article, 492, upon, tendering, the, performan...",26,9,True,[not],True,,False
9,CW-AD-H23-9-2,1,Article 192\nA person that commences the posse...,Provisions for immediate acquisition are not a...,Y,"[provisions, for, immediate, acquisition, are,...",28,"[article, 192, a, person, that, commences, the...",41,4,True,[not],True,,False


In [None]:
wo_aug_df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic
0,WO-AD-H18-28-4,0,Article 465-3\n(1) If a contract for revolving...,"""If the principal crystallization date for a c...",N,"[``, if, the, principal, crystallization, date...",36,"[article, 465-3, (, 1, ), if, a, contract, for...",458,31,True,[],False,,False
1,WO-AD-H19-12-1,0,Article 350\nThe provisions of Articles 296 th...,"A holder of a statutory lien, a pledge, or a m...",N,"[a, holder, of, a, statutory, lien, ,, a, pled...",46,"[article, 350, the, provisions, of, articles, ...",142,32,True,[],False,,False
2,WO-AD-H19-16-2,0,Article 387\n(1) A registered lease may be dul...,A registered lease can be asserted against a m...,N,"[a, registered, lease, can, be, asserted, agai...",38,"[article, 387, (, 1, ), a, registered, lease, ...",94,25,True,[],False,,False
3,WO-AD-H22-26-U,0,Article 646\n(1) A mandatary must deliver to t...,If the mandatary has received monies and other...,N,"[if, the, mandatary, has, received, monies, an...",26,"[article, 646, (, 1, ), a, mandatary, must, de...",64,19,True,[],False,,False
4,WO-AD-H23-11-O,0,Article 702\n(1) If a manager has incurred ben...,Even if the manager has gone against the princ...,N,"[even, if, the, manager, has, gone, against, t...",30,"[article, 702, (, 1, ), if, a, manager, has, i...",98,20,True,[],False,,False
5,WO-AD-H24-8-5,0,Article 424\n(1) An obligee may demand the cou...,An obligee (B) against (A) may demand the cour...,N,"[an, obligee, (, b, ), against, (, a, ), may, ...",57,"[article, 424, (, 1, ), an, obligee, may, dema...",179,39,True,[],False,,False
6,WO-AD-H24-20-I,0,Article 465-3\n(1) If a contract for revolving...,If the day three years have passed from the da...,N,"[if, the, day, three, years, have, passed, fro...",39,"[article, 465-3, (, 1, ), if, a, contract, for...",481,32,True,[],False,,False
7,WO-AD-H24-27-O,0,Article 637\n(1) In the case prescribed in the...,Where the contractor delivers the subject matt...,N,"[where, the, contractor, delivers, the, subjec...",40,"[article, 637, (, 1, ), in, the, case, prescri...",152,34,True,[],False,,False
8,WO-AD-H25-13-3,0,Article 366\n(1) A pledgee may directly collec...,If monies are the subject matter of a pledged ...,N,"[if, monies, are, the, subject, matter, of, a,...",35,"[article, 366, (, 1, ), a, pledgee, may, direc...",152,31,True,[],False,,False
9,WO-AD-H25-16-4,0,Article 370\nA mortgage extends to the things ...,A mortgage shall extend to the buildings on th...,N,"[a, mortgage, shall, extend, to, the, building...",17,"[article, 370, a, mortgage, extends, to, the, ...",87,12,True,[],False,,False


In [None]:
def augment_train_data(train_data, aug_data, data_needed):
    """Append the first ``data_needed`` rows of ``aug_data`` to ``train_data``.

    Args:
        train_data: DataFrame to extend.
        aug_data: DataFrame supplying the extra rows, taken from the top.
        data_needed: Number of rows to append. Zero or a negative value
            appends nothing. If it exceeds ``len(aug_data)``, every available
            row is appended.

    Returns:
        A new DataFrame with the ``train_data`` rows followed by the selected
        ``aug_data`` rows, re-indexed 0..n-1 so index labels are unique.
    """
    # Clamp at zero: a negative count would otherwise be read as the slice
    # [:-k] and silently append all but the last k augmentation rows.
    rows_to_add = max(data_needed, 0)
    aug_data = aug_data.iloc[:rows_to_add]
    # ignore_index avoids duplicate labels from the two frames' own indexes.
    concat_data = pd.concat([train_data, aug_data], ignore_index=True)
    return concat_data

In [None]:
coliee_data_dict[2022]

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,instance_type
0,H18-1-1,1,Article 572\nEven if the seller makes a specia...,A special provision that releases warranty can...,Y,"['a', 'special', 'provision', 'that', 'release...",39,"['article', '572', 'even', 'if', 'the', 'selle...",81,22,True,['not'],True,,False,original
1,H18-1-2,0,Article 565\nThe provisions of the preceding t...,There is a limitation period on pursuance of w...,N,"['there', 'is', 'a', 'limitation', 'period', '...",44,"['article', '565', 'the', 'provisions', 'of', ...",176,18,True,['no'],True,,False,original
2,H18-1-3,0,Article 568\n(1) The successful bidder at an a...,"A compulsory auction is also a sale, so warran...",N,"['a', 'compulsory', 'auction', 'is', 'also', '...",20,"['article', '568', '(', '1', ')', 'the', 'succ...",208,10,True,[],False,,False,original
3,H18-2-1,1,Article 697\n(1) A person that has begun to ma...,In cases where a person plans to prevent crime...,Y,"['in', 'cases', 'where', 'a', 'person', 'plans...",34,"['article', '697', '(', '1', ')', 'a', 'person...",111,15,True,[],False,,False,original
4,H18-2-2,1,Article 698\nIf a manager engages in benevolen...,In cases where an individual rescues another p...,Y,"['in', 'cases', 'where', 'an', 'individual', '...",45,"['article', '698', 'if', 'a', 'manager', 'enga...",59,16,True,['not'],True,,False,original
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,R02-36-I,0,Article 153 (1) The postponement of the expiry...,The postponement of expiry of prescription per...,N,"['the', 'postponement', 'of', 'expiry', 'of', ...",31,"['article', '153', '(', '1', ')', 'the', 'post...",99,29,True,['not'],True,,False,original
883,R02-36-U,1,Article 187 (1) A successor to a possessor may...,A universal successor to a possessor may also ...,Y,"['a', 'universal', 'successor', 'to', 'a', 'po...",16,"['article', '187', '(', '1', ')', 'a', 'succes...",39,14,True,[],False,,False,original
884,R02-36-E,0,Article 254 A claim that one of the co-owners ...,A claim that a co-owner (A) holds against anot...,N,"['a', 'claim', 'that', 'a', 'co-owner', '(', '...",32,"['article', '254', 'a', 'claim', 'that', 'one'...",28,19,True,['not'],True,,False,original
885,R02-37-A,1,Article 406 If the subject matter of the claim...,If the obligor of an alternative obligation ma...,Y,"['if', 'the', 'obligor', 'of', 'an', 'alternat...",38,"['article', '406', 'if', 'the', 'subject', 'ma...",81,27,True,['not'],True,,False,original


In [None]:
# Tag the adversarial rows so they stay distinguishable after concatenation.
cw_aug_df['instance_type'] = "cw adversarial"
wo_aug_df['instance_type'] = "wo adversarial"

aug_train_data_dict = {}
for year in years:
    train_data = coliee_data_dict[year]
    print(f"Year : {year} : Original Train data: {train_data.shape[0]}")

    # Step 1: add contradiction-word (CW) adversarial instances.
    cw_aug_train_data = augment_train_data(train_data, cw_aug_df, cw_aug_data_need_dict[year])
    print(f"CW Augmented train data: {cw_aug_train_data.shape[0]}")
    # Sanity check: exactly the requested number of CW rows was added.
    cw_rows_added = cw_aug_train_data.shape[0] - train_data.shape[0]
    assert cw_aug_data_need_dict[year] == cw_rows_added, "The data added is not matching for CW aug"

    # Step 2: add word-overlap (WO) adversarial instances on top of the CW result.
    wo_aug_train_data = augment_train_data(cw_aug_train_data, wo_aug_df, wo_aug_data_need_dict[year])
    print(f"WO + CW Augmented train data: {wo_aug_train_data.shape[0]}")
    # Sanity check: exactly the requested number of WO rows was added.
    wo_rows_added = wo_aug_train_data.shape[0] - cw_aug_train_data.shape[0]
    assert wo_aug_data_need_dict[year] == wo_rows_added, "The data added is not matching for WO aug"

    aug_train_data_dict[year] = wo_aug_train_data

Year : 2018 : Original Train data: 567
CW Augmented train data: 579
WO + CW Augmented train data: 618
Year : 2019 : Original Train data: 625
CW Augmented train data: 645
WO + CW Augmented train data: 683
Year : 2020 : Original Train data: 695
CW Augmented train data: 718
WO + CW Augmented train data: 760
Year : 2021 : Original Train data: 806
CW Augmented train data: 834
WO + CW Augmented train data: 883
Year : 2022 : Original Train data: 887
CW Augmented train data: 916
WO + CW Augmented train data: 955


In [None]:
aug_train_data_dict[2022]

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,instance_type
0,H18-1-1,1,Article 572\nEven if the seller makes a specia...,A special provision that releases warranty can...,Y,"['a', 'special', 'provision', 'that', 'release...",39,"['article', '572', 'even', 'if', 'the', 'selle...",81,22,True,['not'],True,,False,original
1,H18-1-2,0,Article 565\nThe provisions of the preceding t...,There is a limitation period on pursuance of w...,N,"['there', 'is', 'a', 'limitation', 'period', '...",44,"['article', '565', 'the', 'provisions', 'of', ...",176,18,True,['no'],True,,False,original
2,H18-1-3,0,Article 568\n(1) The successful bidder at an a...,"A compulsory auction is also a sale, so warran...",N,"['a', 'compulsory', 'auction', 'is', 'also', '...",20,"['article', '568', '(', '1', ')', 'the', 'succ...",208,10,True,[],False,,False,original
3,H18-2-1,1,Article 697\n(1) A person that has begun to ma...,In cases where a person plans to prevent crime...,Y,"['in', 'cases', 'where', 'a', 'person', 'plans...",34,"['article', '697', '(', '1', ')', 'a', 'person...",111,15,True,[],False,,False,original
4,H18-2-2,1,Article 698\nIf a manager engages in benevolen...,In cases where an individual rescues another p...,Y,"['in', 'cases', 'where', 'an', 'individual', '...",45,"['article', '698', 'if', 'a', 'manager', 'enga...",59,16,True,['not'],True,,False,original
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,WO-AD-R01-9-U,0,Article 233\n(1) If a tree or bamboo branch fr...,If a tree or bamboo branch from a neighboring ...,N,"[if, a, tree, or, bamboo, branch, from, a, nei...",28,"[article, 233, (, 1, ), if, a, tree, or, bambo...",61,20,True,[],False,,False,wo adversarial
35,WO-AD-R1-17-I,0,Article 450\n(1) If an obligor has the obligat...,A person who can act needs to be the guarantor...,N,"[a, person, who, can, act, needs, to, be, the,...",20,"[article, 450, (, 1, ), if, an, obligor, has, ...",114,15,True,[],False,,False,wo adversarial
36,WO-AD-R1-21-E,0,Article 518\n(1) To the extent of the amount o...,To the extent of the amount of the obligation ...,N,"[to, the, extent, of, the, amount, of, the, ob...",49,"[article, 518, (, 1, ), to, the, extent, of, t...",137,39,True,[],False,,False,wo adversarial
37,WO-AD-R01-36-I,0,Article 663\n(1) If the parties have not speci...,If the depositor dies and the timing of the re...,N,"[if, the, depositor, dies, and, the, timing, o...",25,"[article, 663, (, 1, ), if, the, parties, have...",66,19,True,[],False,,False,wo adversarial


In [None]:
# Write each year's augmented training set to its CSV file, without the index column.
for year in years:
    aug_train_data_dict[year].to_csv(
        f"/content/drive/MyDrive/data/task 4/train/coliee_aug_train_{year}.csv",
        index=False,
    )

In [None]:
_,_, _ = get_cwb_artefacts_imbalance("after", years)

Year : 2018
Number of total instances with contradiction : (300, 16)
Number of entailment instances with contradiction : (150, 16)
Number of non-entailment instances with contradiction : (150, 16)
Difference in contradiction instances between non-entailment and entailment labels: 0
Year : 2019
Number of total instances with contradiction : (344, 16)
Number of entailment instances with contradiction : (172, 16)
Number of non-entailment instances with contradiction : (172, 16)
Difference in contradiction instances between non-entailment and entailment labels: 0
Year : 2020
Number of total instances with contradiction : (388, 16)
Number of entailment instances with contradiction : (194, 16)
Number of non-entailment instances with contradiction : (194, 16)
Difference in contradiction instances between non-entailment and entailment labels: 0
Year : 2021
Number of total instances with contradiction : (446, 16)
Number of entailment instances with contradiction : (223, 16)
Number of non-entail

In [None]:
_ = get_wob_artefacts_imbalance(train_type="after", years=years)

[2018, 2019, 2020, 2021, 2022]
2018
Year : 2018
Number of total instances with Word Overlap : (316, 17)
Number of entailment instances with Word Overlaps : (109, 17)
Number of non-entailment instances with Word Overlaps : (109, 17)
Difference in word overlap instances between entailment and non-entailment labels above 50%: 0
2019
Year : 2019
Number of total instances with Word Overlap : (336, 17)
Number of entailment instances with Word Overlaps : (117, 17)
Number of non-entailment instances with Word Overlaps : (117, 17)
Difference in word overlap instances between entailment and non-entailment labels above 50%: 0
2020
Year : 2020
Number of total instances with Word Overlap : (369, 17)
Number of entailment instances with Word Overlaps : (129, 17)
Number of non-entailment instances with Word Overlaps : (129, 17)
Difference in word overlap instances between entailment and non-entailment labels above 50%: 0
2021
Year : 2021
Number of total instances with Word Overlap : (433, 17)
Number o

In [None]:
# Load the 2018 augmented training CSV as a DatasetDict with a single 'train' split.
file_dict = {"train": "/content/drive/MyDrive/data/task 4/train/coliee_aug_train_2018.csv"}

# Column names are passed explicitly and the file's own header row is skipped.
train_column_names = [
    'id', 'label', 'premise', 'hypothesis', 'labels',
    'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length',
    'overlap', 'is_word_overlap', 'negations', 'has_negation',
    'detected_subsequence', 'is_subsequence_heuristic', 'instance_type',
]

dataset = load_dataset(
    'csv',
    data_files=file_dict,
    delimiter=',',
    column_names=train_column_names,
    skiprows=1,
)

# Drop the matched-text column; the boolean 'is_subsequence_heuristic' is kept.
dataset = dataset.remove_columns('detected_subsequence')

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'instance_type'],
        num_rows: 618
    })
})

In [None]:
datas = dataset['train'].train_test_split(test_size=0.1, seed=42)

datas

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'instance_type'],
        num_rows: 556
    })
    test: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'instance_type'],
        num_rows: 62
    })
})

In [None]:
datas['test']

Dataset({
    features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'instance_type'],
    num_rows: 62
})

In [None]:
data_type = "augmented"
# Fail fast on a typo instead of leaving the split variables undefined.
if data_type not in ("normal", "augmented"):
    raise ValueError(f"Unknown data_type {data_type!r}; expected 'normal' or 'augmented'")

full_train = dataset['train']
if data_type == "normal":
    # Ordered split: the first 90% of rows train, the remaining rows evaluate.
    num_train = int(0.9 * len(full_train))
    train_datasets = full_train.select(range(num_train))
    eval_datasets = full_train.select(range(num_train, len(full_train)))
else:
    # Seeded random 90/10 split.
    data_split = full_train.train_test_split(test_size=0.1, seed=42)
    train_datasets = data_split['train']
    eval_datasets = data_split['test']

# `dataset` was loaded with a 'train' split only, so the previous
# `dataset['test']` lookup raised KeyError on a fresh Restart & Run All.
# The test split is loaded here into its own variable instead.
# NOTE(review): assumes the adversarial test set is the intended test split - confirm.
adversarial_test = load_dataset(
    'csv',
    data_files={"test": "/content/drive/MyDrive/data/task 4/test/adversarial_test_set/adversarial_test_set.csv"},
    delimiter=',',
    column_names=['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens','hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'detected_subsequence', 'is_subsequence_heuristic', 'Artefact Type', 'Adv Type'],
    skiprows=1
)
test_datasets = adversarial_test['test'].remove_columns('detected_subsequence')


In [None]:
# Load the adversarial test CSV as a DatasetDict with a single 'test' split.
# Note: this rebinds `file_dict` and `dataset`.
file_dict = {"test": "/content/drive/MyDrive/data/task 4/test/adversarial_test_set/adversarial_test_set.csv"}

# Column names are passed explicitly and the file's own header row is skipped.
test_column_names = [
    'id', 'label', 'premise', 'hypothesis', 'labels',
    'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length',
    'overlap', 'is_word_overlap', 'negations', 'has_negation',
    'detected_subsequence', 'is_subsequence_heuristic',
    'Artefact Type', 'Adv Type',
]

dataset = load_dataset(
    'csv',
    data_files=file_dict,
    delimiter=',',
    column_names=test_column_names,
    skiprows=1,
)

# Drop the matched-text column; the boolean 'is_subsequence_heuristic' is kept.
dataset = dataset.remove_columns('detected_subsequence')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
tmp_dataset = dataset.filter(lambda row: row['Artefact Type']=="Word Overlap")

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

In [None]:
tmp_dataset

DatasetDict({
    test: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'Artefact Type', 'Adv Type', '__index_level_0__'],
        num_rows: 9
    })
})

In [None]:
dataset['test']['Artefact Type']

['Word Overlap',
 'Word Overlap',
 'Word Overlap',
 'Word Overlap',
 'Word Overlap',
 'Word Overlap',
 'Word Overlap',
 'Word Overlap',
 'Word Overlap',
 'Contradiction Word',
 'Contradiction Word',
 'Contradiction Word',
 'Contradiction Word',
 'Contradiction Word',
 'Contradiction Word',
 'Contradiction Word',
 'Contradiction Word',
 'Contradiction Word',
 'Contradiction Word',
 'Annotation Artefact (Y-Building)',
 'Annotation Artefact (Y-Building)',
 'Annotation Artefact (Y-Building)',
 'Annotation Artefact (Y-Building)',
 'Annotation Artefact (Y-Building)',
 'Annotation Artefact (Y-Building)',
 'Annotation Artefact (Y-Person)',
 'Annotation Artefact (Y-Person)',
 'Annotation Artefact (Y-Person)',
 'Annotation Artefact (Y-Person)',
 'Annotation Artefact (Y-Person)',
 'Annotation Artefact (Y-Person)',
 'Annotation Artefact (N - Rescind)',
 'Annotation Artefact (N - Rescind)',
 'Annotation Artefact (N - Rescind)',
 'Annotation Artefact (N - Rescind)',
 'Annotation Artefact (N - Rescin