In [None]:
!pip install openai
!pip install -U sentence-transformers

Collecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/77.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/77.0 kB[0m [31m485.1 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m71.7/77.0 kB[0m [31m677.6 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m635.9 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.

In [None]:
import pandas as pd
from ast import literal_eval
import openai
from sentence_transformers import SentenceTransformer, util
import nltk
nltk.download('punkt')
from collections import Counter
from tqdm import tqdm
import spacy
import re
import string

import os

# Read the key from the OPENAI_API_KEY environment variable so the secret never has to be
# typed into (and saved with) the notebook. Falls back to an empty string when it is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = OPENAI_API_KEY

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Utils

In [None]:
# Negation cues; tokens are compared against this list exactly as given (case-sensitive).
negation_words = [
    "not", "no", "n't", "none", "neither", "never", "nobody",
    "nothing", "nowhere", "hardly", "scarcely", "barely", "rarely", "seldom",
]


def extract_negation(tokens):
    """Return, in order and with repeats, every token that appears in ``negation_words``."""
    found = []
    for candidate in tokens:
        if candidate in negation_words:
            found.append(candidate)
    return found

def calculate_overlap(premise, hypothesis):
    """
    Count the tokens shared by the premise and the hypothesis.

    Both sentences are lower-cased and tokenized with ``nltk.word_tokenize``. The result is
    the size of the multiset intersection of the two token lists: a token that occurs
    twice in both sentences contributes 2 to the count.
    """
    # One Counter per sentence, premise first and hypothesis second
    token_counts = [
        Counter(nltk.word_tokenize(sentence.lower()))
        for sentence in (premise, hypothesis)
    ]

    # Counter & Counter keeps each shared token with the smaller of its two counts
    shared_tokens = token_counts[0] & token_counts[1]

    return sum(shared_tokens.values())

def detect_word_overlap_bias(df):
    """
    Annotate every premise/hypothesis pair in ``df`` with its token overlap.

    Two columns are written onto the given dataframe:
      - 'overlap': the value returned by ``calculate_overlap`` for the row's
        'premise' and 'hypothesis';
      - 'is_word_overlap': True when that overlap is greater than zero.

    The same dataframe is returned.
    """

    def _row_overlap(row):
        return calculate_overlap(row['premise'], row['hypothesis'])

    # Per-row overlap count
    df['overlap'] = df.apply(_row_overlap, axis=1)

    # Flag rows that share at least one token
    df['is_word_overlap'] = df['overlap'].apply(lambda count: count > 0)

    return df

def remove_punctuation(input_string):
    """Return ``input_string`` with every character in ``string.punctuation`` removed."""
    translator = str.maketrans('', '', string.punctuation)
    return input_string.translate(translator)

def detect_subsequence(premise: str, hypothesis: str):
    """Check whether the hypothesis occurs verbatim, on word boundaries, inside the premise.

    Both texts are lower-cased and stripped of ASCII punctuation before matching.

    Args:
        premise: Text that is searched.
        hypothesis: Text looked for inside the premise.

    Returns:
        ``(hypothesis, True)`` with the original, unnormalised hypothesis when it is found,
        otherwise ``("", False)``.
    """
    premise_words = remove_punctuation(premise.lower())
    # Whitespace left at the edges (e.g. from "foo .") would defeat the \b anchors below.
    hypothesis_words = remove_punctuation(hypothesis.lower()).strip()

    # An empty needle would reduce the pattern to a bare "\b\b", which matches any premise
    # that contains a word; report it as "no subsequence" instead.
    if not hypothesis_words:
        return ("", False)

    # Escape the hypothesis so it is always matched literally, then add word boundaries.
    pattern = r"\b" + re.escape(hypothesis_words) + r"\b"
    if re.search(pattern, premise_words):
        return (hypothesis, True)
    return ("", False)

def add_features(df):
    """Add token, length, overlap, negation and subsequence columns to ``df`` and return it.

    Reads the 'premise' and 'hypothesis' columns and writes:
    'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', the columns added by
    ``detect_word_overlap_bias``, 'negations', 'has_negation', 'detected_subsequence'
    and 'is_subsequence_heuristic'.
    """

    def _lower_tokens(text):
        return nltk.word_tokenize(text.lower())

    # Lower-cased NLTK tokens and token counts for both sides of the pair
    df['hyp_tokens'] = df['hypothesis'].apply(_lower_tokens)
    df['hyp_length'] = df['hyp_tokens'].apply(len)
    df['prem_tokens'] = df['premise'].apply(_lower_tokens)
    df['prem_length'] = df['prem_tokens'].apply(len)

    df = detect_word_overlap_bias(df)

    # Negation words found among the hypothesis tokens
    df['negations'] = df['hyp_tokens'].apply(extract_negation)
    df['has_negation'] = df['negations'].apply(lambda found: len(found) > 0)

    # detect_subsequence yields (matched_text, flag) per row; split the pairs into two columns
    subsequence_results = df.apply(
        lambda row: detect_subsequence(row['premise'], row['hypothesis']),
        axis=1,
    )
    matched_texts, matched_flags = zip(*subsequence_results)
    df['detected_subsequence'] = matched_texts
    df['is_subsequence_heuristic'] = matched_flags

    return df

# Contradiction word artefacts data augmentation

In [None]:
def read_data(file_path):
    """Load the CSV at ``file_path`` into a DataFrame using pandas' default settings."""
    return pd.read_csv(file_path)

In [None]:


# Load each yearly training file and keep the rows whose hypothesis contains a negation
# word, printing how those rows split between the "Y" and "N" labels.
years = [2018, 2019, 2020, 2021, 2022]
contradiction_train_data_dict = {}
for year in years:
    data_file_path = f"/content/drive/MyDrive/data/task 4/train/coliee_train_{year}.csv"

    df = read_data(data_file_path)
    print(f"Year : {year}")

    # .copy() makes the filtered frame independent of `df`; columns are added to it in
    # later cells, which otherwise raises pandas' SettingWithCopyWarning.
    contradiction_df = df[df['has_negation']==True].copy()
    print(f"Number of total instances with contradiction : {contradiction_df.shape}")

    entailment_contradiction_df = contradiction_df[contradiction_df['labels']=="Y"]
    print(f"Number of entailment instances with contradiction : {entailment_contradiction_df.shape}")
    non_entailment_contradiction_df = contradiction_df[contradiction_df['labels']=="N"]
    print(f"Number of non-entailment instances with contradiction : {non_entailment_contradiction_df.shape}")

    diff = non_entailment_contradiction_df.shape[0] - entailment_contradiction_df.shape[0]
    print(f"Difference in contradiction instances between non-entailment and entailment labels: {diff}")

    contradiction_train_data_dict[year] = contradiction_df

Year : 2018
Number of total instances with contradiction : (288, 15)
Number of entailment instances with contradiction : (138, 15)
Number of non-entailment instances with contradiction : (150, 15)
Difference in contradiction instances between non-entailment and entailment labels: 12
Year : 2019
Number of total instances with contradiction : (324, 15)
Number of entailment instances with contradiction : (152, 15)
Number of non-entailment instances with contradiction : (172, 15)
Difference in contradiction instances between non-entailment and entailment labels: 20
Year : 2020
Number of total instances with contradiction : (365, 15)
Number of entailment instances with contradiction : (171, 15)
Number of non-entailment instances with contradiction : (194, 15)
Difference in contradiction instances between non-entailment and entailment labels: 23
Year : 2021
Number of total instances with contradiction : (418, 15)
Number of entailment instances with contradiction : (195, 15)
Number of non-ent

In [None]:
cwd_df = contradiction_train_data_dict[2022]
cwd_df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic
0,H18-1-1,1,Article 572\nEven if the seller makes a specia...,A special provision that releases warranty can...,Y,"['a', 'special', 'provision', 'that', 'release...",39,"['article', '572', 'even', 'if', 'the', 'selle...",81,22,True,['not'],True,,False
1,H18-1-2,0,Article 565\nThe provisions of the preceding t...,There is a limitation period on pursuance of w...,N,"['there', 'is', 'a', 'limitation', 'period', '...",44,"['article', '565', 'the', 'provisions', 'of', ...",176,18,True,['no'],True,,False
4,H18-2-2,1,Article 698\nIf a manager engages in benevolen...,In cases where an individual rescues another p...,Y,"['in', 'cases', 'where', 'an', 'individual', '...",45,"['article', '698', 'if', 'a', 'manager', 'enga...",59,16,True,['not'],True,,False
6,H18-2-4,1,Article 702\n(1) If a manager has incurred ben...,In cases where a person repairs the fence of a...,Y,"['in', 'cases', 'where', 'a', 'person', 'repai...",62,"['article', '702', '(', '1', ')', 'if', 'a', '...",98,21,True,['no'],True,,False
11,H18-9-2,0,Article 295\n(1) If a possessor of a thing bel...,Statutory real rights granted by way of securi...,N,"['statutory', 'real', 'rights', 'granted', 'by...",18,"['article', '295', '(', '1', ')', 'if', 'a', '...",196,8,True,['not'],True,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
880,R02-29-A,1,Article 509 The obligor of either of the follo...,If a person that holds a monetary claim has bo...,Y,"['if', 'a', 'person', 'that', 'holds', 'a', 'm...",50,"['article', '509', 'the', 'obligor', 'of', 'ei...",94,38,True,['not'],True,,False
882,R02-36-I,0,Article 153 (1) The postponement of the expiry...,The postponement of expiry of prescription per...,N,"['the', 'postponement', 'of', 'expiry', 'of', ...",31,"['article', '153', '(', '1', ')', 'the', 'post...",99,29,True,['not'],True,,False
884,R02-36-E,0,Article 254 A claim that one of the co-owners ...,A claim that a co-owner (A) holds against anot...,N,"['a', 'claim', 'that', 'a', 'co-owner', '(', '...",32,"['article', '254', 'a', 'claim', 'that', 'one'...",28,19,True,['not'],True,,False
885,R02-37-A,1,Article 406 If the subject matter of the claim...,If the obligor of an alternative obligation ma...,Y,"['if', 'the', 'obligor', 'of', 'an', 'alternat...",38,"['article', '406', 'if', 'the', 'subject', 'ma...",81,27,True,['not'],True,,False


In [None]:
cwd_df['percent_overlap'] = cwd_df['overlap'] / cwd_df['hyp_length']

In [None]:
cwd_df['num_negations'] = cwd_df['negations'].apply(lambda x: len(literal_eval(x)))
ent_cwd_df = cwd_df[cwd_df['labels']=="Y"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cwd_df['num_negations'] = cwd_df['negations'].apply(lambda x: len(literal_eval(x)))


In [None]:
ent_cwd_df['num_negations'].describe()

count    218.000000
mean       1.298165
std        0.605730
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        5.000000
Name: num_negations, dtype: float64

In [None]:
ent_cwd_df[ent_cwd_df['num_negations'] > 2].shape

(8, 16)

In [None]:
ent_cwd_df[(ent_cwd_df['percent_overlap']<0.35) & (ent_cwd_df['num_negations'] <= 2)].shape

(34, 17)

In [None]:
# Entailment rows with percent_overlap below 0.35 and at most two negations.
# .copy() yields an independent frame: new columns are assigned to it after the
# paraphrasing loop, which on a plain slice raises pandas' SettingWithCopyWarning.
contradiction_examples_df = ent_cwd_df[(ent_cwd_df['percent_overlap']<0.35) & (ent_cwd_df['num_negations'] <= 2)].copy()

In [None]:
contradiction_examples_df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,percent_overlap,num_negations
6,H18-2-4,1,Article 702\n(1) If a manager has incurred ben...,In cases where a person repairs the fence of a...,Y,"['in', 'cases', 'where', 'a', 'person', 'repai...",62,"['article', '702', '(', '1', ')', 'if', 'a', '...",98,21,True,['no'],True,,False,0.33871,1
23,H18-23-I,1,Article 537\n(1) If one of the parties promise...,In cases where person A sold a jewel to person...,Y,"['in', 'cases', 'where', 'person', 'a', 'sold'...",101,"['article', '537', '(', '1', ')', 'if', 'one',...",120,32,True,['not'],True,,False,0.316832,1
26,H18-26-1,1,Article 255\nIf one of co-owners waives intere...,In cases where person A and person B co-own bu...,Y,"['in', 'cases', 'where', 'person', 'a', 'and',...",40,"['article', '255', 'if', 'one', 'of', 'co-owne...",22,7,True,['no'],True,,False,0.175,1
46,H19-11-3,1,"Article 177\nAcquisitions of, losses of and ch...",In a case where A bought a registered building...,Y,"['in', 'a', 'case', 'where', 'a', 'bought', 'a...",52,"['article', '177', 'acquisitions', 'of', ',', ...",54,15,True,['not'],True,,False,0.288462,1
50,H19-12-4,1,Article 343\nA thing that cannot be transferre...,In cases where a contract that creates a pled...,Y,"['in', 'cases', 'where', 'a', 'contract', 'tha...",45,"['article', '343', 'a', 'thing', 'that', 'can'...",22,11,True,['not'],True,,False,0.244444,1
96,H20-23-5,1,Article 588\nIf any person has an obligation t...,A quasi-loan contract shall not become effecti...,Y,"['a', 'quasi-loan', 'contract', 'shall', 'not'...",18,"['article', '588', 'if', 'any', 'person', 'has...",48,5,True,"['not', 'not']",True,,False,0.277778,2
145,H21-19-A,1,Article 447\n(1) The guarantee obligation incl...,Assuming that a recovery obligation due to can...,Y,"['assuming', 'that', 'a', 'recovery', 'obligat...",103,"['article', '447', '(', '1', ')', 'the', 'guar...",62,24,True,"['not', 'not']",True,,False,0.23301,2
190,H22-15-U,1,Article 677\nA partner's creditor may not exer...,An obligor of a partnership cannot set off his...,Y,"['an', 'obligor', 'of', 'a', 'partnership', 'c...",18,"['article', '677', 'a', 'partner', ""'s"", 'cred...",19,6,True,['not'],True,,False,0.333333,1
196,H22-21-4,1,"Article 492\nUpon tendering the performance, t...",In cases where the obligor of a monetary debt ...,Y,"['in', 'cases', 'where', 'the', 'obligor', 'of...",42,"['article', '492', 'upon', 'tendering', 'the',...",26,8,True,['not'],True,,False,0.190476,1
230,H23-9-2,1,Article 192\nA person that commences the posse...,Provisions for immediate acquisition do not ap...,Y,"['provisions', 'for', 'immediate', 'acquisitio...",26,"['article', '192', 'a', 'person', 'that', 'com...",41,7,True,['not'],True,,False,0.269231,1


In [None]:
# System message used for the hypothesis-paraphrasing chat requests.
system_prompt = '''You are a seasoned legal NLP Researcher. You are skilled in understanding complex legal texts and paraphrase them
effectively based on the instructions provided.
'''

def build_contradiction_paraphrase_prompt(premise, hypothesis):
    """Build the user prompt that asks for a negation-preserving paraphrase of ``hypothesis``.

    The prompt contains the instructions, one worked example, and the real
    premise/hypothesis pair formatted the same way as the example (both values quoted).

    Args:
        premise: Premise text of the pair.
        hypothesis: Hypothesis text to be paraphrased.

    Returns:
        The complete prompt string.
    """
    # The "Real Input" section mirrors the quoting of the "Example Input" section; the
    # previous template left both values unquoted and the "Hypothesis" key unterminated.
    contradiction_paraphrase_prompt = f'''You will be provided with a Legal Natural Language Inference pair containing premise and hypothesis.
    Your task is to paraphrase the provided original hypothesis by following the provided instructions.
    INSTRUCTIONS:
    1. In the paraphrased hypothesis negation words MUST to be preserved. For example words such as "no", "not", "cannot" etc. If needed, you can introduce more negation words but the original
    negation words must not be eliminated.
    2. There should not be more overlap between premise and hypothesis.
    3. Provided only the paraphrased hypothesis and nothing else.

    Example Input:

        "Premise":"Article 705
        A person that has paid money or delivered anything as performance of an obligation may not demand the return of the money paid or
        thing delivered if the person knew, at the time, that the obligation did not exist..",
        "Hypothesis": "A person who has tendered anything as performance of an obligation may not demand the return of the thing tendered
        if the person were negligent in not knowing that the obligation did not exist."

    Example Output:
        "Paraphrased Hypothesis": "If an individual was careless and failed to realize that an obligation was nonexistent at the time they fulfilled it,
        they are not entitled to ask for the return of the items they provided."


    Real Input:
        "Premise":"{premise}",
        "Hypothesis": "{hypothesis}"

    Real Output:
        "Paraphrased Hypothesis":

    '''

    return contradiction_paraphrase_prompt


def build_validation_prompt(hypothesis, paraphrased_hypothesis):
    """Return the YES/NO prompt asking whether the two given sentences have the same meaning.

    ``hypothesis`` is inserted as "Sentence 1" and ``paraphrased_hypothesis`` as "Sentence 2".
    """
    return f'''You are given two sentences. Does the following two sentences contain the same meaning?
        You must provide the answer only as YES or NO.

        Sentence 1:{hypothesis}
        Sentence 2:{paraphrased_hypothesis}
    '''


In [None]:
# For every selected example: paraphrase the hypothesis with GPT-4, ask GPT-4 whether the
# paraphrase keeps the meaning, and score the pair by embedding cosine similarity.
paraphrased_texts = []
validations = []
scores = []

# Loop-invariant: build the embedding model once instead of re-loading it for every row.
model = SentenceTransformer('BAAI/bge-large-en-v1.5')

for i in range(contradiction_examples_df.shape[0]):

    premise = contradiction_examples_df['premise'].iloc[i]
    hypothesis = contradiction_examples_df['hypothesis'].iloc[i]

    # 1) Paraphrase the hypothesis
    contradiction_paraphrase_prompt = build_contradiction_paraphrase_prompt(premise, hypothesis)

    data_completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": contradiction_paraphrase_prompt}
        ]
    )

    paraphrased_hypothesis = data_completion.choices[0].message['content'].strip()
    paraphrased_texts.append(paraphrased_hypothesis)

    # 2) Ask the model whether original and paraphrase mean the same
    validation_prompt = build_validation_prompt(hypothesis, paraphrased_hypothesis)

    val_completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are tasked with validating provided sentences."},
            {"role": "user", "content": validation_prompt}
        ]
    )

    validation = val_completion.choices[0].message['content'].strip()
    validations.append(validation)

    # 3) Cosine similarity between the embeddings of the two sentences
    embeddings1 = model.encode([hypothesis], convert_to_tensor=True)
    embeddings2 = model.encode([paraphrased_hypothesis], convert_to_tensor=True)
    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    # .item() stores a plain Python float; keeping the 0-d tensor makes the saved CSV
    # contain text such as "tensor(0.8492)" instead of a number.
    similarity_score = cosine_scores[0][0].item()
    scores.append(similarity_score)


contradiction_examples_df['Mod Hypothesis'] = paraphrased_texts
contradiction_examples_df['Validation'] = validations
contradiction_examples_df['similarity_scores'] = scores

In [None]:
contradiction_examples_df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,percent_overlap,num_negations,Mod Hypothesis,Validation,similarity_scores
6,H18-2-4,1,Article 702\n(1) If a manager has incurred ben...,In cases where a person repairs the fence of a...,Y,"['in', 'cases', 'where', 'a', 'person', 'repai...",62,"['article', '702', '(', '1', ')', 'if', 'a', '...",98,21,True,['no'],True,,False,0.33871,1,"""When someone fixes a neighbor's fence affecte...",YES,tensor(0.8492)
23,H18-23-I,1,Article 537\n(1) If one of the parties promise...,In cases where person A sold a jewel to person...,Y,"['in', 'cases', 'where', 'person', 'a', 'sold'...",101,"['article', '537', '(', '1', ')', 'if', 'one',...",120,32,True,['not'],True,,False,0.316832,1,"""In a scenario where individual A transfers a ...",YES,tensor(0.8756)
26,H18-26-1,1,Article 255\nIf one of co-owners waives intere...,In cases where person A and person B co-own bu...,Y,"['in', 'cases', 'where', 'person', 'a', 'and',...",40,"['article', '255', 'if', 'one', 'of', 'co-owne...",22,7,True,['no'],True,,False,0.175,1,"""If one co-owner of a property, shared equally...",YES,tensor(0.8055)
46,H19-11-3,1,"Article 177\nAcquisitions of, losses of and ch...",In a case where A bought a registered building...,Y,"['in', 'a', 'case', 'where', 'a', 'bought', 'a...",52,"['article', '177', 'acquisitions', 'of', ',', ...",54,15,True,['not'],True,,False,0.288462,1,"""If A purchases a registered building from B, ...",YES,tensor(0.9387)
50,H19-12-4,1,Article 343\nA thing that cannot be transferre...,In cases where a contract that creates a pled...,Y,"['in', 'cases', 'where', 'a', 'contract', 'tha...",45,"['article', '343', 'a', 'thing', 'that', 'can'...",22,11,True,['not'],True,,False,0.244444,1,"""If a commitment is established to create a pl...",YES,tensor(0.8117)
96,H20-23-5,1,Article 588\nIf any person has an obligation t...,A quasi-loan contract shall not become effecti...,Y,"['a', 'quasi-loan', 'contract', 'shall', 'not'...",18,"['article', '588', 'if', 'any', 'person', 'has...",48,5,True,"['not', 'not']",True,,False,0.277778,2,"""A sham loan agreement will not come into effe...",YES,tensor(0.7973)
145,H21-19-A,1,Article 447\n(1) The guarantee obligation incl...,Assuming that a recovery obligation due to can...,Y,"['assuming', 'that', 'a', 'recovery', 'obligat...",103,"['article', '447', '(', '1', ')', 'the', 'guar...",62,24,True,"['not', 'not']",True,,False,0.23301,2,"""If we look at an obligation to recover upon c...",YES,tensor(0.9093)
190,H22-15-U,1,Article 677\nA partner's creditor may not exer...,An obligor of a partnership cannot set off his...,Y,"['an', 'obligor', 'of', 'a', 'partnership', 'c...",18,"['article', '677', 'a', 'partner', ""'s"", 'cred...",19,6,True,['not'],True,,False,0.333333,1,"""A partner's debtor does not have the ability ...",YES,tensor(0.7270)
196,H22-21-4,1,"Article 492\nUpon tendering the performance, t...",In cases where the obligor of a monetary debt ...,Y,"['in', 'cases', 'where', 'the', 'obligor', 'of...",42,"['article', '492', 'upon', 'tendering', 'the',...",26,8,True,['not'],True,,False,0.190476,1,"""When the person obliged to pay a financial ob...",YES,tensor(0.7586)
230,H23-9-2,1,Article 192\nA person that commences the posse...,Provisions for immediate acquisition do not ap...,Y,"['provisions', 'for', 'immediate', 'acquisitio...",26,"['article', '192', 'a', 'person', 'that', 'com...",41,7,True,['not'],True,,False,0.269231,1,"""If an inheritor takes over another individual...",YES,tensor(0.8679)


In [None]:
contradiction_examples_df.to_csv("/content/drive/MyDrive/data/task 4/train/data_aug_contradiction_instances.csv", index=False)

In [None]:
df = read_data("/content/drive/MyDrive/data/task 4/train/data_aug_contradiction_instances.csv")
df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,...,detected_subsequence,is_subsequence_heuristic,percent_overlap,num_negations,Mod Hypothesis,Validation,similarity_scores,Mod negations,Mod negation_count,Mod contains_negation
0,H18-2-4,1.0,Article 702\n(1) If a manager has incurred ben...,In cases where a person repairs the fence of a...,Y,"['in', 'cases', 'where', 'a', 'person', 'repai...",62.0,"['article', '702', '(', '1', ')', 'if', 'a', '...",98.0,21.0,...,,False,0.33871,1.0,"""When someone fixes a neighbor's fence affecte...",YES,tensor(0.8492),['not'],1.0,True
1,H18-23-I,1.0,Article 537\n(1) If one of the parties promise...,In cases where person A sold a jewel to person...,Y,"['in', 'cases', 'where', 'person', 'a', 'sold'...",101.0,"['article', '537', '(', '1', ')', 'if', 'one',...",120.0,32.0,...,,False,0.316832,1.0,"""In a scenario where individual A transfers a ...",YES,tensor(0.8756),['not'],1.0,True
2,H18-26-1,1.0,Article 255\nIf one of co-owners waives intere...,In cases where person A and person B co-own bu...,Y,"['in', 'cases', 'where', 'person', 'a', 'and',...",40.0,"['article', '255', 'if', 'one', 'of', 'co-owne...",22.0,7.0,...,,False,0.175,1.0,"""If one co-owner of a property, shared equally...",YES,tensor(0.8055),['no'],1.0,True
3,H19-11-3,1.0,"Article 177\nAcquisitions of, losses of and ch...",In a case where A bought a registered building...,Y,"['in', 'a', 'case', 'where', 'a', 'bought', 'a...",52.0,"['article', '177', 'acquisitions', 'of', ',', ...",54.0,15.0,...,,False,0.288462,1.0,"""If A purchases a registered building from B, ...",YES,tensor(0.9387),['not'],1.0,True
4,H19-12-4,1.0,Article 343\nA thing that cannot be transferre...,In cases where a contract that creates a pled...,Y,"['in', 'cases', 'where', 'a', 'contract', 'tha...",45.0,"['article', '343', 'a', 'thing', 'that', 'can'...",22.0,11.0,...,,False,0.244444,1.0,"""If a commitment is established to create a pl...",YES,tensor(0.8117),['not'],1.0,True
5,H20-23-5,1.0,Article 588\nIf any person has an obligation t...,A quasi-loan contract shall not become effecti...,Y,"['a', 'quasi-loan', 'contract', 'shall', 'not'...",18.0,"['article', '588', 'if', 'any', 'person', 'has...",48.0,5.0,...,,False,0.277778,2.0,"""A sham loan agreement will not come into effe...",YES,tensor(0.7973),['not'],1.0,True
6,H21-19-A,1.0,Article 447\n(1) The guarantee obligation incl...,Assuming that a recovery obligation due to can...,Y,"['assuming', 'that', 'a', 'recovery', 'obligat...",103.0,"['article', '447', '(', '1', ')', 'the', 'guar...",62.0,24.0,...,,False,0.23301,2.0,"""In instances where a contract cancellation re...",YES,tensor(0.9141),['not'],1.0,True
7,H22-15-U,1.0,Article 677\nA partner's creditor may not exer...,An obligor of a partnership cannot set off his...,Y,"['an', 'obligor', 'of', 'a', 'partnership', 'c...",18.0,"['article', '677', 'a', 'partner', ""'s"", 'cred...",19.0,6.0,...,,False,0.333333,1.0,"""A partner's debtor does not have the ability ...",YES,tensor(0.7270),['not'],1.0,True
8,H22-21-4,1.0,"Article 492\nUpon tendering the performance, t...",In cases where the obligor of a monetary debt ...,Y,"['in', 'cases', 'where', 'the', 'obligor', 'of...",42.0,"['article', '492', 'upon', 'tendering', 'the',...",26.0,8.0,...,,False,0.190476,1.0,"""When the person obliged to pay a financial ob...",YES,tensor(0.7586),['not'],1.0,True
9,H23-9-2,1.0,Article 192\nA person that commences the posse...,Provisions for immediate acquisition do not ap...,Y,"['provisions', 'for', 'immediate', 'acquisitio...",26.0,"['article', '192', 'a', 'person', 'that', 'com...",41.0,7.0,...,,False,0.269231,1.0,"""If an inheritor takes over another individual...",YES,tensor(0.8679),['not'],1.0,True


In [None]:
# Load a SpaCy model and disable unneeded components
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])

# Negation cues, compared against the lower-cased text of each token.
# The word cannot is not listed as spacy tokenizes it as can and not.
negation_words = [
    "not", "no", "n't", "none", "neither", "never", "nobody",
    "nothing", "nowhere", "hardly", "scarcely", "barely", "rarely", "seldom",
]

# Collect the negation tokens of a sentence (original casing is kept in the result)
def extract_negations(doc):
    """Return the ``text`` of every token in ``doc`` whose lower-cased text is in ``negation_words``."""
    found = []
    for token in doc:
        if token.text.lower() in negation_words:
            found.append(token.text)
    return found

# Run the spaCy pipeline over all paraphrased hypotheses in batches
docs = list(nlp.pipe(df['Mod Hypothesis']))

# Negation tokens found in each paraphrase
df['Mod negations'] = list(map(extract_negations, docs))
# How many negation tokens each paraphrase contains
df['Mod negation_count'] = df['Mod negations'].apply(len)
# True when the paraphrase kept at least one negation
df['Mod contains_negation'] = df['Mod negation_count'].gt(0)



In [None]:
df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,...,detected_subsequence,is_subsequence_heuristic,percent_overlap,num_negations,Mod Hypothesis,Validation,similarity_scores,Mod negations,Mod negation_count,Mod contains_negation
0,H18-2-4,1,Article 702\n(1) If a manager has incurred ben...,In cases where a person repairs the fence of a...,Y,"['in', 'cases', 'where', 'a', 'person', 'repai...",62,"['article', '702', '(', '1', ')', 'if', 'a', '...",98,21,...,,False,0.33871,1,"""When someone fixes a neighbor's fence affecte...",YES,tensor(0.8492),[not],1,True
1,H18-23-I,1,Article 537\n(1) If one of the parties promise...,In cases where person A sold a jewel to person...,Y,"['in', 'cases', 'where', 'person', 'a', 'sold'...",101,"['article', '537', '(', '1', ')', 'if', 'one',...",120,32,...,,False,0.316832,1,"""In a scenario where individual A transfers a ...",YES,tensor(0.8756),[not],1,True
2,H18-26-1,1,Article 255\nIf one of co-owners waives intere...,In cases where person A and person B co-own bu...,Y,"['in', 'cases', 'where', 'person', 'a', 'and',...",40,"['article', '255', 'if', 'one', 'of', 'co-owne...",22,7,...,,False,0.175,1,"""If one co-owner of a property, shared equally...",YES,tensor(0.8055),[no],1,True
3,H19-11-3,1,"Article 177\nAcquisitions of, losses of and ch...",In a case where A bought a registered building...,Y,"['in', 'a', 'case', 'where', 'a', 'bought', 'a...",52,"['article', '177', 'acquisitions', 'of', ',', ...",54,15,...,,False,0.288462,1,"""If A purchases a registered building from B, ...",YES,tensor(0.9387),[not],1,True
4,H19-12-4,1,Article 343\nA thing that cannot be transferre...,In cases where a contract that creates a pled...,Y,"['in', 'cases', 'where', 'a', 'contract', 'tha...",45,"['article', '343', 'a', 'thing', 'that', 'can'...",22,11,...,,False,0.244444,1,"""If a commitment is established to create a pl...",YES,tensor(0.8117),[not],1,True
5,H20-23-5,1,Article 588\nIf any person has an obligation t...,A quasi-loan contract shall not become effecti...,Y,"['a', 'quasi-loan', 'contract', 'shall', 'not'...",18,"['article', '588', 'if', 'any', 'person', 'has...",48,5,...,,False,0.277778,2,"""A sham loan agreement will not come into effe...",YES,tensor(0.7973),[not],1,True
6,H21-19-A,1,Article 447\n(1) The guarantee obligation incl...,Assuming that a recovery obligation due to can...,Y,"['assuming', 'that', 'a', 'recovery', 'obligat...",103,"['article', '447', '(', '1', ')', 'the', 'guar...",62,24,...,,False,0.23301,2,"""If we look at an obligation to recover upon c...",YES,tensor(0.9093),[],0,False
7,H22-15-U,1,Article 677\nA partner's creditor may not exer...,An obligor of a partnership cannot set off his...,Y,"['an', 'obligor', 'of', 'a', 'partnership', 'c...",18,"['article', '677', 'a', 'partner', ""'s"", 'cred...",19,6,...,,False,0.333333,1,"""A partner's debtor does not have the ability ...",YES,tensor(0.7270),[not],1,True
8,H22-21-4,1,"Article 492\nUpon tendering the performance, t...",In cases where the obligor of a monetary debt ...,Y,"['in', 'cases', 'where', 'the', 'obligor', 'of...",42,"['article', '492', 'upon', 'tendering', 'the',...",26,8,...,,False,0.190476,1,"""When the person obliged to pay a financial ob...",YES,tensor(0.7586),[not],1,True
9,H23-9-2,1,Article 192\nA person that commences the posse...,Provisions for immediate acquisition do not ap...,Y,"['provisions', 'for', 'immediate', 'acquisitio...",26,"['article', '192', 'a', 'person', 'that', 'com...",41,7,...,,False,0.269231,1,"""If an inheritor takes over another individual...",YES,tensor(0.8679),[not],1,True


In [None]:
df_bad = df[df['Mod contains_negation']==False]
df_bad

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,...,detected_subsequence,is_subsequence_heuristic,percent_overlap,num_negations,Mod Hypothesis,Validation,similarity_scores,Mod negations,Mod negation_count,Mod contains_negation
31,R1-24-I,1.0,Article 541\nIf one of the parties does not pe...,(A) and (B) agreed that (A) sell the used bicy...,Y,"['(', 'a', ')', 'and', '(', 'b', ')', 'agreed'...",110.0,"['article', '541', 'if', 'one', 'of', 'the', '...",129.0,29.0,...,,False,0.263636,1.0,"""If (A) and (B) established a contract of sale...",YES,tensor(0.9208),[],0.0,False


In [None]:
df_bad['hypothesis'].loc[31]

'(A) and (B) agreed that (A) sell the used bicycle which (A) owned (hereinafter referred to as P) to (B) on April 1 with the provision for the delivery on april 10 and the payment on April 20. (P) has a latent defect which is created before the sale, but (A) delivered (P) to (B) on April 10. In such case, (B) may cancel the sale, if (B) cannot achieve his/her purpose of the sale on account of the defect.'

In [None]:
df_bad['Mod Hypothesis'].loc[31]

'"In the event where (A) has sold a used bicycle (referred to as P) that they owned to (B) on April 1, arranging for the delivery and payments to be done on April 10 and April 20 respectively, and this bicycle P turns out to have a hidden defect that was present even before the sale was made and (A) proceeded with the delivery to (B) on the promised date, then (B) is entitled to nullify the sales agreement if the defect renders (B) unable to fulfill the intended purpose of the purchase."'

In [None]:
import time

# Load the embedding model once. It is identical for every row, and creating it
# inside the loop re-instantiates the model on each iteration.
model = SentenceTransformer('BAAI/bge-large-en-v1.5')

paraphrased_texts = []
validations = []
scores = []

for i in tqdm(range(df_bad.shape[0])):

    premise = df_bad['premise'].iloc[i]
    hypothesis = df_bad['hypothesis'].iloc[i]
    negation_count = 0
    num_retries = 0

    contradiction_paraphrase_prompt = build_contradiction_paraphrase_prompt(premise, hypothesis)

    # Request a paraphrase until one contains a negation word, giving up after
    # 6 attempts; the last response is kept either way.
    while negation_count <= 0 and num_retries <= 5:

        data_completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": contradiction_paraphrase_prompt}
            ]
        )

        paraphrased_hypothesis = data_completion.choices[0].message['content'].strip()

        doc = nlp(paraphrased_hypothesis)
        negations = len(extract_negations(doc))

        if negations:
            negation_count = negation_count + 1

        num_retries = num_retries + 1

        # Pause between API calls
        time.sleep(5)

    paraphrased_texts.append(paraphrased_hypothesis)

    # Ask GPT-4 to validate the paraphrase against the original hypothesis
    validation_prompt = build_validation_prompt(hypothesis, paraphrased_hypothesis)

    val_completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are tasked with validating provided sentences."},
            {"role": "user", "content": validation_prompt}
        ]
    )

    validation = val_completion.choices[0].message['content'].strip()

    validations.append(validation)

    sentences_1 = [hypothesis]
    sentences_2 = [paraphrased_hypothesis]

    #Compute embedding for both lists
    embeddings1 = model.encode(sentences_1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences_2, convert_to_tensor=True)

    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    similarity_score = cosine_scores[0][0]

    scores.append(similarity_score)


# Build a new frame instead of assigning into what may be a slice of `df`
# (the original assignments raised SettingWithCopyWarning).
df_bad = df_bad.assign(**{
    'Mod Hypothesis': paraphrased_texts,
    'Validation': validations,
    'similarity_scores': scores,
})

100%|██████████| 1/1 [01:25<00:00, 85.39s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad['Mod Hypothesis'] = paraphrased_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad['Validation'] = validations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad['similarity_scores'] = scores


In [None]:
# Manual override for row 31: replace its 'Mod Hypothesis' with a hand-written
# paraphrase that contains an explicit negation ("is not barred").
# NOTE(review): presumably needed because the automated retries above did not
# yield a usable paraphrase for this row — confirm.
df_bad.loc[(31, 'Mod Hypothesis')] = "(A) and (B) concurred on a deal for (A) to sell the pre-owned bicycle, referred to as P, which was owned by (A), to (B) on April 1, with the understanding that the delivery would take place on April 10 and the payment on April 20. P had a hidden flaw originating prior to the transaction; nonetheless, (A) proceeded with the delivery to (B) on April 10. Under these circumstances, should the defect hinder (B) from fulfilling the intended purpose of the purchase, (B) is not barred from rescinding the sale."

In [None]:
df_bad

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,...,detected_subsequence,is_subsequence_heuristic,percent_overlap,num_negations,Mod Hypothesis,Validation,similarity_scores,Mod negations,Mod negation_count,Mod contains_negation
31,R1-24-I,1.0,Article 541\nIf one of the parties does not pe...,(A) and (B) agreed that (A) sell the used bicy...,Y,"['(', 'a', ')', 'and', '(', 'b', ')', 'agreed'...",110.0,"['article', '541', 'if', 'one', 'of', 'the', '...",129.0,29.0,...,,False,0.263636,1.0,(A) and (B) concurred on a deal for (A) to sel...,YES,tensor(0.9129),[],0,False


In [None]:
# Function to list the negation words found in a spaCy Doc
def extract_negations(doc):
    """Return the text of every token in `doc` whose lowercased text is in `negation_words`."""
    return [token.text for token in doc if token.text.lower() in negation_words]

# Use the pipe method to process the texts in batches
docs = list(nlp.pipe(df_bad['Mod Hypothesis']))

# Negation words per paraphrase, and how many there are
mod_negations = [extract_negations(doc) for doc in docs]
mod_negation_counts = [len(found) for found in mod_negations]

model = SentenceTransformer('BAAI/bge-large-en-v1.5')

# Recompute similarity from the frame's own columns. The previous version used
# the `hypothesis` / `paraphrased_hypothesis` variables leaked from the loop
# cell, so a manually edited 'Mod Hypothesis' was never re-scored and any
# frame with more than one row failed on the length mismatch.
sentences_1 = df_bad['hypothesis'].tolist()
sentences_2 = df_bad['Mod Hypothesis'].tolist()

scores = []
if sentences_1:
    #Compute embedding for both lists
    embeddings1 = model.encode(sentences_1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences_2, convert_to_tensor=True)

    #Compute cosine-similarities; the diagonal pairs each hypothesis with its own paraphrase
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    scores = [cosine_scores[row][row] for row in range(len(sentences_1))]

# Build a new frame rather than assigning into a possible slice of `df`
df_bad = df_bad.assign(**{
    'Mod negations': mod_negations,
    'Mod negation_count': mod_negation_counts,
    'Mod contains_negation': [count > 0 for count in mod_negation_counts],
    'similarity_scores': scores,
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad['Mod negations'] = [extract_negations(doc) for doc in docs]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad['Mod negation_count'] = df_bad['Mod negations'].apply(lambda x: len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad['Mod contains_negation'] = df_bad['Mod negation_coun

In [None]:
display(df_bad)

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,...,detected_subsequence,is_subsequence_heuristic,percent_overlap,num_negations,Mod Hypothesis,Validation,similarity_scores,Mod negations,Mod negation_count,Mod contains_negation
31,R1-24-I,1.0,Article 541\nIf one of the parties does not pe...,(A) and (B) agreed that (A) sell the used bicy...,Y,"['(', 'a', ')', 'and', '(', 'b', ')', 'agreed'...",110.0,"['article', '541', 'if', 'one', 'of', 'the', '...",129.0,29.0,...,,False,0.263636,1.0,(A) and (B) concurred on a deal for (A) to sel...,YES,tensor(0.9129),[not],1,True


In [None]:
# Write the corrected rows back into the full frame, aligning on 'id'.
# No in-place index changes: the previous version mutated `df_bad` (its 'id'
# column became the index), so running the cell a second time failed.
df = df.set_index('id')

# DataFrame.update overwrites matching cells in place with the non-NA values of the other frame
df.update(df_bad.set_index('id'))

df = df.reset_index()

In [None]:
df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,...,detected_subsequence,is_subsequence_heuristic,percent_overlap,num_negations,Mod Hypothesis,Validation,similarity_scores,Mod negations,Mod negation_count,Mod contains_negation
0,H18-2-4,1.0,Article 702\n(1) If a manager has incurred ben...,In cases where a person repairs the fence of a...,Y,"['in', 'cases', 'where', 'a', 'person', 'repai...",62.0,"['article', '702', '(', '1', ')', 'if', 'a', '...",98.0,21.0,...,,False,0.33871,1.0,"""When someone fixes a neighbor's fence affecte...",YES,tensor(0.8492),['not'],1.0,True
1,H18-23-I,1.0,Article 537\n(1) If one of the parties promise...,In cases where person A sold a jewel to person...,Y,"['in', 'cases', 'where', 'person', 'a', 'sold'...",101.0,"['article', '537', '(', '1', ')', 'if', 'one',...",120.0,32.0,...,,False,0.316832,1.0,"""In a scenario where individual A transfers a ...",YES,tensor(0.8756),['not'],1.0,True
2,H18-26-1,1.0,Article 255\nIf one of co-owners waives intere...,In cases where person A and person B co-own bu...,Y,"['in', 'cases', 'where', 'person', 'a', 'and',...",40.0,"['article', '255', 'if', 'one', 'of', 'co-owne...",22.0,7.0,...,,False,0.175,1.0,"""If one co-owner of a property, shared equally...",YES,tensor(0.8055),['no'],1.0,True
3,H19-11-3,1.0,"Article 177\nAcquisitions of, losses of and ch...",In a case where A bought a registered building...,Y,"['in', 'a', 'case', 'where', 'a', 'bought', 'a...",52.0,"['article', '177', 'acquisitions', 'of', ',', ...",54.0,15.0,...,,False,0.288462,1.0,"""If A purchases a registered building from B, ...",YES,tensor(0.9387),['not'],1.0,True
4,H19-12-4,1.0,Article 343\nA thing that cannot be transferre...,In cases where a contract that creates a pled...,Y,"['in', 'cases', 'where', 'a', 'contract', 'tha...",45.0,"['article', '343', 'a', 'thing', 'that', 'can'...",22.0,11.0,...,,False,0.244444,1.0,"""If a commitment is established to create a pl...",YES,tensor(0.8117),['not'],1.0,True
5,H20-23-5,1.0,Article 588\nIf any person has an obligation t...,A quasi-loan contract shall not become effecti...,Y,"['a', 'quasi-loan', 'contract', 'shall', 'not'...",18.0,"['article', '588', 'if', 'any', 'person', 'has...",48.0,5.0,...,,False,0.277778,2.0,"""A sham loan agreement will not come into effe...",YES,tensor(0.7973),['not'],1.0,True
6,H21-19-A,1.0,Article 447\n(1) The guarantee obligation incl...,Assuming that a recovery obligation due to can...,Y,"['assuming', 'that', 'a', 'recovery', 'obligat...",103.0,"['article', '447', '(', '1', ')', 'the', 'guar...",62.0,24.0,...,,False,0.23301,2.0,"""In instances where a contract cancellation re...",YES,tensor(0.9141),['not'],1.0,True
7,H22-15-U,1.0,Article 677\nA partner's creditor may not exer...,An obligor of a partnership cannot set off his...,Y,"['an', 'obligor', 'of', 'a', 'partnership', 'c...",18.0,"['article', '677', 'a', 'partner', ""'s"", 'cred...",19.0,6.0,...,,False,0.333333,1.0,"""A partner's debtor does not have the ability ...",YES,tensor(0.7270),['not'],1.0,True
8,H22-21-4,1.0,"Article 492\nUpon tendering the performance, t...",In cases where the obligor of a monetary debt ...,Y,"['in', 'cases', 'where', 'the', 'obligor', 'of...",42.0,"['article', '492', 'upon', 'tendering', 'the',...",26.0,8.0,...,,False,0.190476,1.0,"""When the person obliged to pay a financial ob...",YES,tensor(0.7586),['not'],1.0,True
9,H23-9-2,1.0,Article 192\nA person that commences the posse...,Provisions for immediate acquisition do not ap...,Y,"['provisions', 'for', 'immediate', 'acquisitio...",26.0,"['article', '192', 'a', 'person', 'that', 'com...",41.0,7.0,...,,False,0.269231,1.0,"""If an inheritor takes over another individual...",YES,tensor(0.8679),['not'],1.0,True


In [None]:
# Write the merged frame to CSV without the row index.
df.to_csv("/content/drive/MyDrive/data/task 4/train/data_aug_contradiction_instances.csv", index=False)

# Word Overlap Bias

In [None]:
def calculate_percent_overlap(premise, hypothesis):
    """
    Return the percentage of hypothesis tokens that also occur in the premise.

    Both sentences are lowercased and tokenized with nltk. Overlap is counted
    with multiplicity: a token contributes as many times as it appears in both
    sentences (the minimum of its two counts). The result is
    overlap / number_of_hypothesis_tokens * 100.

    Args:
        premise: Premise sentence.
        hypothesis: Hypothesis sentence.

    Returns:
        The overlap percentage as a float; 0.0 when the hypothesis has no tokens.
    """

    # Tokenize the sentences and count the words in each
    premise_counter = Counter(nltk.word_tokenize(premise.lower()))
    hypothesis_counter = Counter(nltk.word_tokenize(hypothesis.lower()))

    hypothesis_length = sum(hypothesis_counter.values())
    if hypothesis_length == 0:
        # An empty hypothesis (e.g. an empty model response) has no overlap;
        # return 0.0 instead of dividing by zero.
        return 0.0

    # Multiset intersection keeps the minimum count of each shared token
    overlap = sum((premise_counter & hypothesis_counter).values())

    return (overlap / hypothesis_length) * 100

In [None]:
# Per training year: load the split, keep the instances flagged for word overlap
# only, and report how the labels are distributed above and below 50% overlap.
ids_dict = {}
years = [2018, 2019, 2020, 2021, 2022]
wo_train_data_dict = {}
for year in years:
    data_file_path = f"/content/drive/MyDrive/data/task 4/train/coliee_train_{year}.csv"

    df = read_data(data_file_path)
    print(f"Year : {year}")

    # `overlap` expressed as a percentage of `hyp_length`
    df['percent_overlap'] = df['overlap'].div(df['hyp_length']).mul(100)

    # Flagged for word overlap, but neither for negation nor for the subsequence heuristic
    only_word_overlap = (
        df['is_word_overlap'].eq(True)
        & df['has_negation'].eq(False)
        & df['is_subsequence_heuristic'].eq(False)
    )
    wob_df = df[only_word_overlap]
    print(f"Number of total instances with Word Overlap : {wob_df.shape}")

    is_entailment = wob_df['labels'].eq("Y")
    is_non_entailment = wob_df['labels'].eq("N")
    high_overlap = wob_df['percent_overlap'].ge(50)
    low_overlap = wob_df['percent_overlap'].lt(50)

    # 50% overlap or more
    entailment_wob_df = wob_df[is_entailment & high_overlap]
    print(f"Number of entailment instances with Word Overlaps : {entailment_wob_df.shape}")
    non_entailment_wob_df = wob_df[is_non_entailment & high_overlap]
    print(f"Number of non-entailment instances with Word Overlaps : {non_entailment_wob_df.shape}")

    diff = len(entailment_wob_df) - len(non_entailment_wob_df)
    print(f"Difference in word overlap instances between entailment and non-entailment labels above 50%: {diff}")

    # Below 50% overlap
    ls_entailment_wob_df = wob_df[is_entailment & low_overlap]
    print(f"Number of entailment instances with Word Overlaps : {ls_entailment_wob_df.shape}")
    ls_non_entailment_wob_df = wob_df[is_non_entailment & low_overlap]
    print(f"Number of non-entailment instances with Word Overlaps : {ls_non_entailment_wob_df.shape}")

    ls_diff = len(ls_entailment_wob_df) - len(ls_non_entailment_wob_df)
    print(f"Difference in word overlap instances between entailment and non-entailment labels below 50%: {ls_diff}")

    # Keep the filtered frame for this year
    wo_train_data_dict[year] = wob_df

Year : 2018
Number of total instances with Word Overlap : (277, 16)
Number of entailment instances with Word Overlaps : (109, 16)
Number of non-entailment instances with Word Overlaps : (70, 16)
Difference in word overlap instances between entailment and non-entailment labels above 50%: 39
Number of entailment instances with Word Overlaps : (38, 16)
Number of non-entailment instances with Word Overlaps : (60, 16)
Difference in word overlap instances between entailment and non-entailment labels below 50%: -22
Year : 2019
Number of total instances with Word Overlap : (298, 16)
Number of entailment instances with Word Overlaps : (117, 16)
Number of non-entailment instances with Word Overlaps : (79, 16)
Difference in word overlap instances between entailment and non-entailment labels above 50%: 38
Number of entailment instances with Word Overlaps : (42, 16)
Number of non-entailment instances with Word Overlaps : (60, 16)
Difference in word overlap instances between entailment and non-entai

In [None]:
# Use the filtered frame stored under key 2022 (built from coliee_train_2022.csv) for the steps below.
wob_df = wo_train_data_dict[2022]

In [None]:
# Non-entailment ("N") instances with at least 70% overlap.
# Copy explicitly: new columns are added to this frame later, which on a
# boolean-mask slice of `wob_df` would trigger SettingWithCopyWarning.
non_ent_wob_df = wob_df[(wob_df['labels']=="N") & (wob_df['percent_overlap']>=70)].copy()
non_ent_wob_df.shape

(49, 16)

In [None]:
def build_word_overlap_paraphrase_prompt(premise, hypothesis, label=None):
    """
    Build the user prompt that asks the model to paraphrase `hypothesis`
    while keeping or increasing its word overlap with `premise`.

    Args:
        premise: Premise text inserted under "Real Input".
        hypothesis: Original hypothesis text to be paraphrased.
        label: Optional label ("Y" or "N"). The task description promises a
            label, so it is appended to the real input when supplied. When
            omitted, the real input carries only premise and hypothesis.

    Returns:
        The formatted prompt string.
    """
    # Only emit a label line when the caller actually provides one
    label_line = f'\n        "Label": "{label}"' if label is not None else ''

    word_overlap_paraphrase_prompt = f'''You will be provided with a Legal Natural Language Inference pair containing premise, hypothesis and its respective label.
    Label "Y" indicates entailment and "N" indicates non entailment.
    Your task is to paraphrase the provided original hypothesis by following the provided instructions.
    INSTRUCTIONS:
    1. In the paraphrased hypothesis the number of word overlaps between premise and hypothesis MUST be same or more but it MUST NOT be reduced.
    2. There MUST be more overlap between premise and hypothesis while preserving the label.
    3. There MUST not be any negation words such as "no", "n't", "not", "cannot" etc., in the paraphrased hypothesis at all.
    4. Provide ONLY the paraphrased hypothesis and nothing else.

    Example Input:

        "Premise":"Article 299
        (1) If the holder of a right of retention incurs necessary expenses with respect to the thing retained, that holder may have the owner reimburse the same.
        (2) If the holder of a right of retention incurs beneficial expenses with respect to the thing retained, to the extent that there is
        currently an increase in value as a result of the same,
        that holder may have the expenses incurred or the increase in value reimbursed at the owner's choice;
        provided, however, that the court may, at the request of the owner,
        grant a reasonable period of time for the reimbursement of the same..",
        "Hypothesis": "If a holder of a right of retention incurs ordinary unnecessary expenses with respect to the Thing retained,
        he/she may have the owner reimburse the same."
        "Label": "N"

    Example Output:
        "Paraphrased Hypothesis": "If the holder of a right of retention incurs ordinary unnecessary expenses with respect to the thing retained,
        that holder may have the owner reimburse the same."


    Real Input:
        "Premise":{premise},
        "Hypothesis":{hypothesis}{label_line}

    Real Output:
        "Paraphrased Hypothesis":

    '''

    return word_overlap_paraphrase_prompt

In [None]:
import time

# Load the embedding model once. It is identical for every row, and creating it
# inside the loop re-instantiates the model on each iteration.
model = SentenceTransformer('BAAI/bge-large-en-v1.5')

paraphrased_texts = []
validations = []
scores = []
mod_hyp_percent_overlaps = []

for i in tqdm(range(non_ent_wob_df.shape[0])):

    premise = non_ent_wob_df['premise'].iloc[i]
    hypothesis = non_ent_wob_df['hypothesis'].iloc[i]
    paraphrased_hyp_percent_overlap = 0
    num_retries = 0

    # The prompt depends only on the row, so build it once rather than on every retry
    word_overlap_paraphrase_prompt = build_word_overlap_paraphrase_prompt(premise, hypothesis)

    # Request a paraphrase until its overlap with the premise reaches 60%,
    # giving up after 6 attempts; the last response is kept either way.
    while paraphrased_hyp_percent_overlap < 60 and num_retries <= 5:

        data_completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": word_overlap_paraphrase_prompt}
            ]
        )

        paraphrased_hypothesis = data_completion.choices[0].message['content'].strip()

        paraphrased_hyp_percent_overlap = calculate_percent_overlap(premise, paraphrased_hypothesis)

        num_retries = num_retries + 1

        # Pause between API calls
        time.sleep(5)

    paraphrased_texts.append(paraphrased_hypothesis)
    mod_hyp_percent_overlaps.append(paraphrased_hyp_percent_overlap)

    # Ask GPT-4 to validate the paraphrase against the original hypothesis
    validation_prompt = build_validation_prompt(hypothesis, paraphrased_hypothesis)

    val_completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are tasked with validating provided sentences."},
            {"role": "user", "content": validation_prompt}
        ]
    )

    validation = val_completion.choices[0].message['content'].strip()

    validations.append(validation)

    sentences_1 = [hypothesis]
    sentences_2 = [paraphrased_hypothesis]

    #Compute embedding for both lists
    embeddings1 = model.encode(sentences_1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences_2, convert_to_tensor=True)

    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    similarity_score = cosine_scores[0][0]

    scores.append(similarity_score)


# Build a new frame instead of assigning into what may be a slice of `wob_df`
non_ent_wob_df = non_ent_wob_df.assign(**{
    'Mod Hypothesis': paraphrased_texts,
    'Validation': validations,
    'similarity_scores': scores,
    'Mod Hypothesis percent overlap': mod_hyp_percent_overlaps,
})

100%|██████████| 49/49 [14:51<00:00, 18.20s/it]


In [None]:
non_ent_wob_df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,percent_overlap,Mod Hypothesis,Validation,similarity_scores,Mod Hypothesis percent overlap
32,H18-28-4,0,Article 465-3\n(1) If a contract for revolving...,For the principal obligation of a contract for...,N,"['for', 'the', 'principal', 'obligation', 'of'...",58,"['article', '465-3', '(', '1', ')', 'if', 'a',...",458,47,True,[],False,,False,81.034483,"""If the principal crystallization date for a c...",YES,tensor(0.7965),86.111111
48,H19-12-1,0,Article 350\nThe provisions of Articles 296 th...,"A right of retention, a pledge, and a mortgage...",N,"['a', 'right', 'of', 'retention', ',', 'a', 'p...",49,"['article', '350', 'the', 'provisions', 'of', ...",142,41,True,[],False,,False,83.673469,"A holder of a statutory lien, a pledge, or a m...",YES,tensor(0.9078),69.565217
60,H19-16-2,0,Article 387\n(1) A registered lease may be dul...,A registered lease may be asserted against th...,N,"['a', 'registered', 'lease', 'may', 'be', 'ass...",37,"['article', '387', '(', '1', ')', 'a', 'regist...",94,26,True,[],False,,False,70.27027,A lease that has been registered can be enforc...,YES,tensor(0.8770),78.947368
204,H22-26-U,0,Article 646\n(1) A mandatary must deliver to t...,"If the mandatary, received monies and other ...",N,"['if', 'the', 'mandatary', ',', 'received', 'm...",28,"['article', '646', '(', '1', ')', 'a', 'mandat...",64,21,True,[],False,,False,75.0,If the mandatary has received monies and other...,YES,tensor(0.8739),73.076923
239,H23-11-O,0,Article 702\n(1) If a manager has incurred ben...,Even in cases where the manager in management ...,N,"['even', 'in', 'cases', 'where', 'the', 'manag...",47,"['article', '702', '(', '1', ')', 'if', 'a', '...",98,37,True,[],False,,False,78.723404,Even if the manager has gone against the princ...,YES,tensor(0.8887),66.666667
277,H24-8-5,0,Article 424\n(1) An obligee may demand the cou...,An obligee (B) against (A) may demand the cour...,N,"['an', 'obligee', '(', 'b', ')', 'against', '(...",60,"['article', '424', '(', '1', ')', 'an', 'oblig...",179,45,True,[],False,,False,75.0,An obligee (B) against (A) may demand the cour...,YES,tensor(0.9278),68.421053
308,H24-20-I,0,Article 465-3\n(1) If a contract for revolving...,If three years have elapsed from the day of th...,N,"['if', 'three', 'years', 'have', 'elapsed', 'f...",37,"['article', '465-3', '(', '1', ')', 'if', 'a',...",481,30,True,[],False,,False,81.081081,If the day three years have passed from the da...,YES,tensor(0.9136),82.051282
324,H24-27-O,0,Article 637\n(1) In the case prescribed in the...,Where delivery of the subject matter is requir...,N,"['where', 'delivery', 'of', 'the', 'subject', ...",33,"['article', '637', '(', '1', ')', 'in', 'the',...",152,27,True,[],False,,False,81.818182,Where the contractor delivers the subject matt...,YES,tensor(0.9723),85.0
355,H25-13-3,0,Article 366\n(1) A pledgee may directly collec...,If monetary claim is the subject matter of a p...,N,"['if', 'monetary', 'claim', 'is', 'the', 'subj...",34,"['article', '366', '(', '1', ')', 'a', 'pledge...",152,29,True,[],False,,False,85.294118,If monies are the subject matter of a pledged ...,YES,tensor(0.8946),88.571429
364,H25-16-4,0,Article 370\nA mortgage extends to the things ...,A mortgage shall extend to the buildings on th...,N,"['a', 'mortgage', 'shall', 'extend', 'to', 'th...",19,"['article', '370', 'a', 'mortgage', 'extends',...",87,15,True,[],False,,False,78.947368,A mortgage extends to the buildings on the mor...,YES,tensor(0.9865),88.888889


In [None]:
# Write the paraphrased word-overlap instances to CSV without the row index.
non_ent_wob_df.to_csv("/content/drive/MyDrive/data/task 4/train/data_aug_word_overlap_instances.csv", index=False)

In [None]:
# Reload the word-overlap instances from disk.
# NOTE(review): the frame displayed below has columns `orginal_hypothesis` / `hypothesis`
# and ids prefixed with "WO-AD-", which no visible cell produces; presumably the
# CSV was edited outside this notebook between the write and this read — confirm.
df = pd.read_csv("/content/drive/MyDrive/data/task 4/train/data_aug_word_overlap_instances.csv")
df

Unnamed: 0,id,label,premise,orginal_hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,percent_overlap,hypothesis,Validation,similarity_scores,Mod Hypothesis percent overlap
0,WO-AD-H18-28-4,0.0,Article 465-3\n(1) If a contract for revolving...,For the principal obligation of a contract for...,N,"['for', 'the', 'principal', 'obligation', 'of'...",58.0,"['article', '465-3', '(', '1', ')', 'if', 'a',...",458.0,47.0,True,[],False,,False,81.034483,"""If the principal crystallization date for a c...",YES,tensor(0.7965),86.111111
1,WO-AD-H19-12-1,0.0,Article 350\nThe provisions of Articles 296 th...,"A right of retention, a pledge, and a mortgage...",N,"['a', 'right', 'of', 'retention', ',', 'a', 'p...",49.0,"['article', '350', 'the', 'provisions', 'of', ...",142.0,41.0,True,[],False,,False,83.673469,"A holder of a statutory lien, a pledge, or a m...",YES,tensor(0.9078),69.565217
2,WO-AD-H19-16-2,0.0,Article 387\n(1) A registered lease may be dul...,A registered lease may be asserted against th...,N,"['a', 'registered', 'lease', 'may', 'be', 'ass...",37.0,"['article', '387', '(', '1', ')', 'a', 'regist...",94.0,26.0,True,[],False,,False,70.27027,A lease that has been registered can be enforc...,YES,tensor(0.8770),78.947368
3,WO-AD-H22-26-U,0.0,Article 646\n(1) A mandatary must deliver to t...,"If the mandatary, received monies and other ...",N,"['if', 'the', 'mandatary', ',', 'received', 'm...",28.0,"['article', '646', '(', '1', ')', 'a', 'mandat...",64.0,21.0,True,[],False,,False,75.0,If the mandatary has received monies and other...,YES,tensor(0.8739),73.076923
4,WO-AD-H23-11-O,0.0,Article 702\n(1) If a manager has incurred ben...,Even in cases where the manager in management ...,N,"['even', 'in', 'cases', 'where', 'the', 'manag...",47.0,"['article', '702', '(', '1', ')', 'if', 'a', '...",98.0,37.0,True,[],False,,False,78.723404,Even if the manager has gone against the princ...,YES,tensor(0.8887),66.666667
5,WO-AD-H24-8-5,0.0,Article 424\n(1) An obligee may demand the cou...,An obligee (B) against (A) may demand the cour...,N,"['an', 'obligee', '(', 'b', ')', 'against', '(...",60.0,"['article', '424', '(', '1', ')', 'an', 'oblig...",179.0,45.0,True,[],False,,False,75.0,An obligee (B) against (A) may demand the cour...,YES,tensor(0.9278),68.421053
6,WO-AD-H24-20-I,0.0,Article 465-3\n(1) If a contract for revolving...,If three years have elapsed from the day of th...,N,"['if', 'three', 'years', 'have', 'elapsed', 'f...",37.0,"['article', '465-3', '(', '1', ')', 'if', 'a',...",481.0,30.0,True,[],False,,False,81.081081,If the day three years have passed from the da...,YES,tensor(0.9136),82.051282
7,WO-AD-H24-27-O,0.0,Article 637\n(1) In the case prescribed in the...,Where delivery of the subject matter is requir...,N,"['where', 'delivery', 'of', 'the', 'subject', ...",33.0,"['article', '637', '(', '1', ')', 'in', 'the',...",152.0,27.0,True,[],False,,False,81.818182,Where the contractor delivers the subject matt...,YES,tensor(0.9723),85.0
8,WO-AD-H25-13-3,0.0,Article 366\n(1) A pledgee may directly collec...,If monetary claim is the subject matter of a p...,N,"['if', 'monetary', 'claim', 'is', 'the', 'subj...",34.0,"['article', '366', '(', '1', ')', 'a', 'pledge...",152.0,29.0,True,[],False,,False,85.294118,If monies are the subject matter of a pledged ...,YES,tensor(0.8946),88.571429
9,WO-AD-H25-16-4,0.0,Article 370\nA mortgage extends to the things ...,A mortgage shall extend to the buildings on th...,N,"['a', 'mortgage', 'shall', 'extend', 'to', 'th...",19.0,"['article', '370', 'a', 'mortgage', 'extends',...",87.0,15.0,True,[],False,,False,78.947368,A mortgage extends to the buildings on the mor...,YES,tensor(0.9865),88.888889


In [None]:
# Recompute the feature columns on the reloaded frame.
# NOTE(review): `add_features` is defined outside this section; judging by the output
# below, `hyp_tokens` / `hyp_length` now follow the paraphrased `hypothesis` column — confirm.
df = add_features(df)

In [None]:
df

Unnamed: 0,id,label,premise,orginal_hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,percent_overlap,hypothesis,Validation,similarity_scores,Mod Hypothesis percent overlap
0,WO-AD-H18-28-4,0.0,Article 465-3\n(1) If a contract for revolving...,For the principal obligation of a contract for...,N,"[``, if, the, principal, crystallization, date...",36,"[article, 465-3, (, 1, ), if, a, contract, for...",458,31,True,[],False,,False,81.034483,"""If the principal crystallization date for a c...",YES,tensor(0.7965),86.111111
1,WO-AD-H19-12-1,0.0,Article 350\nThe provisions of Articles 296 th...,"A right of retention, a pledge, and a mortgage...",N,"[a, holder, of, a, statutory, lien, ,, a, pled...",46,"[article, 350, the, provisions, of, articles, ...",142,32,True,[],False,,False,83.673469,"A holder of a statutory lien, a pledge, or a m...",YES,tensor(0.9078),69.565217
2,WO-AD-H19-16-2,0.0,Article 387\n(1) A registered lease may be dul...,A registered lease may be asserted against th...,N,"[a, lease, that, has, been, registered, can, b...",38,"[article, 387, (, 1, ), a, registered, lease, ...",94,30,True,[],False,,False,70.27027,A lease that has been registered can be enforc...,YES,tensor(0.8770),78.947368
3,WO-AD-H22-26-U,0.0,Article 646\n(1) A mandatary must deliver to t...,"If the mandatary, received monies and other ...",N,"[if, the, mandatary, has, received, monies, an...",26,"[article, 646, (, 1, ), a, mandatary, must, de...",64,19,True,[],False,,False,75.0,If the mandatary has received monies and other...,YES,tensor(0.8739),73.076923
4,WO-AD-H23-11-O,0.0,Article 702\n(1) If a manager has incurred ben...,Even in cases where the manager in management ...,N,"[even, if, the, manager, has, gone, against, t...",30,"[article, 702, (, 1, ), if, a, manager, has, i...",98,20,True,[],False,,False,78.723404,Even if the manager has gone against the princ...,YES,tensor(0.8887),66.666667
5,WO-AD-H24-8-5,0.0,Article 424\n(1) An obligee may demand the cou...,An obligee (B) against (A) may demand the cour...,N,"[an, obligee, (, b, ), against, (, a, ), may, ...",57,"[article, 424, (, 1, ), an, obligee, may, dema...",179,39,True,[],False,,False,75.0,An obligee (B) against (A) may demand the cour...,YES,tensor(0.9278),68.421053
6,WO-AD-H24-20-I,0.0,Article 465-3\n(1) If a contract for revolving...,If three years have elapsed from the day of th...,N,"[if, the, day, three, years, have, passed, fro...",39,"[article, 465-3, (, 1, ), if, a, contract, for...",481,32,True,[],False,,False,81.081081,If the day three years have passed from the da...,YES,tensor(0.9136),82.051282
7,WO-AD-H24-27-O,0.0,Article 637\n(1) In the case prescribed in the...,Where delivery of the subject matter is requir...,N,"[where, the, contractor, delivers, the, subjec...",40,"[article, 637, (, 1, ), in, the, case, prescri...",152,34,True,[],False,,False,81.818182,Where the contractor delivers the subject matt...,YES,tensor(0.9723),85.0
8,WO-AD-H25-13-3,0.0,Article 366\n(1) A pledgee may directly collec...,If monetary claim is the subject matter of a p...,N,"[if, monies, are, the, subject, matter, of, a,...",35,"[article, 366, (, 1, ), a, pledgee, may, direc...",152,31,True,[],False,,False,85.294118,If monies are the subject matter of a pledged ...,YES,tensor(0.8946),88.571429
9,WO-AD-H25-16-4,0.0,Article 370\nA mortgage extends to the things ...,A mortgage shall extend to the buildings on th...,N,"[a, mortgage, extends, to, the, buildings, on,...",18,"[article, 370, a, mortgage, extends, to, the, ...",87,16,True,[],False,,False,78.947368,A mortgage extends to the buildings on the mor...,YES,tensor(0.9865),88.888889


In [None]:
# df_bad = df[(df['Mod Hypothesis percent overlap'] < 50) | (df['Validation']=="NO")]
# Paraphrases flagged for negation or for the subsequence heuristic.
# Copy explicitly: rows of this frame are overwritten by hand in a later cell,
# which on a boolean-mask slice of `df` would trigger SettingWithCopyWarning.
df_bad = df[(df['has_negation']==True) | (df['is_subsequence_heuristic']==True)].copy()
df_bad

Unnamed: 0,id,label,premise,orginal_hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,percent_overlap,hypothesis,Validation,similarity_scores,Mod Hypothesis percent overlap
18,WO-AD-H27-28-A,0.0,Article 711\nA person that has taken the life ...,A person who has taken the life of another mus...,N,"[a, person, that, has, taken, the, life, of, a...",42,"[article, 711, a, person, that, has, taken, th...",44,41,True,[not],True,A person that has taken the life of another mu...,True,75.0,A person that has taken the life of another mu...,YES,tensor(0.8243),97.619048
27,WO-AD-H29-26-5,0.0,Article 577\n(1) If a mortgage that does not c...,If any mortgage is registered on immovable pro...,N,"[if, a, mortgage, that, does, n't, meet, the, ...",39,"[article, 577, (, 1, ), if, a, mortgage, that,...",110,30,True,[n't],True,,False,80.952381,If a mortgage that doesn't meet the contract t...,YES,tensor(0.8560),76.923077
40,WO-AD-R02-1-I,0.0,Article 15 (1) The family court may decide to ...,The issuance of a decision for commencement of...,N,"[the, family, court, may, decide, to, commence...",136,"[article, 15, (, 1, ), the, family, court, may...",145,135,True,[not],True,,False,87.179487,The family court may decide to commence an ass...,YES,tensor(0.8432),99.264706
41,WO-AD-R02-1-U,0.0,Article 18 (1) If the grounds prescribed in th...,If the grounds of commencement of assistance c...,N,"[if, the, grounds, of, assistance, are, no, lo...",31,"[article, 18, (, 1, ), if, the, grounds, presc...",127,19,True,[no],True,,False,82.142857,If the grounds of assistance are no longer pre...,YES,tensor(0.8872),61.290323


In [None]:
# Print premise, original hypothesis and paraphrase of every flagged row for manual review
for _, record in df_bad.iterrows():
    for heading, column in (
        ("Premise", 'premise'),
        ("Original Hyp", 'orginal_hypothesis'),
        ("Mod Hyp", 'hypothesis'),
    ):
        print(heading)
        print(record[column])

Premise
Article 711
A person that has taken the life of another must compensate for loss or damage to the father, mother, spouse, and children of the victim, even if the property rights of the same have not been infringed..
Original Hyp
A person who has taken the life of another must compensate for damages only to heirs of the victim.
Mod Hyp
A person that has taken the life of another must compensate for loss or damage to the father, mother, spouse, and children of the victim, even if the property rights of the same have not been infringed.
Premise
Article 577
(1) If a mortgage that does not conform to the terms of the contract is registered on immovables that have been purchased, the buyer may refuse to pay the price until the completion of the procedures of the claim for extinguishment of the mortgage. In such cases, the seller may demand that the buyer file the claim for extinguishment of the mortgage without delay.
(2) The provisions of the preceding paragraph apply mutatis mutand

In [None]:
# Manual override for row 18: set its paraphrase (the 'hypothesis' column) by hand.
# The column must be given as the string 'hypothesis'. The bare name `hypothesis`
# is a variable left over from an earlier loop and holds a sentence, so using it
# as the label adds a new column instead of updating this one.
df_bad.loc[18, 'hypothesis'] = "A person responsible for causing another's death is obligated to provide compensation for damages, but solely to the deceased’s heirs."

In [None]:
import time

# Retry policy for the paraphrase request.
MIN_PERCENT_OVERLAP = 50  # stop retrying once the paraphrase/premise overlap reaches this value
MAX_RETRIES = 5           # the loop condition uses `<=`, so up to MAX_RETRIES + 1 attempts are made
RETRY_PAUSE_SECONDS = 5   # pause after every attempt (presumably to stay under API rate limits)

paraphrased_texts = []
validations = []
scores = []
mod_hyp_percent_overlaps = []

# Load the sentence-embedding model once, before the row loop: constructing it
# per row repeats the same expensive load without changing any result.
model = SentenceTransformer('BAAI/bge-large-en-v1.5')

for i in tqdm(range(df_bad.shape[0])):

    premise = df_bad['premise'].iloc[i]
    hypothesis = df_bad['orginal_hypothesis'].iloc[i]
    paraphrased_hyp_percent_overlap = 0
    num_retries = 0

    # Request a paraphrase until its percent overlap with the premise reaches
    # the threshold or the retry budget is used up. The most recent attempt is
    # the one that is kept, whether or not it met the threshold.
    while paraphrased_hyp_percent_overlap < MIN_PERCENT_OVERLAP and num_retries <= MAX_RETRIES:

        word_overlap_paraphrase_prompt = build_word_overlap_paraphrase_prompt(premise, hypothesis)

        data_completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": word_overlap_paraphrase_prompt}
            ]
        )

        paraphrased_hypothesis = data_completion.choices[0].message['content'].strip()

        paraphrased_hyp_percent_overlap = calculate_percent_overlap(premise, paraphrased_hypothesis)

        num_retries = num_retries + 1

        time.sleep(RETRY_PAUSE_SECONDS)

    paraphrased_texts.append(paraphrased_hypothesis)
    mod_hyp_percent_overlaps.append(paraphrased_hyp_percent_overlap)

    # Ask a second model whether the paraphrase kept the meaning of the
    # original hypothesis; the prompt requests a bare YES / NO answer.
    validation_prompt = build_validation_prompt(hypothesis, paraphrased_hypothesis)

    val_completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are tasked with validating provided sentences."},
            {"role": "user", "content": validation_prompt}
        ]
    )

    validation = val_completion.choices[0].message['content'].strip()

    validations.append(validation)

    sentences_1 = [hypothesis]
    sentences_2 = [paraphrased_hypothesis]

    # Compute embeddings for both single-sentence lists
    embeddings1 = model.encode(sentences_1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences_2, convert_to_tensor=True)

    # Cosine similarity between original and paraphrased hypothesis; the
    # single entry of the result is stored as-is (a tensor scalar).
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    similarity_score = cosine_scores[0][0]

    scores.append(similarity_score)


# Take an explicit copy before adding columns: assigning columns directly on
# df_bad raised pandas' SettingWithCopyWarning, i.e. df_bad was derived from
# another frame and pandas could not tell which object was being modified.
df_bad = df_bad.copy()
df_bad['Mod Hypothesis'] = paraphrased_texts
df_bad['Validation'] = validations
df_bad['similarity_scores'] = scores
df_bad['Mod Hypothesis percent overlap'] = mod_hyp_percent_overlaps

100%|██████████| 4/4 [01:06<00:00, 16.71s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad['Mod Hypothesis'] = paraphrased_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad['Validation'] = validations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad['similarity_scores'] = scores
A value is trying to be set on a copy of a

In [None]:
# Inspect the flagged rows together with the regenerated columns
# ('Mod Hypothesis', 'Validation', 'similarity_scores', 'Mod Hypothesis percent overlap').
df_bad

Unnamed: 0,id,label,premise,orginal_hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,...,negations,has_negation,detected_subsequence,is_subsequence_heuristic,percent_overlap,hypothesis,Validation,similarity_scores,Mod Hypothesis percent overlap,Mod Hypothesis
18,WO-AD-H27-28-A,0.0,Article 711\nA person that has taken the life ...,A person who has taken the life of another mus...,N,"[a, person, that, has, taken, the, life, of, a...",42,"[article, 711, a, person, that, has, taken, th...",44,41,...,[not],True,A person that has taken the life of another mu...,True,75.0,A person that has taken the life of another mu...,NO,tensor(0.7815),67.44186,A person who has caused the death of another i...
27,WO-AD-H29-26-5,0.0,Article 577\n(1) If a mortgage that does not c...,If any mortgage is registered on immovable pro...,N,"[if, a, mortgage, that, does, n't, meet, the, ...",39,"[article, 577, (, 1, ), if, a, mortgage, that,...",110,30,...,[n't],True,,False,80.952381,If a mortgage that doesn't meet the contract t...,YES,tensor(0.8514),80.952381,If a mortgage that does not adhere to the term...
40,WO-AD-R02-1-I,0.0,Article 15 (1) The family court may decide to ...,The issuance of a decision for commencement of...,N,"[the, family, court, may, decide, to, commence...",136,"[article, 15, (, 1, ), the, family, court, may...",145,135,...,[not],True,,False,87.179487,The family court may decide to commence an ass...,YES,tensor(0.9751),88.888889,The issuance of a decision for commencement of...
41,WO-AD-R02-1-U,0.0,Article 18 (1) If the grounds prescribed in th...,If the grounds of commencement of assistance c...,N,"[if, the, grounds, of, assistance, are, no, lo...",31,"[article, 18, (, 1, ), if, the, grounds, presc...",127,19,...,[no],True,,False,82.142857,If the grounds of assistance are no longer pre...,YES,tensor(0.9163),60.0,If the grounds for the commencement of assista...


In [None]:
# Print every regenerated hypothesis in full, one per line, so the complete
# sentences can be read.
for mod_hypothesis in df_bad['Mod Hypothesis']:
    print(mod_hypothesis)

A person who has caused the death of another is obligated to compensate for any loss or harm suffered by the father, mother, spouse, and children of the victim, regardless of whether their property rights have been violated.
If a mortgage that does not adhere to the terms of the contract is registered on immovables that have been purchased, the buyer is entitled to withhold payment for the price until the completion of the procedures for extinguishing the mortgage. In such situations, the seller can request that the buyer promptly initiates the process to eliminate the mortgage.
The issuance of a decision for commencement of assistance at the request of someone other than the person in question requires the consent of the person in question provided the family court deems it appropriate.
If the grounds for the commencement of assistance are no longer present, the family court may cancel the decision for the commencement of assistance without the need for any party to request it.


In [None]:
# Merge the regenerated rows back into the main frame, matching rows on 'id'.
# df_bad is re-indexed into a temporary frame (and before df is touched), so
# df_bad itself keeps its 'id' column and this cell can be re-run safely.
# NOTE(review): DataFrame.update only overwrites cells whose index label and
# column name exist in both frames - confirm df and df_bad use the same id
# format and column names, otherwise nothing is updated.
df_bad_by_id = df_bad.set_index('id')

df.set_index('id', inplace=True)
df.update(df_bad_by_id)
df.reset_index(inplace=True)

In [None]:
# Inspect the main frame after the regenerated rows were merged back in.
df

Unnamed: 0,id,label,premise,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,percent_overlap,Mod Hypothesis,Validation,similarity_scores,Mod Hypothesis percent overlap
0,H18-28-4,0.0,Article 465-3\n(1) If a contract for revolving...,For the principal obligation of a contract for...,N,"['for', 'the', 'principal', 'obligation', 'of'...",58.0,"['article', '465-3', '(', '1', ')', 'if', 'a',...",458.0,47.0,True,[],False,,False,81.034483,"""If the principal crystallization date for a c...",YES,tensor(0.7965),86.111111
1,H19-12-1,0.0,Article 350\nThe provisions of Articles 296 th...,"A right of retention, a pledge, and a mortgage...",N,"['a', 'right', 'of', 'retention', ',', 'a', 'p...",49.0,"['article', '350', 'the', 'provisions', 'of', ...",142.0,41.0,True,[],False,,False,83.673469,"A holder of a statutory lien, a pledge, or a m...",YES,tensor(0.9078),69.565217
2,H19-16-2,0.0,Article 387\n(1) A registered lease may be dul...,A registered lease may be asserted against th...,N,"['a', 'registered', 'lease', 'may', 'be', 'ass...",37.0,"['article', '387', '(', '1', ')', 'a', 'regist...",94.0,26.0,True,[],False,,False,70.27027,A lease that has been registered can be enforc...,YES,tensor(0.8770),78.947368
3,H22-26-U,0.0,Article 646\n(1) A mandatary must deliver to t...,"If the mandatary, received monies and other ...",N,"['if', 'the', 'mandatary', ',', 'received', 'm...",28.0,"['article', '646', '(', '1', ')', 'a', 'mandat...",64.0,21.0,True,[],False,,False,75.0,If the mandatary has received monies and other...,YES,tensor(0.8739),73.076923
4,H23-11-O,0.0,Article 702\n(1) If a manager has incurred ben...,Even in cases where the manager in management ...,N,"['even', 'in', 'cases', 'where', 'the', 'manag...",47.0,"['article', '702', '(', '1', ')', 'if', 'a', '...",98.0,37.0,True,[],False,,False,78.723404,Even if the manager has gone against the princ...,YES,tensor(0.8887),66.666667
5,H24-8-5,0.0,Article 424\n(1) An obligee may demand the cou...,An obligee (B) against (A) may demand the cour...,N,"['an', 'obligee', '(', 'b', ')', 'against', '(...",60.0,"['article', '424', '(', '1', ')', 'an', 'oblig...",179.0,45.0,True,[],False,,False,75.0,An obligee (B) against (A) may demand the cour...,YES,tensor(0.9278),68.421053
6,H24-20-I,0.0,Article 465-3\n(1) If a contract for revolving...,If three years have elapsed from the day of th...,N,"['if', 'three', 'years', 'have', 'elapsed', 'f...",37.0,"['article', '465-3', '(', '1', ')', 'if', 'a',...",481.0,30.0,True,[],False,,False,81.081081,If the day three years have passed from the da...,YES,tensor(0.9136),82.051282
7,H24-27-O,0.0,Article 637\n(1) In the case prescribed in the...,Where delivery of the subject matter is requir...,N,"['where', 'delivery', 'of', 'the', 'subject', ...",33.0,"['article', '637', '(', '1', ')', 'in', 'the',...",152.0,27.0,True,[],False,,False,81.818182,Where the contractor delivers the subject matt...,YES,tensor(0.9723),85.0
8,H25-13-3,0.0,Article 366\n(1) A pledgee may directly collec...,If monetary claim is the subject matter of a p...,N,"['if', 'monetary', 'claim', 'is', 'the', 'subj...",34.0,"['article', '366', '(', '1', ')', 'a', 'pledge...",152.0,29.0,True,[],False,,False,85.294118,If monies are the subject matter of a pledged ...,YES,tensor(0.8946),88.571429
9,H25-16-4,0.0,Article 370\nA mortgage extends to the things ...,A mortgage shall extend to the buildings on th...,N,"['a', 'mortgage', 'shall', 'extend', 'to', 'th...",19.0,"['article', '370', 'a', 'mortgage', 'extends',...",87.0,15.0,True,[],False,,False,78.947368,A mortgage extends to the buildings on the mor...,YES,tensor(0.9865),88.888889


In [None]:
# Write the word-overlap augmentation frame to Drive, without the index column.
word_overlap_csv_path = "/content/drive/MyDrive/data/task 4/train/data_aug_word_overlap_instances.csv"
df.to_csv(word_overlap_csv_path, index=False)

In [None]:
def build_validation_prompt(hypothesis, paraphrased_hypothesis):
    """Build the prompt asking whether two sentences have the same meaning.

    Args:
        hypothesis: Text inserted as "Sentence 1".
        paraphrased_hypothesis: Text inserted as "Sentence 2".

    Returns:
        The prompt string, which instructs the reader to answer only YES or NO.
    """
    # The template's leading newline and indentation are part of the prompt
    # text and are kept exactly as written.
    template = '''
        Sentence 1: {first}
        Sentence 2: {second}

        Does the above two sentences contain the same meaning?
        You must provide the answer only as YES or NO.
    '''
    return template.format(first=hypothesis, second=paraphrased_hypothesis)

In [None]:
# Re-validate every paraphrase against its original hypothesis and print the
# rows the validator rejects.
# `total` lets tqdm show a real progress bar; iterrows() has no length.
for index, row in tqdm(df.iterrows(), total=len(df)):
    hypothesis = row['orginal_hypothesis']
    paraphrased_hyp = row['hypothesis']

    val_prompt = build_validation_prompt(hypothesis, paraphrased_hyp)

    val_completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are tasked with validating the provided sentences."},
            # Send the prompt built for THIS row, not a prompt variable left
            # over from an earlier cell.
            {"role": "user", "content": val_prompt}
        ]
    )

    validation = val_completion.choices[0].message['content'].strip()

    # Normalise case and a trailing period so answers such as "No." or "no"
    # are still treated as a rejection.
    if validation.rstrip(".").strip().upper() == "NO":
        print(f"Premise : {row['premise']}")
        print(f"Hypothesis: {hypothesis}")
        print(f"Mod : {paraphrased_hyp}")

49it [01:26,  1.77s/it]


# Change ID

In [None]:
# Load the two augmentation files through read_data (defined elsewhere in the
# notebook; presumably returns a DataFrame - the results are used as one below).
contradiction_csv_path = "/content/drive/MyDrive/data/task 4/train/data_aug_contradiction_instances.csv"
word_overlap_aug_csv_path = "/content/drive/MyDrive/data/task 4/train/data_aug_word_overlap_instances.csv"

cw_aug_df = read_data(contradiction_csv_path)
wo_aug_df = read_data(word_overlap_aug_csv_path)

In [None]:
def change_cw_id(org_id):
    """Return ``org_id`` carrying the ``CW-AD-`` prefix.

    The function is idempotent: an id that already starts with ``CW-AD-`` is
    returned unchanged, so re-running the prefixing step on data that was
    already prefixed does not produce ``CW-AD-CW-AD-...``.

    Args:
        org_id: The id string to prefix.

    Returns:
        The prefixed id string.

    Raises:
        TypeError: If ``org_id`` is not a string.
    """
    prefix = "CW-AD"
    # Concatenate first so a non-string id still fails with TypeError.
    new_id = prefix + "-" + org_id
    if org_id.startswith(prefix + "-"):
        return org_id
    return new_id

def change_wo_id(org_id):
    """Return ``org_id`` carrying the ``WO-AD-`` prefix.

    The function is idempotent: an id that already starts with ``WO-AD-`` is
    returned unchanged, so re-running the prefixing step on data that was
    already prefixed does not produce ``WO-AD-WO-AD-...``.

    Args:
        org_id: The id string to prefix.

    Returns:
        The prefixed id string.

    Raises:
        TypeError: If ``org_id`` is not a string.
    """
    prefix = "WO-AD"
    # Concatenate first so a non-string id still fails with TypeError.
    new_id = prefix + "-" + org_id
    if org_id.startswith(prefix + "-"):
        return org_id
    return new_id


# Rewrite the 'id' column of each augmented frame with its prefixing function.
cw_aug_df['id'] = cw_aug_df['id'].map(change_cw_id)
wo_aug_df['id'] = wo_aug_df['id'].map(change_wo_id)


In [None]:
# rename columns
# After the rename, 'orginal_hypothesis' holds the source sentence and
# 'hypothesis' holds the augmented ('Mod Hypothesis') sentence.
hypothesis_renames = {"hypothesis": "orginal_hypothesis", "Mod Hypothesis": "hypothesis"}

# Only rename a frame that still has 'Mod Hypothesis'. On an already renamed
# frame the mapping would relabel the augmented 'hypothesis' column as
# 'orginal_hypothesis', giving a duplicate or mislabelled column.
if "Mod Hypothesis" in cw_aug_df.columns:
    cw_aug_df = cw_aug_df.rename(columns=hypothesis_renames)
if "Mod Hypothesis" in wo_aug_df.columns:
    wo_aug_df = wo_aug_df.rename(columns=hypothesis_renames)

In [None]:
# Inspect the contradiction-augmented frame after id prefixing and column renaming.
cw_aug_df

Unnamed: 0,id,label,premise,orginal_hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,...,detected_subsequence,is_subsequence_heuristic,percent_overlap,num_negations,hypothesis,Validation,similarity_scores,Mod negations,Mod negation_count,Mod contains_negation
0,CW-AD-H18-2-4,1.0,Article 702\n(1) If a manager has incurred ben...,In cases where a person repairs the fence of a...,Y,"['in', 'cases', 'where', 'a', 'person', 'repai...",62.0,"['article', '702', '(', '1', ')', 'if', 'a', '...",98.0,21.0,...,,False,0.33871,1.0,"""When someone fixes a neighbor's fence affecte...",YES,tensor(0.8492),['not'],1.0,True
1,CW-AD-H18-23-I,1.0,Article 537\n(1) If one of the parties promise...,In cases where person A sold a jewel to person...,Y,"['in', 'cases', 'where', 'person', 'a', 'sold'...",101.0,"['article', '537', '(', '1', ')', 'if', 'one',...",120.0,32.0,...,,False,0.316832,1.0,"""In a scenario where individual A transfers a ...",YES,tensor(0.8756),['not'],1.0,True
2,CW-AD-H18-26-1,1.0,Article 255\nIf one of co-owners waives intere...,In cases where person A and person B co-own bu...,Y,"['in', 'cases', 'where', 'person', 'a', 'and',...",40.0,"['article', '255', 'if', 'one', 'of', 'co-owne...",22.0,7.0,...,,False,0.175,1.0,"""If one co-owner of a property, shared equally...",YES,tensor(0.8055),['no'],1.0,True
3,CW-AD-H19-11-3,1.0,"Article 177\nAcquisitions of, losses of and ch...",In a case where A bought a registered building...,Y,"['in', 'a', 'case', 'where', 'a', 'bought', 'a...",52.0,"['article', '177', 'acquisitions', 'of', ',', ...",54.0,15.0,...,,False,0.288462,1.0,"""If A purchases a registered building from B, ...",YES,tensor(0.9387),['not'],1.0,True
4,CW-AD-H19-12-4,1.0,Article 343\nA thing that cannot be transferre...,In cases where a contract that creates a pled...,Y,"['in', 'cases', 'where', 'a', 'contract', 'tha...",45.0,"['article', '343', 'a', 'thing', 'that', 'can'...",22.0,11.0,...,,False,0.244444,1.0,"""If a commitment is established to create a pl...",YES,tensor(0.8117),['not'],1.0,True
5,CW-AD-H20-23-5,1.0,Article 588\nIf any person has an obligation t...,A quasi-loan contract shall not become effecti...,Y,"['a', 'quasi-loan', 'contract', 'shall', 'not'...",18.0,"['article', '588', 'if', 'any', 'person', 'has...",48.0,5.0,...,,False,0.277778,2.0,"""A sham loan agreement will not come into effe...",YES,tensor(0.7973),['not'],1.0,True
6,CW-AD-H21-19-A,1.0,Article 447\n(1) The guarantee obligation incl...,Assuming that a recovery obligation due to can...,Y,"['assuming', 'that', 'a', 'recovery', 'obligat...",103.0,"['article', '447', '(', '1', ')', 'the', 'guar...",62.0,24.0,...,,False,0.23301,2.0,"""In instances where a contract cancellation re...",YES,tensor(0.9141),['not'],1.0,True
7,CW-AD-H22-15-U,1.0,Article 677\nA partner's creditor may not exer...,An obligor of a partnership cannot set off his...,Y,"['an', 'obligor', 'of', 'a', 'partnership', 'c...",18.0,"['article', '677', 'a', 'partner', ""'s"", 'cred...",19.0,6.0,...,,False,0.333333,1.0,"""A partner's debtor does not have the ability ...",YES,tensor(0.7270),['not'],1.0,True
8,CW-AD-H22-21-4,1.0,"Article 492\nUpon tendering the performance, t...",In cases where the obligor of a monetary debt ...,Y,"['in', 'cases', 'where', 'the', 'obligor', 'of...",42.0,"['article', '492', 'upon', 'tendering', 'the',...",26.0,8.0,...,,False,0.190476,1.0,"""When the person obliged to pay a financial ob...",YES,tensor(0.7586),['not'],1.0,True
9,CW-AD-H23-9-2,1.0,Article 192\nA person that commences the posse...,Provisions for immediate acquisition do not ap...,Y,"['provisions', 'for', 'immediate', 'acquisitio...",26.0,"['article', '192', 'a', 'person', 'that', 'com...",41.0,7.0,...,,False,0.269231,1.0,"""If an inheritor takes over another individual...",YES,tensor(0.8679),['not'],1.0,True


In [None]:
# Inspect the word-overlap-augmented frame after id prefixing and column renaming.
wo_aug_df

Unnamed: 0,id,label,premise,orginal_hypothesis,labels,...,percent_overlap,hypothesis,Validation,similarity_scores,Mod Hypothesis percent overlap
0,WO-AD-H18-28-4,0.0,Article 465-3\n(1) If a contract for revolving...,For the principal obligation of a contract for...,N,...,81.034483,"""If the principal crystallization date for a c...",YES,tensor(0.7965),86.111111
1,WO-AD-H19-12-1,0.0,Article 350\nThe provisions of Articles 296 th...,"A right of retention, a pledge, and a mortgage...",N,...,83.673469,"A holder of a statutory lien, a pledge, or a m...",YES,tensor(0.9078),69.565217
2,WO-AD-H19-16-2,0.0,Article 387\n(1) A registered lease may be dul...,A registered lease may be asserted against th...,N,...,70.27027,A lease that has been registered can be enforc...,YES,tensor(0.8770),78.947368
3,WO-AD-H22-26-U,0.0,Article 646\n(1) A mandatary must deliver to t...,"If the mandatary, received monies and other ...",N,...,75.0,If the mandatary has received monies and other...,YES,tensor(0.8739),73.076923
4,WO-AD-H23-11-O,0.0,Article 702\n(1) If a manager has incurred ben...,Even in cases where the manager in management ...,N,...,78.723404,Even if the manager has gone against the princ...,YES,tensor(0.8887),66.666667
5,WO-AD-H24-8-5,0.0,Article 424\n(1) An obligee may demand the cou...,An obligee (B) against (A) may demand the cour...,N,...,75.0,An obligee (B) against (A) may demand the cour...,YES,tensor(0.9278),68.421053
6,WO-AD-H24-20-I,0.0,Article 465-3\n(1) If a contract for revolving...,If three years have elapsed from the day of th...,N,...,81.081081,If the day three years have passed from the da...,YES,tensor(0.9136),82.051282
7,WO-AD-H24-27-O,0.0,Article 637\n(1) In the case prescribed in the...,Where delivery of the subject matter is requir...,N,...,81.818182,Where the contractor delivers the subject matt...,YES,tensor(0.9723),85.0
8,WO-AD-H25-13-3,0.0,Article 366\n(1) A pledgee may directly collec...,If monetary claim is the subject matter of a p...,N,...,85.294118,If monies are the subject matter of a pledged ...,YES,tensor(0.8946),88.571429
9,WO-AD-H25-16-4,0.0,Article 370\nA mortgage extends to the things ...,A mortgage shall extend to the buildings on th...,N,...,78.947368,A mortgage extends to the buildings on the mor...,YES,tensor(0.9865),88.888889


In [None]:
# Write both augmented frames back to their CSV files (index column omitted).
for aug_frame, aug_csv_path in (
    (cw_aug_df, "/content/drive/MyDrive/data/task 4/train/data_aug_contradiction_instances.csv"),
    (wo_aug_df, "/content/drive/MyDrive/data/task 4/train/data_aug_word_overlap_instances.csv"),
):
    aug_frame.to_csv(aug_csv_path, index=False)