In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 k

In [None]:
import pandas as pd
from ast import literal_eval
from sentence_transformers import SentenceTransformer, util
import nltk
nltk.download('punkt')
from collections import Counter
from tqdm import tqdm
import spacy
import re
import string


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Negation cue words; tokens are compared against these entries verbatim (case-sensitive).
negation_words = [
    "not", "no", "n't", "none", "neither", "never", "nobody",
    "nothing", "nowhere", "hardly", "scarcely", "barely", "rarely", "seldom",
]


def extract_negation(tokens):
    """Return the tokens that appear in ``negation_words``.

    Order and duplicates are preserved, so a sentence containing "not" twice
    yields ``["not", "not"]``.
    """
    found = []
    for word in tokens:
        if word in negation_words:
            found.append(word)
    return found

def calculate_overlap(premise, hypothesis):
    """Count the tokens that a premise and a hypothesis have in common.

    Both sentences are lower-cased and split with ``nltk.word_tokenize``.
    The result is the size of the multiset intersection of the two token
    lists: a token occurring twice in both sentences contributes 2, a token
    occurring twice in one and once in the other contributes 1.

    Returns:
        int: number of shared token occurrences (0 when nothing is shared).
    """
    premise_counts = Counter(nltk.word_tokenize(premise.lower()))
    hypothesis_counts = Counter(nltk.word_tokenize(hypothesis.lower()))

    # Counter & Counter keeps, per token, the smaller of the two counts.
    shared = premise_counts & hypothesis_counts
    return sum(shared.values())

def detect_word_overlap_bias(df):
    """Add word-overlap feature columns to ``df``.

    Args:
        df: DataFrame with string ``premise`` and ``hypothesis`` columns.

    Returns:
        The same DataFrame object, modified in place, with two new columns:

        * ``overlap`` -- result of ``calculate_overlap(premise, hypothesis)``
          for the row (integer).
        * ``is_word_overlap`` -- ``True`` when ``overlap`` is greater than 0.
    """
    # Iterate over the two columns directly instead of ``df.apply(axis=1)``:
    # on an empty frame ``apply`` hands back a DataFrame rather than a Series,
    # which cannot be assigned to a single column.
    overlaps = [
        calculate_overlap(premise, hypothesis)
        for premise, hypothesis in zip(df['premise'], df['hypothesis'])
    ]
    df['overlap'] = pd.Series(overlaps, index=df.index, dtype='int64')

    # Vectorised comparison already yields a boolean column.
    df['is_word_overlap'] = df['overlap'] > 0

    return df

def remove_punctuation(input_string):
    """Return ``input_string`` with every character of ``string.punctuation`` dropped.

    Only the ASCII punctuation set is removed; whitespace, digits, letters and
    non-ASCII symbols are left untouched.
    """
    return "".join(char for char in input_string if char not in string.punctuation)

def detect_subsequence(premise: str, hypothesis: str):
    """Check whether the hypothesis occurs as a contiguous word sequence in the premise.

    Both texts are lower-cased, stripped of ASCII punctuation via
    ``remove_punctuation`` and have runs of whitespace (including newlines)
    collapsed to single spaces before matching, so differences in line breaks
    or spacing do not hide a match.

    Args:
        premise: text to search in.
        hypothesis: text to search for.

    Returns:
        tuple: ``(hypothesis, True)`` with the hypothesis exactly as passed in
        when it is found on word boundaries, otherwise ``("", False)``.
    """
    premise_text = " ".join(remove_punctuation(premise.lower()).split())
    hypothesis_text = " ".join(remove_punctuation(hypothesis.lower()).split())

    # An empty (or punctuation-only) hypothesis would otherwise produce a
    # pattern consisting solely of boundary assertions, which matches almost
    # any premise.
    if not hypothesis_text:
        return ("", False)

    # Escape the text so it is always matched literally. The lookarounds
    # behave like \b for text that starts and ends with a word character, and
    # still work when it starts or ends with a non-word symbol.
    pattern = r"(?<!\w)" + re.escape(hypothesis_text) + r"(?!\w)"
    if re.search(pattern, premise_text):
        return (hypothesis, True)
    return ("", False)

def add_features(df):
    """Add token, length, overlap, negation and subsequence features to ``df``.

    Args:
        df: DataFrame with string ``premise`` and ``hypothesis`` columns.

    Returns:
        The DataFrame returned by ``detect_word_overlap_bias`` with these
        additional columns:

        * ``hyp_tokens`` / ``prem_tokens`` -- lower-cased ``nltk`` word tokens.
        * ``hyp_length`` / ``prem_length`` -- number of tokens.
        * ``negations`` -- tokens of the hypothesis found by ``extract_negation``.
        * ``has_negation`` -- ``True`` when ``negations`` is non-empty.
        * ``detected_subsequence`` / ``is_subsequence_heuristic`` -- the two
          elements of the tuple returned by ``detect_subsequence`` for the row.

        Columns are assigned on the passed-in frame, so the input is modified.
    """
    df['hyp_tokens'] = df['hypothesis'].apply(lambda text: nltk.word_tokenize(text.lower()))
    df['hyp_length'] = df['hyp_tokens'].apply(len)
    df['prem_tokens'] = df['premise'].apply(lambda text: nltk.word_tokenize(text.lower()))
    df['prem_length'] = df['prem_tokens'].apply(len)

    df = detect_word_overlap_bias(df)

    df['negations'] = df['hyp_tokens'].apply(extract_negation)
    df['has_negation'] = df['negations'].apply(lambda found: len(found) > 0)

    # Collect the (text, flag) tuples in a plain list and split them
    # explicitly: unpacking ``zip(*df.apply(..., axis=1))`` into two columns
    # breaks when the frame has no rows.
    matches = [
        detect_subsequence(premise, hypothesis)
        for premise, hypothesis in zip(df['premise'], df['hypothesis'])
    ]
    df['detected_subsequence'] = [matched_text for matched_text, _ in matches]
    df['is_subsequence_heuristic'] = pd.Series(
        [is_match for _, is_match in matches], index=df.index, dtype=bool
    )

    return df

# Loaded SentenceTransformer instances keyed by model name, so that repeated
# calls (e.g. one per DataFrame row) reuse the model instead of re-loading it.
_SIMILARITY_MODELS = {}


def _get_similarity_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use only."""
    if model_name not in _SIMILARITY_MODELS:
        _SIMILARITY_MODELS[model_name] = SentenceTransformer(model_name)
    return _SIMILARITY_MODELS[model_name]


def calculate_similarity(hypothesis, paraphrased_hypothesis, model_name='BAAI/bge-large-en-v1.5'):
    """Cosine similarity between the embeddings of two sentences.

    Args:
        hypothesis: first sentence.
        paraphrased_hypothesis: second sentence.
        model_name: sentence-transformers model used for the embeddings. The
            model is loaded once per name and cached for later calls.

    Returns:
        The similarity score converted with ``.numpy()`` (for a scalar torch
        tensor this is a zero-dimensional NumPy array).
    """
    model = _get_similarity_model(model_name)

    # Each sentence is encoded as a one-element batch.
    embeddings1 = model.encode([hypothesis], convert_to_tensor=True)
    embeddings2 = model.encode([paraphrased_hypothesis], convert_to_tensor=True)

    # cos_sim returns a matrix of pairwise scores; with one sentence on each
    # side the only entry is [0][0].
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    similarity_score = cosine_scores[0][0]

    # Move to host memory first: .numpy() is not available for a tensor that
    # lives on a GPU, and .cpu() is a no-op for a tensor already on the CPU.
    return similarity_score.detach().cpu().numpy()

In [None]:
def read_data(file_path):
    """Load the CSV at ``file_path`` into a DataFrame using pandas' default options."""
    return pd.read_csv(file_path)

In [None]:
# Load the two augmented training sets: contradiction instances first,
# word-overlap instances second.
cw_aug_df, wo_aug_df = (
    read_data(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/data/task 4/train/data_aug_contradiction_instances.csv",
        "/content/drive/MyDrive/data/task 4/train/data_aug_word_overlap_instances.csv",
    )
)

In [None]:
# Derive the token, overlap, negation and subsequence columns for both frames.
cw_aug_df, wo_aug_df = (add_features(frame) for frame in (cw_aug_df, wo_aug_df))

In [None]:
# Score how close each augmented hypothesis stays to the hypothesis it was
# derived from; the same computation is applied to both frames.
for aug_df in (cw_aug_df, wo_aug_df):
    aug_df['similarity_score'] = aug_df.apply(
        lambda row: calculate_similarity(row['orginal_hypothesis'], row['hypothesis']),
        axis=1,
    )


In [None]:
# Inspect the contradiction-augmented frame together with its derived feature columns.
cw_aug_df

Unnamed: 0,id,label,premise,orginal_hypothesis,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,similarity_score
0,CW-AD-H18-2-4,1,Article 702\n(1) If a manager has incurred ben...,In cases where a person repairs the fence of a...,"""When someone fixes a neighbor's fence affecte...",Y,"[``, when, someone, fixes, a, neighbor, 's, fe...",52,"[article, 702, (, 1, ), if, a, manager, has, i...",98,18,True,[not],True,,False,0.849189
1,CW-AD-H18-23-I,1,Article 537\n(1) If one of the parties promise...,In cases where person A sold a jewel to person...,"""In a scenario where individual A transfers a ...",Y,"[``, in, a, scenario, where, individual, a, tr...",106,"[article, 537, (, 1, ), if, one, of, the, part...",120,32,True,[not],True,,False,0.875629
2,CW-AD-H18-26-1,1,Article 255\nIf one of co-owners waives intere...,In cases where person A and person B co-own bu...,"""If one co-owner of a property, shared equally...",Y,"[``, if, one, co-owner, of, a, property, ,, sh...",38,"[article, 255, if, one, of, co-owners, waives,...",22,6,True,[no],True,,False,0.805493
3,CW-AD-H19-11-3,1,"Article 177\nAcquisitions of, losses of and ch...",In a case where A bought a registered building...,"""If A purchases a registered building from B, ...",Y,"[``, if, a, purchases, a, registered, building...",52,"[article, 177, acquisitions, of, ,, losses, of...",54,16,True,[not],True,,False,0.938702
4,CW-AD-H19-12-4,1,Article 343\nA thing that cannot be transferre...,In cases where a contract that creates a pled...,"""If a commitment is established to create a pl...",Y,"[``, if, a, commitment, is, established, to, c...",45,"[article, 343, a, thing, that, can, not, be, t...",22,11,True,[not],True,,False,0.811719
5,CW-AD-H20-23-5,1,Article 588\nIf any person has an obligation t...,A quasi-loan contract shall not become effecti...,A quasi-loan contract shall not become effecti...,Y,"[a, quasi-loan, contract, shall, not, become, ...",21,"[article, 588, if, any, person, has, an, oblig...",48,8,True,"[not, not]",True,,False,0.9366
6,CW-AD-H21-19-A,1,Article 447\n(1) The guarantee obligation incl...,Assuming that a recovery obligation due to can...,"""In instances where a contract cancellation re...",Y,"[``, in, instances, where, a, contract, cancel...",86,"[article, 447, (, 1, ), the, guarantee, obliga...",62,21,True,[not],True,,False,0.914059
7,CW-AD-H22-15-U,1,Article 677\nA partner's creditor may not exer...,An obligor of a partnership cannot set off his...,A debtor to a partnership is not able to set o...,Y,"[a, debtor, to, a, partnership, is, not, able,...",20,"[article, 677, a, partner, 's, creditor, may, ...",19,5,True,[not],True,,False,0.806879
8,CW-AD-H22-21-4,1,"Article 492\nUpon tendering the performance, t...",In cases where the obligor of a monetary debt ...,"""When the person obliged to pay a financial ob...",Y,"[``, when, the, person, obliged, to, pay, a, f...",42,"[article, 492, upon, tendering, the, performan...",26,9,True,[not],True,,False,0.758647
9,CW-AD-H23-9-2,1,Article 192\nA person that commences the posse...,Provisions for immediate acquisition do not ap...,Provisions for immediate acquisition are not a...,Y,"[provisions, for, immediate, acquisition, are,...",28,"[article, 192, a, person, that, commences, the...",41,4,True,[not],True,,False,0.943218


In [None]:
# Inspect the word-overlap-augmented frame together with its derived feature columns.
wo_aug_df

Unnamed: 0,id,label,premise,orginal_hypothesis,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic
0,WO-AD-H18-28-4,0,Article 465-3\n(1) If a contract for revolving...,For the principal obligation of a contract for...,"""If the principal crystallization date for a c...",N,"[``, if, the, principal, crystallization, date...",36,"[article, 465-3, (, 1, ), if, a, contract, for...",458,31,True,[],False,,False
1,WO-AD-H19-12-1,0,Article 350\nThe provisions of Articles 296 th...,"A right of retention, a pledge, and a mortgage...","A holder of a statutory lien, a pledge, or a m...",N,"[a, holder, of, a, statutory, lien, ,, a, pled...",46,"[article, 350, the, provisions, of, articles, ...",142,32,True,[],False,,False
2,WO-AD-H19-16-2,0,Article 387\n(1) A registered lease may be dul...,A registered lease may be asserted against th...,A registered lease can be asserted against a m...,N,"[a, registered, lease, can, be, asserted, agai...",38,"[article, 387, (, 1, ), a, registered, lease, ...",94,25,True,[],False,,False
3,WO-AD-H22-26-U,0,Article 646\n(1) A mandatary must deliver to t...,"If the mandatary, received monies and other ...",If the mandatary has received monies and other...,N,"[if, the, mandatary, has, received, monies, an...",26,"[article, 646, (, 1, ), a, mandatary, must, de...",64,19,True,[],False,,False
4,WO-AD-H23-11-O,0,Article 702\n(1) If a manager has incurred ben...,Even in cases where the manager in management ...,Even if the manager has gone against the princ...,N,"[even, if, the, manager, has, gone, against, t...",30,"[article, 702, (, 1, ), if, a, manager, has, i...",98,20,True,[],False,,False
5,WO-AD-H24-8-5,0,Article 424\n(1) An obligee may demand the cou...,An obligee (B) against (A) may demand the cour...,An obligee (B) against (A) may demand the cour...,N,"[an, obligee, (, b, ), against, (, a, ), may, ...",57,"[article, 424, (, 1, ), an, obligee, may, dema...",179,39,True,[],False,,False
6,WO-AD-H24-20-I,0,Article 465-3\n(1) If a contract for revolving...,If three years have elapsed from the day of th...,If the day three years have passed from the da...,N,"[if, the, day, three, years, have, passed, fro...",39,"[article, 465-3, (, 1, ), if, a, contract, for...",481,32,True,[],False,,False
7,WO-AD-H24-27-O,0,Article 637\n(1) In the case prescribed in the...,Where delivery of the subject matter is requir...,Where the contractor delivers the subject matt...,N,"[where, the, contractor, delivers, the, subjec...",40,"[article, 637, (, 1, ), in, the, case, prescri...",152,34,True,[],False,,False
8,WO-AD-H25-13-3,0,Article 366\n(1) A pledgee may directly collec...,If monetary claim is the subject matter of a p...,If monies are the subject matter of a pledged ...,N,"[if, monies, are, the, subject, matter, of, a,...",35,"[article, 366, (, 1, ), a, pledgee, may, direc...",152,31,True,[],False,,False
9,WO-AD-H25-16-4,0,Article 370\nA mortgage extends to the things ...,A mortgage shall extend to the buildings on th...,A mortgage shall extend to the buildings on th...,N,"[a, mortgage, shall, extend, to, the, building...",17,"[article, 370, a, mortgage, extends, to, the, ...",87,12,True,[],False,,False


In [None]:
# Share of hypothesis tokens that also occur in the premise, in percent.
wo_aug_df['percent_overlap'] = (wo_aug_df['overlap'] / wo_aug_df['hyp_length']) * 100

# Rows that do not fit the word-overlap pattern: at most 50% overlap, or a
# negation in the hypothesis. `has_negation` is already boolean, so it is used
# as a mask directly. The previous `... | has_negation == "True"` parsed as
# `(... | has_negation) == "True"` because `|` binds tighter than `==`, which
# compared booleans with a string and therefore never selected any row.
wo_aug_df[(wo_aug_df['percent_overlap'] <= 50) | wo_aug_df['has_negation']]

Unnamed: 0,id,label,premise,orginal_hypothesis,hypothesis,labels,hyp_tokens,hyp_length,prem_tokens,prem_length,overlap,is_word_overlap,negations,has_negation,detected_subsequence,is_subsequence_heuristic,percent_overlap


In [None]:
# Write both frames back to the CSV files they were loaded from (this
# overwrites the inputs), leaving out the DataFrame index.
for frame, csv_path in (
    (cw_aug_df, "/content/drive/MyDrive/data/task 4/train/data_aug_contradiction_instances.csv"),
    (wo_aug_df, "/content/drive/MyDrive/data/task 4/train/data_aug_word_overlap_instances.csv"),
):
    frame.to_csv(csv_path, index=False)