In [None]:
# assigntools package is a course specific collection of useful tools
! rm -fr assigntools # helps to rerun this cell witthout errors, if recloning needed
! git clone https://github.com/kovvalsky/assigntools.git

import spacy
from tqdm import tqdm
import pandas as pd
from assigntools.LoLa.read_nli import snli_jsonl2dict, sen2anno_from_nli_problems
# from assigntools.LoLa.sen_analysis import spacy_process_sen2tok, display_doc_dep
from nltk.tree import Tree

'rm' is not recognized as an internal or external command,
operable program or batch file.
fatal: destination path 'assigntools' already exists and is not an empty directory.


## Read data

In [None]:
# Get SNLI data
# !wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip
# !unzip snli_1.0.zip
# !rm -r __MACOSX/ snli_1.0/*_test*

In [2]:
# Takes ~1min to read and pre-process the data.
# By default it reads only the problems that have a gold label.
# SNLI is a dict {part: {problem_id: problem_info}}
# S2A is a dict {sentence: sentence annotation dict}
SNLI, S2A = snli_jsonl2dict('snli_1.0')

Found .json files for ['dev', 'test', 'train'] parts
processing DEV:	

0it [00:00, ?it/s]

10000it [00:00, 15965.62it/s]


158 problems without a gold label were ignored
0 problems have a wrong annotator label
9842 problems were returned
processing TEST:	

10000it [00:00, 17746.49it/s]


176 problems without a gold label were ignored
0 problems have a wrong annotator label
9824 problems were returned
processing TRAIN:	

550152it [00:30, 18223.03it/s]

785 problems without a gold label were ignored
198 problems have a wrong annotator label
549169 problems were returned
Most common wrong annotator labels: //(198)





In [6]:
# Get one dictionary of sentence->annotation mappings per split (train, dev, test),
# each built from that split's problems and the shared S2A annotations.
S2A_train = sen2anno_from_nli_problems(SNLI['train'], S2A)
S2A_dev = sen2anno_from_nli_problems(SNLI['dev'], S2A)
S2A_test = sen2anno_from_nli_problems(SNLI['test'], S2A)

## Create features

In [7]:
from spacy import tokens as spacy_tokens  
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet as wn
# spaCy pipeline; problem2features calls it on the premise and hypothesis strings
nlp = spacy.load("en_core_web_sm")
# English stopwords as a set, used for the membership tests in stopword_overlap
STOPWORDS = set(stopwords.words('english'))

# ============================
# Shallow Features
# ============================

def has_negation(sen, anno):
    """
    Return 1 if any token of the sentence is a negation word, otherwise 0.
    (Single-sentence)

    'sen' is the raw sentence (not used here); 'anno' is the annotation dict
    whose 'tok' entry holds the token list. Matching ignores case.
    """
    negation_markers = {"no", "n't", "not", "never", "none", "no one"}
    for token in anno['tok']:
        if token.lower() in negation_markers:
            return 1
    return 0


def giveaway_words(sen, anno):
    """
    Flag sentences that mention one of the "sleep" giveaway words.
    (Single-sentence)

    'sen' is the raw sentence (not used here); the tokens are read from anno['tok']
    and compared case-insensitively. Returns 1 on a hit, 0 otherwise.
    """
    giveaways = ("sleep", "sleeping", "asleep", "slept")
    lowered_tokens = (token.lower() for token in anno['tok'])
    return int(any(token in giveaways for token in lowered_tokens))


def lexical_overlap(p_tokens, h_tokens):
    """
    Share of the hypothesis vocabulary that also occurs in the premise.
    (Pairwise)

    Both token lists are lower-cased and de-duplicated first; the result is
    (# shared word types) / (# word types in hypothesis), or 0.0 when the
    hypothesis has no tokens.
    """
    premise_vocab = {token.lower() for token in p_tokens}
    hypothesis_vocab = {token.lower() for token in h_tokens}
    if not hypothesis_vocab:
        return 0.0
    shared = premise_vocab.intersection(hypothesis_vocab)
    return len(shared) / len(hypothesis_vocab)


def length_difference(p_tokens, h_tokens):
    """
    Return the signed difference in length: (# premise tokens) - (# hypothesis tokens).
    The value is negative when the hypothesis is the longer sentence.
    (Pairwise)
    """
    premise_len = len(p_tokens)
    hypothesis_len = len(h_tokens)
    return premise_len - hypothesis_len


def stopword_overlap(p_tokens, h_tokens):
    """
    Share of the hypothesis stopwords that also occur in the premise.
    (Pairwise)

    Tokens are lower-cased and only those found in STOPWORDS are kept; the result
    is (# shared stopword types) / (# stopword types in hypothesis), or 0.0 when
    the hypothesis contains no stopword.
    """
    premise_stops = {low for low in (tok.lower() for tok in p_tokens) if low in STOPWORDS}
    hypothesis_stops = {low for low in (tok.lower() for tok in h_tokens) if low in STOPWORDS}
    if not hypothesis_stops:
        return 0.0
    shared = premise_stops.intersection(hypothesis_stops)
    return len(shared) / len(hypothesis_stops)


def length_ratios(p_tokens, h_tokens):
    """
    Return both token-count ratios as a list: [len_p/len_h, len_h/len_p].
    If either sentence is empty, [0.0, 0.0] is returned instead of dividing by zero.
    (Pairwise)
    """
    n_premise, n_hypothesis = len(p_tokens), len(h_tokens)
    if not (n_premise and n_hypothesis):
        return [0.0, 0.0]
    return [n_premise / n_hypothesis, n_hypothesis / n_premise]


def ngram_overlap(p_tokens, h_tokens, n=2):
    """
    Share of the distinct hypothesis n-grams (of size n) that also occur in the premise.
    Tokens are compared as given (no lower-casing). Returns 0.0 when the
    hypothesis yields no n-gram.
    (Pairwise)
    """
    premise_grams = set(ngrams(p_tokens, n))
    hypothesis_grams = set(ngrams(h_tokens, n))
    if not hypothesis_grams:
        return 0.0
    shared = [gram for gram in hypothesis_grams if gram in premise_grams]
    return len(shared) / len(hypothesis_grams)



def contains_numbers(tokens):
    """
    Return 1 if at least one token consists only of digits, else 0.
    Single-sentence helper; call it once per sentence (premise, hypothesis)
    to use it in a pairwise setting.
    """
    for token in tokens:
        if token.isdigit():
            return 1
    return 0


def is_question(h_tokens):
    """
    Return 1 if the token list looks like a question, else 0.
    A sentence counts as a question when one of its tokens is a question word
    (case-insensitive) or when any token contains a '?'.
    Single-sentence helper, typically applied to the hypothesis.
    """
    wh_words = ("who", "what", "where", "when", "why", "how")
    has_wh_word = any(token.lower() in wh_words for token in h_tokens)
    return 1 if has_wh_word or any('?' in token for token in h_tokens) else 0


def negation_only_in_hypothesis(p_tokens, h_tokens):
    """
    Pairwise: returns 1 when a negation word occurs in the hypothesis tokens
    but in none of the premise tokens, else 0. Matching is case-insensitive.
    """
    negation_markers = frozenset(
        ("no", "n't", "not", "never", "none", "no one", "nobody")
    )
    premise_negated = any(token.lower() in negation_markers for token in p_tokens)
    hypothesis_negated = any(token.lower() in negation_markers for token in h_tokens)
    if premise_negated:
        return 0
    return 1 if hypothesis_negated else 0

# ============================
# Smart Features
# ============================

# Two-level memo table: wordnet_similarity_cache[word1][word2] -> similarity score
wordnet_similarity_cache = {}

def cached_wordnet_similarity(word1, word2):
    """
    Memoized WordNet similarity of two words.

    The score is wup_similarity between the first synset of each word, or 0.0
    when either word has no synset or the similarity is undefined. Results are
    stored in wordnet_similarity_cache under [word1][word2], so a repeated
    (word1, word2) pair is answered without querying WordNet again.
    """
    scores_for_word1 = wordnet_similarity_cache.setdefault(word1, {})
    if word2 not in scores_for_word1:
        senses1 = wn.synsets(word1)
        senses2 = wn.synsets(word2)
        score = 0.0
        if senses1 and senses2:
            # wup_similarity may return None; treat that as "no similarity"
            score = senses1[0].wup_similarity(senses2[0]) or 0.0
        scores_for_word1[word2] = score
    return scores_for_word1[word2]

def wordnet_lexical_relations(p_tokens, h_tokens, threshold=0.8):
    """
    Fraction of hypothesis tokens that have a WordNet-similar premise token.

    A hypothesis token counts as matched as soon as one premise token reaches
    a cached_wordnet_similarity score >= threshold (both lower-cased).
    The count is divided by the number of hypothesis tokens; an empty
    hypothesis gives 0.0.
    """
    if not h_tokens:
        return 0.0
    matched = sum(
        1
        for hyp_token in h_tokens
        # any() stops at the first premise token that is similar enough
        if any(
            cached_wordnet_similarity(hyp_token.lower(), prem_token.lower()) >= threshold
            for prem_token in p_tokens
        )
    )
    return matched / len(h_tokens)

def cosine_similarity_feature(p_sentence, h_sentence, tfidf_vectorizer):
    """
    Cosine similarity between the vectorized premise and hypothesis strings.
    - 'tfidf_vectorizer' should be an already fitted TfidfVectorizer object;
      each sentence is transformed on its own and the two vectors are compared.
    """
    premise_vec, hypothesis_vec = (
        tfidf_vectorizer.transform([text]) for text in (p_sentence, h_sentence)
    )
    similarity_matrix = cosine_similarity(premise_vec, hypothesis_vec)
    # one row against one row -> the single entry is the score
    return similarity_matrix[0][0]

def jaccard_similarity(p_tokens, h_tokens):
    """
    Jaccard similarity of the two token sets: (# shared) / (# in either).
    Tokens are compared as given (case-sensitive); two empty inputs give 0.0.
    """
    premise_vocab = set(p_tokens)
    hypothesis_vocab = set(h_tokens)
    combined = premise_vocab.union(hypothesis_vocab)
    if len(combined) == 0:
        return 0.0
    common = premise_vocab.intersection(hypothesis_vocab)
    return len(common) / len(combined)

def _dependency_edges(doc):
    """Collect (token lemma, dependency label, head lemma) triples of all non-ROOT tokens."""
    return {
        (token.lemma_.lower(), token.dep_, token.head.lemma_.lower())
        for token in doc
        if token.dep_ != 'ROOT'
    }


def spacy_tree_similarity(p_doc, h_doc):
    """
    Dependency-tree similarity of two spaCy docs: the Jaccard overlap of their
    labelled edges (lower-cased lemma, dependency label, lower-cased head lemma),
    leaving out 'ROOT' tokens.
    Two docs without any edge score 1.0; if only one of them has edges the score is 0.0.
    """
    premise_edges = _dependency_edges(p_doc)
    hypothesis_edges = _dependency_edges(h_doc)

    if not premise_edges and not hypothesis_edges:
        return 1.0
    if not premise_edges or not hypothesis_edges:
        return 0.0
    shared = premise_edges & hypothesis_edges
    combined = premise_edges | hypothesis_edges
    return len(shared) / len(combined)

def subject_negation_in_hypothesis_spacy(p_doc, h_doc):
    """
    Returns 1 if the hypothesis has a subject token ("nsubj"/"nsubjpass") whose
    lower-cased lemma also is a subject lemma of the premise and which has a
    direct child with dependency label "neg". Otherwise 0.
    Only the subject's own children are inspected, not its head.
    """
    subject_deps = ("nsubj", "nsubjpass")
    premise_subjects = {
        token.lemma_.lower() for token in p_doc if token.dep_ in subject_deps
    }

    for token in h_doc:
        if token.dep_ not in subject_deps:
            continue
        if token.lemma_.lower() not in premise_subjects:
            continue
        if any(child.dep_ == "neg" for child in token.children):
            return 1
    return 0

def contains_adj_adv_in_hypothesis(h_tree_str):
    """
    Check if the hypothesis parse tree has any adjectives or adverbs.
    'h_tree_str' is the Penn Treebank parse (string).

    Returns 1 if some subtree is labelled JJ, JJR, JJS, RB, RBR or RBS, else 0.
    A parse that cannot be read also yields 0 (best effort).
    """
    try:
        tree = Tree.fromstring(h_tree_str)
    except Exception:
        # Best effort: a missing or malformed parse counts as "no adjective/adverb".
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit can still stop a long feature-extraction run.
        return 0
    adj_adv_tags = {"JJ", "JJR", "JJS", "RB", "RBR", "RBS"}
    # any() stops at the first adjective/adverb instead of counting all of them
    return int(any(subtree.label() in adj_adv_tags for subtree in tree.subtrees()))

def sentiment_difference_feature(p_sentence, h_sentence, sia):
    """
    Signed difference of the VADER compound scores: premise minus hypothesis.
    The result is negative when the hypothesis scores higher than the premise.
    'sia' is a SentimentIntensityAnalyzer instance.
    """
    premise_compound, hypothesis_compound = (
        sia.polarity_scores(text)['compound'] for text in (p_sentence, h_sentence)
    )
    return premise_compound - hypothesis_compound


In [8]:
# Make sure the VADER lexicon is available, then build the analyzer that
# sen2features uses for its 'sentiment_compound' feature.
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def sen2features(sen, anno):
    """
    Compute the single-sentence features and merge them with the annotation.

    'sen' is the raw sentence string, 'anno' its annotation dict; the token
    list is read from anno['tok'] and the POS tags from anno['pos'].
    Returns a new dict holding the features below followed by all entries of
    'anno' (an 'anno' key with the same name overrides the feature):
      tok_num            number of tokens
      neg_num            0/1 flag from has_negation
      noun_num           number of NN/NNS tags
      plural_noun        number of NNS/NNPS tags
      contains_numbers   1 if some token is all digits
      is_question        1 if a question word occurs or the sentence contains '?'
      sentiment_compound VADER compound score of the sentence
    """
    tokens = anno['tok']
    pos_tags = anno['pos']
    question_words = ("who", "what", "where", "when", "why", "how")
    mentions_question_word = any(token.lower() in question_words for token in tokens)

    feats = {
        'tok_num': len(tokens),
        'neg_num': has_negation(sen, anno),
        'noun_num': sum(1 for tag in pos_tags if tag in ("NNS", "NN")),
        'plural_noun': sum(1 for tag in pos_tags if tag in ("NNS", "NNPS")),
        'contains_numbers': int(any(token.isdigit() for token in tokens)),
        'is_question': int(mentions_question_word or "?" in sen),
        'sentiment_compound': sia.polarity_scores(sen)['compound'],
    }

    merged = dict(feats)
    merged.update(anno)
    return merged


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ioann\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
def problem2features(sen1, anno1, sen2, anno2,
                     sen_feats=()):
    """
    Takes two sentences (strings) and their annotations (feature dicts)
    and returns a dictionary of feature:value pairs that characterize the
    premise-hypothesis pair (sen1/anno1 = premise, sen2/anno2 = hypothesis).

    Parameters:
      sen1, sen2    raw sentence strings; they are parsed with the spaCy
                    pipeline `nlp` for the dependency-based features.
      anno1, anno2  annotation dicts as returned by sen2features. The keys
                    read here are 'tok', 'pos', 'tok_num', 'neg_num' and
                    'plural_noun', plus 'tree' of the hypothesis.
      sen_feats     names of single-sentence entries to copy into the result,
                    suffixed with '1' (premise) and '2' (hypothesis).
                    Defaults to none. Earlier the argument was overwritten
                    with an empty collection inside the body, so it had no
                    effect; the empty default keeps that output for callers
                    that do not pass it, while an explicit value is now used.

    Returns:
      dict feature name -> value. Keys are inserted in a fixed order, which
      becomes the column order of the DataFrame built by problems2df.

    Empty token/POS lists no longer raise: the normalized features fall back
    to 0 and the first/last word matches to 0.
    """

    features = {}

    # Optional copy of selected single-sentence features
    wanted = set(sen_feats or ())
    features.update({f"{k}1": v for k, v in anno1.items() if k in wanted})
    features.update({f"{k}2": v for k, v in anno2.items() if k in wanted})

    p_tokens = anno1['tok']
    h_tokens = anno2['tok']
    p_pos = anno1['pos']
    h_pos = anno2['pos']

    # NOTE(review): despite the name this is the SUM of the two 0/1 negation
    # flags (0, 1 or 2), not a difference. Kept as is so feature values stay stable.
    features['neg_diff'] = anno1['neg_num'] + anno2['neg_num']

    # `or 1` avoids a ZeroDivisionError when both sentences are empty
    features['token_diff'] = (
        (anno1['tok_num'] - anno2['tok_num'])
        / (max(anno1['tok_num'], anno2['tok_num']) or 1)
    )

    features['same_tokens'] = (
        len(set(p_tokens).intersection(set(h_tokens)))
        / (max(len(p_tokens), len(h_tokens)) or 1)
    )

    features['same_pos_tags'] = (
        len(set(p_pos).intersection(set(h_pos)))
        / (max(len(p_pos), len(h_pos)) or 1)
    )

    features['giveaway'] = giveaway_words(sen2, anno2)

    # Indexing [0] / [-1] is only safe when both token lists are non-empty
    both_nonempty = bool(p_tokens) and bool(h_tokens)
    features['first_word_match'] = int(
        both_nonempty and p_tokens[0].lower() == h_tokens[0].lower()
    )
    features['last_word_match'] = int(
        both_nonempty and p_tokens[-1].lower() == h_tokens[-1].lower()
    )

    features['plural_noun_diff'] = (
        anno1['plural_noun'] - anno2['plural_noun']
    )

    # 1 if the first 'NN' tag sits at the same position in both sentences
    noun_exist = 'NN' in p_pos and 'NN' in h_pos
    features['noun_match'] = 1 if (
        noun_exist and
        (p_pos.index('NN') == h_pos.index('NN'))
    ) else 0

    # 1 if the first possessive pronoun (PRP$) of both sentences is the same token
    features['pronoun_match'] = 0
    if 'PRP$' in p_pos and 'PRP$' in h_pos:
        idx1 = p_pos.index('PRP$')
        idx2 = h_pos.index('PRP$')
        if p_tokens[idx1] == h_tokens[idx2]:
            features['pronoun_match'] = 1

    # 1 if the first cardinal number (CD) of both sentences is the same token
    features['numeric_match'] = 0
    if 'CD' in p_pos and 'CD' in h_pos:
        idx1 = p_pos.index('CD')
        idx2 = h_pos.index('CD')
        if p_tokens[idx1] == h_tokens[idx2]:
            features['numeric_match'] = 1

    # spaCy parses, needed for the dependency-based features further down
    p_doc = nlp(sen1)
    h_doc = nlp(sen2)

    features['length_difference'] = length_difference(p_tokens, h_tokens)

    features['lexical_overlap'] = lexical_overlap(p_tokens, h_tokens)

    features['stopword_overlap'] = stopword_overlap(p_tokens, h_tokens)

    len_ratios_vals = length_ratios(p_tokens, h_tokens)
    features['len_ratio_p_h'] = len_ratios_vals[0]
    features['len_ratio_h_p'] = len_ratios_vals[1]

    features['bigram_overlap'] = ngram_overlap(p_tokens, h_tokens, n=2)
    features['trigram_overlap'] = ngram_overlap(p_tokens, h_tokens, n=3)

    p_neg = has_negation(sen1, anno1)  # 1 if the premise has a negation word, else 0
    h_neg = has_negation(sen2, anno2)  # 1 if the hypothesis has a negation word, else 0
    features['negation_only_in_hyp'] = int((h_neg > 0) and (p_neg == 0))

    features['is_question_h'] = is_question(h_tokens)

    features['contains_numbers_p'] = contains_numbers(p_tokens)
    features['contains_numbers_h'] = contains_numbers(h_tokens)

    features['negation_in_both'] = int(p_neg > 0 and h_neg > 0)

    features['wordnet_lex_rel'] = wordnet_lexical_relations(p_tokens, h_tokens)
    features['jaccard_sim'] = jaccard_similarity(p_tokens, h_tokens)

    features['spacy_tree_sim'] = spacy_tree_similarity(p_doc, h_doc)

    features['subject_neg_hyp'] = subject_negation_in_hypothesis_spacy(p_doc, h_doc)

    features['contains_adj_adv_in_h'] = contains_adj_adv_in_hypothesis(anno2['tree'])

    return features


In [10]:
def problems2df(data_dict, sen2af):
    '''
    Convert NLI problems into three DataFrames that share the problem ids as index.

    data_dict maps problem id -> problem, where a problem provides the premise
    under 'p', the hypothesis under 'h' and the gold label under 'g'.
    sen2af maps sentence -> annotation/feature dict (looked up for both sentences).

    Returns (feat_df, lab_df, pair_df):
      feat_df  one row per problem, one column per feature of problem2features
      lab_df   one row per problem holding the gold label
      pair_df  one row per problem holding "premise ??? hypothesis"
    https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html
    '''
    features_by_pid = {}
    for pid, prob in tqdm(data_dict.items()):
        premise = prob['p']
        premise_anno = sen2af[premise]
        hypothesis = prob['h']
        features_by_pid[pid] = problem2features(
            premise, premise_anno, hypothesis, sen2af[hypothesis]
        )

    labels_by_pid = {pid: [prob['g']] for pid, prob in data_dict.items()}

    pairs_by_pid = {}
    for pid, prob in tqdm(data_dict.items()):
        pairs_by_pid[pid] = [f"{prob['p']} ??? {prob['h']}"]

    # The dicts are keyed by problem id, so transpose to get one row per problem
    feat_df = pd.DataFrame(features_by_pid).T
    # Align labels and sentence pairs with the row order of the feature frame
    lab_df = pd.DataFrame(labels_by_pid).T.reindex(feat_df.index)
    pair_df = pd.DataFrame(pairs_by_pid).T.reindex(feat_df.index)

    return feat_df, lab_df, pair_df


In [11]:
# Extend every sentence annotation with its single-sentence features, one dict per split
s2af = {sentence: sen2features(sentence, anno) for sentence, anno in tqdm(S2A_train.items())}
s2af_dev = {sentence: sen2features(sentence, anno) for sentence, anno in tqdm(S2A_dev.items())}
s2af_test = {sentence: sen2features(sentence, anno) for sentence, anno in tqdm(S2A_test.items())}


100%|██████████| 628489/628489 [00:58<00:00, 10671.78it/s]
100%|██████████| 12982/12982 [00:01<00:00, 9926.77it/s] 
100%|██████████| 12961/12961 [00:01<00:00, 10249.89it/s]


In [None]:
# Build the (features, gold labels, sentence pairs) DataFrames of each split.
# NOTE(review): problem2features runs the spaCy pipeline on both sentences of
# every problem, so the train split is presumably slow; consider caching the result.
feat_df, lab_df, pair_df = problems2df(SNLI['train'], s2af)
feat_df_dev, lab_df_dev, pair_df_dev = problems2df(SNLI['dev'], s2af_dev)
feat_df_test, lab_df_test, pair_df_test = problems2df(SNLI['test'], s2af_test)

In [None]:
# Concatenate labels, features and sentence pairs column-wise (in that order)
# NOTE(review): lab_df and pair_df each have a single column labelled 0, so the
# combined frame ends up with two columns of that name; consider renaming them.
data = pd.concat([lab_df, feat_df, pair_df], axis=1)
data_dev = pd.concat([lab_df_dev, feat_df_dev, pair_df_dev], axis=1)
data_test = pd.concat([lab_df_test, feat_df_test, pair_df_test], axis=1)

# save to csv files
# data.to_csv('./data_train.csv', header=True, index_label=0)
# data_dev.to_csv('./data_dev.csv', header=True, index_label=0)
# data_test.to_csv('./data_test.csv', header=True, index_label=0)