# DynaMax-Jaccard

DynaMax-Jaccard is an unsupervised and non-parametric 
measure that dynamically extracts and max-pools good features to find the sentence representations depending on the sentence pair and similarity is measured using jaccard.

It shows that max-pooled word vectors are only a special case of fuzzy BoW and should be compared via fuzzy Jaccard index rather than cosine similarity.

In [17]:
# imports
from vocabulary import Vocabulary
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
smoother = SmoothingFunction()
from rouge.rouge import rouge_n_sentence_level # pip install easy-rouge
from scipy.stats import pearsonr

In [4]:
# imports for preprocessing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\d072726\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\d072726\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Pretrained word embeddings

- Download fasttext embeddings [here](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec)
- Download glove embeddings [here](http://nlp.stanford.edu/data/glove.840B.300d.zip)

Unzip the glove embeddings and save the embeddings in a folder pretrained_embeddings.

In [37]:
#fasttext = Vocabulary.from_embeddings('pretrained_embeddings/wiki.en.vec', pass_header=True) #pass_header, True for FastText
glove = Vocabulary.from_embeddings('pretrained_embeddings/glove.840B.300d.txt', pass_header=False) #False for Glove

### Load testsets for evaluation

The Automatically generated texts (predictions) from machine translation or text summarization are evaluated against their reference texts. <br> Below are the testsets to be used for evaluation. 

- For **DE-EN** translation, <br>  **reference-** 'testsets/de-en/test2016.en.atok'   **prediction-** 'testsets/de-en/multi30k.test.pred.en.atok' <br>


- For **RO-EN** translation, <br>  **reference-** 'testsets/ro-en/newstest2016_ref_1000.en'  **prediction-**- 'testsets/ro-en/newstest2016_output_1000.en'<br>


- For **giga word** summarization(titles), <br>  **reference-** 'testsets/giga/task1_ref0_giga_450.txt'  **prediction-**'testsets/giga/giga.10_300000_450.txt'


- For **CNN-DM** summariation, <br>  **reference-** 'testsets/cnn/preprocessed.ref'  **prediction-** 'testsets/cnn/preprocessed.pred'


- For **Duc 2003** summarization, <br>  **reference-** 'testsets/duc/task1_ref0_duc2003.txt'  **prediction-** 'testsets/duc/duc2003.10_300000.txt'

In [100]:
reference_doc = 'testsets/de-en/test2016.en.atok'
prediction_doc =  'testsets/de-en/multi30k.test.pred.en.atok'   

with open( reference_doc ,'r') as ref, open( prediction_doc ,'r') as pred:
    reference_en = ref.readlines()
    prediction_en = pred.readlines()

###  Optional preprocessing

In [101]:
def preprocessing(doc, stop_words_remove=False):
    remove_punctuation = []
    preprocessed_doc = []
    # keep only alphanumeric characters(remove punctuations)
    remove_punctuation = [re.sub(r"[^\w]", " ", sent).lower().strip() for sent in doc] 
    
    if stop_words_remove == True:
        # remove stop words requires lower cased tokens
        stop_words = set(stopwords.words("english"))
        for sent in doc:
            filtered_sentence = [word for word in word_tokenize(sent.lower()) if not word in stop_words]
            preprocessed_doc.append(' '.join(filtered_sentence))
        return preprocessed_doc
    else:
        return remove_punctuation  

In [102]:
# use only if you want to preprocess the sentences

reference_en = preprocessing(reference_en, True) # True to remove stopwords, default only removes punctuation
prediction_en = preprocessing(prediction_en, True)

### Semantic similarity scores

In [75]:
# sentence embeddings by aggregating the word embeddings
def document_vector(doc):
    doc_vec = []
    for word in doc.lower().split():
        if word in glove.word2id:  #glove for using glove embeddings
            doc_vec.append(glove.embeddings[glove.word2id[word]])               
    return doc_vec

In [103]:
ref_embedding = []
pred_embedding = []
for doc in reference_en:
    ref_embedding.append(np.array(document_vector(doc)))
for doc in prediction_en:
    pred_embedding.append(np.array(document_vector(doc)))

In [77]:
def fuzzify(s, u):
    """
    Sentence fuzzifier.
    Computes membership vector for the sentence S with respect to the
    universe U
    :param s: list of word embeddings for the sentence
    :param u: the universe matrix U with shape (K, d)
    :return: membership vectors for the sentence
    """
    f_s = np.dot(s, u.T)
    m_s = np.max(f_s, axis=0)
    m_s = np.maximum(m_s, 0, m_s)
    return m_s


def dynamax_jaccard(x, y):
    """
    DynaMax-Jaccard similarity measure between two sentences
    :param x: list of word embeddings for the first sentence
    :param y: list of word embeddings for the second sentence
    :return: similarity score between the two sentences
    """
    u = np.vstack((x, y))
    m_x = fuzzify(x, u)
    m_y = fuzzify(y, u)

    m_inter = np.sum(np.minimum(m_x, m_y))
    m_union = np.sum(np.maximum(m_x, m_y))
    return m_inter / m_union

In [104]:
similarity_dynamax =[]
for i in range(len(ref_embedding)):
    similarity_dynamax.append(dynamax_jaccard(ref_embedding[i], pred_embedding[i]))

### BLEU or ROUGE scores

Use BLEU scores for machine translation evaluation and ROUGE for text summarization evaluation.

In [None]:
# for machine translation evaluation
bleu_scores =[]
for i in range(len(reference_en)):
    bleu_scores.append(sentence_bleu(reference_en[i],prediction_en[i], smoothing_function=smoother.method4))

In [None]:
# for text summarization evaluation
rouge_scores = []
for i in range(len(reference_en)):
    *pr, f = rouge_n_sentence_level(prediction_en[i], reference_en[i], 2) # 2 for ROUGE-2. ROUGE-N, ROUGE-L and ROUGE-W scores can also be obtained.
    rouge_scores.append(f)

### Human annotation scores

Load the human annotation scores from the respective excel files as below,

- For **DE-EN** translation, 'human annotated/DE-EN.xlsx'


- For **RO-EN** translation, 'human annotated/RO-EN.xlsx'


- For **giga word** summarization(titles),'human annotated/giga.xlsx'


- For **CNN-DM** summariation, 'human annotated/CNN_900.xlsx'


- For **Duc 2003** summarization,  'human annotated/duc2003.xlsx'

In [93]:
human_annotation = pd.read_excel('human annotated/DE-EN.xlsx')

In [94]:
human_scores = human_annotation.iloc[:, 3].tolist()

### Pearson correlation coefficient

In [None]:
# correlation between human annotated scores and Bleu or ROUGE scores

#pearson correlation value, p-value
pearsonr(human_scores, bleu_scores) #bleu_scores or rouge_scores

In [105]:
# correlation between human annotated scores and semantic similarity scores

pearsonr(human_scores, similarity_dynamax) # expected to be higher(more correlated) than with Bleu or ROUGE scores

(0.6618757852238534, 4.663116473627758e-127)