# Document Similarity

In [2]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd


def convert_tag(tag):
    """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets

    Only the first character of the tag is inspected: 'N' -> 'n', 'J' -> 'a',
    'R' -> 'r', 'V' -> 'v'.

    Args:
        tag: part-of-speech tag string (may be empty or None)

    Returns:
        the matching WordNet tag, or None when the tag is empty/None or its
        first character has no WordNet counterpart
    """

    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}

    # Guard first: indexing an empty tag would raise IndexError, not KeyError.
    if not tag:
        return None
    return tag_dict.get(tag[0])


def doc_to_synsets(doc):
    """
    Returns a list of synsets in document.

    Tokenizes and tags the words in the document doc.
    Then finds the first synset for each word/tag combination.
    If a synset is not found for that combination it is skipped.

    Args:
        doc: string to be converted

    Returns:
        list of synsets

    Example:
        doc_to_synsets('Fish are nvqjp friends.')
        Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]
    """

    tokens = nltk.word_tokenize(doc)

    synsets = []
    for word, pos in nltk.pos_tag(tokens):
        # Look each word up once; convert_tag may yield None for the tag.
        candidates = wn.synsets(word, convert_tag(pos))
        if candidates:
            synsets.append(candidates[0])

    return synsets


def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    For each synset in s1, finds the synset in s2 with the largest path
    similarity. The largest values are summed and divided by the number of
    largest values found, i.e. synsets of s1 for which path_similarity
    returned None against every synset of s2 are left out of the average.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2, or 0.0 when no synset of
        s1 could be compared with any synset of s2 (for example when either
        list is empty)

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """

    best_matches = []
    for k1 in s1:
        # Evaluate every pair exactly once; None means "not comparable".
        scores = [score
                  for score in (k1.path_similarity(k2) for k2 in s2)
                  if score is not None]
        if scores:
            best_matches.append(max(scores))

    if not best_matches:
        # np.mean([]) would emit a RuntimeWarning and return nan.
        return 0.0

    return np.mean(best_matches)


def document_path_similarity(doc1, doc2):
    """Return the symmetric path similarity of doc1 and doc2.

    Both documents are converted to synset lists with doc_to_synsets, scored
    in each direction with similarity_score, and the two directional scores
    are averaged.
    """

    first_synsets = doc_to_synsets(doc1)
    second_synsets = doc_to_synsets(doc2)

    forward = similarity_score(first_synsets, second_synsets)
    backward = similarity_score(second_synsets, first_synsets)

    return (forward + backward) / 2

### test_document_path_similarity

Use this function to check whether `doc_to_synsets` and `similarity_score` are correct.

*This function should return the similarity score as a float.*

In [3]:
def test_document_path_similarity():
    """Score a fixed pair of sentences with document_path_similarity."""
    first_doc = 'This is a function to test document_path_similarity.'
    # The run of spaces before 'and' is part of the original test string.
    second_doc = ('Use this function to see if your code in doc_to_synsets '
                  '    and similarity_score is correct!')
    return document_path_similarity(first_doc, second_doc)
test_document_path_similarity() # expected: 0.55426587301587305

0.55426587301587305

In [4]:
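# One-time setup: uncomment to fetch the NLTK data (tokenizer, POS tagger, WordNet) if it is missing.
#nltk.download()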

In [5]:
# Test case 2: directional score of 'I like cats' onto 'I like dogs'.
# Expected output: 0.73333333333333339

synsets1 = doc_to_synsets('I like cats')
synsets2 = doc_to_synsets('I like dogs')
similarity_score(synsets1, synsets2)

0.73333333333333339

In [6]:
# Test case 3: break the document similarity down into its two directional scores.
doc1 = 'This is a function to test document_path_similarity.'
doc2 = 'Use this function to see if your code in doc_to_synsets \
and similarity_score is correct!'
synsets1 = doc_to_synsets(doc1)
synsets2 = doc_to_synsets(doc2)
print("synsets1", synsets1) # a list with 4 elements
print("synsets2", synsets2) # a list with 7 elements
s1s2_score = similarity_score(synsets1, synsets2)
s2s1_score = similarity_score(synsets2, synsets1)
print("s1s2_score", s1s2_score) # expected: 0.6125
print("s2s1_score", s2s1_score) # expected: ~0.496032
print("s1 s2 doc similarity score", (s1s2_score + s2s1_score) / 2) # expected: ~0.554266 (mean of the two scores above)

synsets1 [Synset('be.v.01'), Synset('angstrom.n.01'), Synset('function.n.01'), Synset('test.v.01')]
synsets2 [Synset('use.v.01'), Synset('function.n.01'), Synset('see.v.01'), Synset('code.n.01'), Synset('inch.n.01'), Synset('be.v.01'), Synset('correct.a.01')]
s1s2_score 0.6125
s2s1_score 0.496031746032
s1 s2 doc similarity score 0.554265873016


<br>
___
`paraphrases` is a DataFrame which contains the following columns: `Quality`, `D1`, and `D2`.

`Quality` is an indicator variable: 1 if the two documents `D1` and `D2` are paraphrases of one another, 0 if they are not.

In [8]:
# Load the labelled document pairs; most_similar_docs and label_accuracy both read this DataFrame.
paraphrases = pd.read_csv('paraphrases.csv')
paraphrases.head()

Unnamed: 0,Quality,D1,D2
0,1,"Ms Stewart, the chief executive, was not expec...","Ms Stewart, 61, its chief executive officer an..."
1,1,After more than two years' detention under the...,After more than two years in detention by the ...
2,1,"""It still remains to be seen whether the reven...","""It remains to be seen whether the revenue rec..."
3,0,"And it's going to be a wild ride,"" said Allan ...","Now the rest is just mechanical,"" said Allan H..."
4,1,The cards are issued by Mexico's consulates to...,The card is issued by Mexico's consulates to i...


___

### most_similar_docs

Using `document_path_similarity`, find the pair of documents in `paraphrases` which has the maximum similarity score.

*This function should return a tuple `(D1, D2, similarity_score)`*

In [9]:
def most_similar_docs():
    """Find the pair of documents in `paraphrases` with the highest similarity.

    Scores every (D1, D2) row with document_path_similarity and picks the row
    whose score is largest. The scores are kept in a local Series, so the
    shared `paraphrases` DataFrame is left unmodified.

    Returns:
        tuple (D1, D2, similarity_score) for the best-scoring row
    """

    scores = pd.Series(
        [document_path_similarity(d1, d2)
         for d1, d2 in zip(paraphrases['D1'], paraphrases['D2'])],
        index=paraphrases.index,
        dtype=float,
    )
    best = scores.idxmax()

    # Select by column name rather than by position so the result does not
    # depend on the column order of the DataFrame.
    return (paraphrases.at[best, 'D1'], paraphrases.at[best, 'D2'], scores.at[best])
most_similar_docs() #0.97530864197530864

('"Indeed, Iran should be put on notice that efforts to try to remake Iraq in their image will be aggressively put down," he said.',
 '"Iran should be on notice that attempts to remake Iraq in Iran\'s image will be aggressively put down," he said.\n',
 0.97530864197530864)

### label_accuracy

Provide labels for the twenty pairs of documents by computing the similarity of each pair using `document_path_similarity`. Label a pair as a paraphrase (1) if its score is greater than 0.75, and as not a paraphrase (0) otherwise. Report the accuracy of this classifier using scikit-learn's `accuracy_score`.

*This function should return a float.*

In [10]:
def label_accuracy(threshold=0.75):
    """Accuracy of a threshold classifier built on document_path_similarity.

    Each (D1, D2) row of `paraphrases` is labelled 1 (paraphrase) when its
    similarity score is strictly greater than `threshold`, else 0, and the
    labels are compared with the `Quality` column. Scores and predictions are
    kept in local arrays, so the shared `paraphrases` DataFrame is left
    unmodified.

    Args:
        threshold: score above which a pair is labelled a paraphrase
            (default 0.75)

    Returns:
        accuracy as computed by sklearn.metrics.accuracy_score
    """
    from sklearn.metrics import accuracy_score

    scores = np.array(
        [document_path_similarity(d1, d2)
         for d1, d2 in zip(paraphrases['D1'], paraphrases['D2'])],
        dtype=float,
    )
    predictions = np.where(scores > threshold, 1, 0)

    return accuracy_score(paraphrases["Quality"], predictions)
label_accuracy() #0.80000000000000004

0.80000000000000004