In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn

In [7]:
def convert_tag(tag):
    """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets.

    Only the first character of ``tag`` is inspected: 'N' -> 'n', 'J' -> 'a',
    'R' -> 'r', 'V' -> 'v'.

    Args:
        tag: Part-of-speech tag string (e.g. 'NN', 'VBD').

    Returns:
        The single-letter WordNet tag, or None when the tag is empty/None or
        its first character has no mapping.
    """
    # An empty or missing tag has no first character to look up; treat it
    # like any other unmapped tag instead of raising.
    if not tag:
        return None

    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    return tag_dict.get(tag[0])


In [29]:
def doc_to_synsets(s1):
    """Return one WordNet synset per token of ``s1`` that has any.

    The text is tokenized and part-of-speech tagged with nltk; each token is
    looked up in WordNet using its converted tag, and the first synset of the
    lookup is kept. Tokens whose lookup returns nothing are skipped.

    Args:
        s1: The document text.

    Returns:
        List of synsets, in token order.
    """
    tokens = nltk.word_tokenize(s1)

    synsets = []
    for word, pos_tag in nltk.pos_tag(tokens):
        candidates = wn.synsets(word, convert_tag(pos_tag))
        # Keep only the first synset of each lookup; skip empty lookups.
        if len(candidates) > 0:
            synsets.append(candidates[0])
    return synsets

In [30]:
# Build synset lists for two short example sentences; they are passed to
# similarity_score in a later cell as a quick check.
s1 = doc_to_synsets("I like dogs")
s2 = doc_to_synsets("I like cats")

In [33]:
def similarity_score(s1,s2):
    """Compute a normalized, directional similarity score of s1 onto s2.

    For each synset in ``s1``, the largest ``path_similarity`` value against
    any synset in ``s2`` is taken (``None`` results are ignored). Synsets in
    ``s1`` with no non-None score contribute nothing. The result is the mean
    of the collected best scores.

    Args:
        s1: Iterable of synsets (objects exposing ``path_similarity``).
        s2: Iterable of synsets to compare against.

    Returns:
        The mean of the best scores, or 0.0 when no synset in ``s1`` has a
        comparable synset in ``s2`` (including when either input is empty).
    """
    best_scores = []
    for synset in s1:
        # path_similarity may yield None; those pairs are not comparable.
        scores = [score
                  for score in (synset.path_similarity(other) for other in s2)
                  if score is not None]
        if scores:
            best_scores.append(max(scores))

    # Nothing was comparable: report no similarity rather than dividing by zero.
    if not best_scores:
        return 0.0
    return sum(best_scores) / len(best_scores)


In [34]:
# Directional score: for each synset in s1, take its best path similarity
# against s2, then average those best values.
similarity_score(s1, s2)

0.7333333333333334

In [36]:
def document_path_similarity(doc1, doc2):
    """Return the symmetric path similarity between two documents.

    Both documents are converted to synset lists; the directional
    similarity is computed in each direction and the two values are
    averaged.
    """
    first, second = doc_to_synsets(doc1), doc_to_synsets(doc2)

    forward = similarity_score(first, second)
    backward = similarity_score(second, first)
    return (forward + backward) / 2

In [45]:
# Load the paraphrase pairs. most_similar_docs and label_accuracy read this
# module-level DataFrame directly rather than taking it as an argument.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index -- confirm whether it should be read as the index instead.
paraphrases = pd.read_csv("paraphrases.csv")

In [46]:
# Preview the first rows; the columns used later are Quality, D1 and D2.
paraphrases.head()

Unnamed: 0,Quality,D1,D2
0,1,"Ms Stewart, the chief executive, was not expec...","Ms Stewart, 61, its chief executive officer an..."
1,1,After more than two years' detention under the...,After more than two years in detention by the ...
2,1,"""It still remains to be seen whether the reven...","""It remains to be seen whether the revenue rec..."
3,0,"And it's going to be a wild ride,"" said Allan ...","Now the rest is just mechanical,"" said Allan H..."
4,1,The cards are issued by Mexico's consulates to...,The card is issued by Mexico's consulates to i...


In [47]:
def most_similar_docs():
    """Return the most similar document pair in ``paraphrases``.

    Every row's D1/D2 pair is scored with document_path_similarity, and the
    pair with the highest score is returned as ``(D1, D2, score)``.
    """
    def score_pair(row):
        # Bundle the two texts with their symmetric similarity.
        return row['D1'], row['D2'], document_path_similarity(row['D1'], row['D2'])

    scored_pairs = (score_pair(row) for _, row in paraphrases.iterrows())
    return max(scored_pairs, key=lambda triple: triple[2])
    


In [48]:
# Yields the (D1, D2, score) triple with the highest symmetric path similarity.
most_similar_docs()


('"Indeed, Iran should be put on notice that efforts to try to remake Iraq in their image will be aggressively put down," he said.',
 '"Iran should be on notice that attempts to remake Iraq in Iran\'s image will be aggressively put down," he said.\n',
 0.9753086419753086)

In [52]:
def label_accuracy():
    """Return the accuracy of the similarity-based paraphrase labels.

    Each row of ``paraphrases`` is passed through update_score, which adds a
    predicted ``paraphrase`` column; those predictions are compared with the
    ``Quality`` column using sklearn's accuracy_score.
    """
    from sklearn.metrics import accuracy_score

    labeled = paraphrases.apply(update_score, axis=1)
    y_true = labeled['Quality'].tolist()
    y_pred = labeled['paraphrase'].tolist()
    return accuracy_score(y_true, y_pred)
    
def update_score(row):
    """Attach a similarity score and a predicted paraphrase label to ``row``.

    Sets ``row['similarity_score']`` to the symmetric path similarity of the
    row's D1 and D2 texts, and ``row['paraphrase']`` to 1 when that score is
    greater than 0.75, otherwise 0. The same ``row`` object is returned.
    """
    row['similarity_score'] = document_path_similarity(row['D1'], row['D2'])
    is_paraphrase = row['similarity_score'] > 0.75
    row['paraphrase'] = 1 if is_paraphrase else 0
    return row

In [53]:
# NOTE(review): label_accuracy returns the value of accuracy_score, but the
# output saved under this cell is a DataFrame preview -- it looks stale from
# an earlier version of the function; re-run on a fresh kernel to confirm.
label_accuracy()

Unnamed: 0,Quality,D1,D2,similarity_score,paraphrase
0,1,"Ms Stewart, the chief executive, was not expec...","Ms Stewart, 61, its chief executive officer an...",0.671682,0
1,1,After more than two years' detention under the...,After more than two years in detention by the ...,0.900198,1
2,1,"""It still remains to be seen whether the reven...","""It remains to be seen whether the revenue rec...",0.856672,1
3,0,"And it's going to be a wild ride,"" said Allan ...","Now the rest is just mechanical,"" said Allan H...",0.786176,1
4,1,The cards are issued by Mexico's consulates to...,The card is issued by Mexico's consulates to i...,0.929902,1
5,1,Their difference was over whether the court sh...,Their difference was over whether the court sh...,0.84318,1
6,1,The only announced Republican to replace Davis...,So far the only declared major party candidate...,0.802245,1
7,1,"Druce will face murder charges, Conte said.",Conte said Druce will be charged with murder.\n,0.670758,0
8,0,"""It's a major victory for Maine, and it's a ma...",The Maine program could be a model for other s...,0.633661,0
9,1,Microsoft said Friday that it is halting devel...,Microsoft will stop developing versions of its...,0.761542,1
