## Paraphrase Identification
#### Nick Reardon

In [1]:
import nltk
#nltk.download('wordnet')
#nltk.download('words')
from nltk.corpus import words
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer instance; normalize() and every detect_paraphrase() definition below call stemmer.stem().
stemmer = SnowballStemmer("english")

In [4]:
# Load the NLTK `words` corpus and show how many entries it holds.
# NOTE(review): `wds` is not referenced again in the visible cells.
wds = words.words()
len(wds)

236736

In [5]:
def normalize(string):
    """Strip punctuation from a phrase and return its stemmed, lower-cased tokens.

    Every character listed in ``remove`` is deleted, the text is lower-cased and
    split on runs of whitespace, and each token is passed through the
    module-level ``stemmer``.

    Splitting on any whitespace (rather than on a single space) means that
    doubled, leading or trailing spaces -- which commonly appear once
    punctuation such as " - " has been removed -- no longer produce
    empty-string tokens, and that tabs and newlines separate words too.

    Args:
        string: The phrase to tokenize.

    Returns:
        A list of stemmed tokens, in their original order. The list is empty
        when the phrase contains no word characters at all.
    """
    remove = "~!@#$%^&*()_+`-={}|[]\\:\";'<>?,.//*-+.'"

    # A single translate() pass deletes every listed character; duplicates in
    # `remove` are harmless.
    cleaned = string.translate(str.maketrans('', '', remove))

    return [stemmer.stem(token) for token in cleaned.lower().split()]

In [6]:
# Quick WordNet sanity check: print the lemma names of everything returned for one sample word.
word = 'anymore'

# NOTE(review): despite the loop variable's name, each item appears to be a synset (a group of lemmas), not a single synonym.
for synonym in wordnet.synsets(word):
    print(synonym.lemma_names())

['anymore', 'any_longer']


In [7]:
import pickle

In [8]:
# Load the pre-built synonym lookup (keyed by stemmed word) that the cells below index into.
# SECURITY: unpickling can execute arbitrary code -- only load this file if it comes from a trusted source.
# The `with` block makes sure the file handle is closed once loading finishes.
with open('English Synonyms.p', 'rb') as synonyms_file:
    synonyms = pickle.load(synonyms_file)

In [10]:
# What is Pattern Recognition? This sentence is the reference phrase.
original_phrase = "The ability to determine the statistical impracticality of randomness."

# Normalize it for ease of detecting a paraphrase.
# From here on `original_phrase` holds the list of tokens returned by normalize(), not the raw string.
original_phrase = normalize(original_phrase)

# What is another way to say what Pattern Recognition is? This is the candidate paraphrase.
paraphrase = "The power to ascertain the statistical unfeasability of chaos."

# Normalize the candidate the same way; `paraphrase` is a token list afterwards too.
paraphrase = normalize(paraphrase)

In [11]:
# Run the stemmer over both token lists so each phrase becomes a list of root forms we can compare statistically.
# NOTE(review): normalize() has already stemmed these tokens once, so this is a second stemming pass -- confirm that is intended.
root_phrase = list(map(stemmer.stem, original_phrase))
root_paraphrase = list(map(stemmer.stem, paraphrase))

### General EDA
#### Just taking a look at all of our (stemmed) synonym options

In [34]:
# List the synonym entries available for each token of the original phrase.
for u in root_phrase:
    # Only a missing key is treated as "no synonyms"; any other error should surface
    # instead of being silently swallowed.
    try:
        entries = synonyms[u]
    except KeyError:
        entries = None

    if entries:
        print(u+':', entries, "\n")
    else:
        # Token is absent from the dictionary or has an empty entry: show the bare token.
        print(u, "\n")

the 

abil: ['abil', 'power'] 

to 

determin: ['determin', 'determiningfactor', 'causalfactor', 'antigenicdetermin', 'epitop', 'decid', 'definit', 'find', 'purpos', 'decis', 'conclus', 'findout', 'ascertain', 'shape', 'mold', 'influenc', 'regul', 'set', 'specifi', 'defin', 'fix', 'limit', 'makeuponesmind', 'settl', 'squareoff', 'squareup', 'check', 'see', 'watch', 'learn', 'dictat', 'compuls', 'driven', 'unfalt', 'unshak', 'ambiti', 'clincher'] 

the 

statist: ['statist'] 

impract: ['impract', 'impractic', 'infeas', 'unfeas', 'unwork', 'airi', 'visionari', 'laputan', 'windi'] 

of 

random: ['random', 'randomis', 'indiscrimin', 'haphazard', 'willynilli', 'arbitrarili', 'atrandom', 'everywhichway', 'entropi', 's', 'stochast', 'nois'] 



In [36]:
# List the synonym entries available for each token of the candidate paraphrase.
for u in root_paraphrase:
    # Only a missing key is treated as "no synonyms"; any other error should surface
    # instead of being silently swallowed.
    try:
        entries = synonyms[u]
    except KeyError:
        entries = None

    if entries:
        print(u+':', entries, "\n")
    else:
        # Token is absent from the dictionary or has an empty entry: show the bare token.
        print(u, "\n")

the 

power: ['power', 'abil', 'offic', 'forc', 'expon', 'index', 'might', 'mighti', 'worldpow', 'majorpow', 'greatpow', 'superpow', 'baron', 'bigbusinessman', 'businesslead', 'king', 'magnat', 'mogul', 'topexecut', 'tycoon'] 

to 

ascertain: ['determin', 'find', 'findout', 'ascertain', 'see', 'check', 'insur', 'seetoit', 'ensur', 'control', 'assur', 'watch', 'learn', 'discover'] 

the 

statist: ['statist'] 

unfea: ['unfeas', 'infeas', 'unwork'] 

of 

chao: ['chao', 'pandemonium', 'bedlam', 'topsyturvydom', 'topsyturvy'] 



In [12]:
# Both token lists have length 9. That will not always be the case, but it keeps this demonstration simple.

print(len(root_phrase), len(root_paraphrase))

9 9


In [16]:
# Some tokens (e.g. the stem of "randomness") may have no usable entry in the synonym dictionary,
# so instead of demanding an exact match we count how many positions agree.
# A position counts as a match when the paraphrase token is listed as a synonym of the
# original token, or when the two tokens are identical.
n = 0
# zip() pairs the tokens position by position and simply stops at the shorter list,
# so a shorter paraphrase can no longer raise an IndexError.
for u, candidate in zip(root_phrase, root_paraphrase):
    if candidate in synonyms[u] or u == candidate:
        print("Match!")
        n += 1

Match!
Match!
Match!
Match!
Match!
Match!
Match!


In [17]:
# We've found 7 matches! That is 7 of the 9 token positions, which is the count we use below to score the pair.

n

7

In [19]:
# Score = matching positions divided by the number of tokens in the original phrase.
likelihood_of_paraphrase = n / len(root_phrase)

In [20]:
# Display the score computed in the previous cell.
likelihood_of_paraphrase

0.7777777777777778

In [21]:
# We need a cut-off above which a score is called a paraphrase of the original phrase.
# For now use a naive threshold of 75%. The comparison is strict, so a score of exactly 0.75 does not qualify.

if likelihood_of_paraphrase > .75:
    print('This is likely a paraphrase!')

This is likely a paraphrase!


#### Good news! This is likely a match! Let's treat it as if it is (as we already know that it is).
#### Now let's build a function that we can call on for any two inputs to see if we have a paraphrase or not!

In [76]:
# If you're going to import this function to a different script remember that you need the synonym dictionary as well as the normalize function

def detect_paraphrase(phrase_1, phrase_2):
    """Print a position-by-position paraphrase score for two phrases.

    Both phrases go through ``normalize`` and one more stemming pass. Each token
    of the first phrase is then compared with the token at the same index of the
    second phrase; a position matches when the second token is listed under the
    first token in ``synonyms`` or when the two tokens are equal. The score is
    the number of matches divided by the token count of the first phrase.

    Only the first ``len(phrase_1)`` positions are examined, so the second phrase
    must be at least as long as the first (otherwise indexing fails).

    Args:
        phrase_1: Reference phrase (raw string).
        phrase_2: Candidate phrase (raw string).

    Returns:
        None. The percentage is printed.
    """
    first_tokens = normalize(phrase_1)
    second_tokens = normalize(phrase_2)

    # NOTE(review): normalize() already stems; this is a second stemming pass.
    first = list(map(stemmer.stem, first_tokens))
    second = list(map(stemmer.stem, second_tokens))

    matches = 0
    for position, token in enumerate(first):
        counterpart = second[position]
        if counterpart in synonyms[token] or token == counterpart:
            matches += 1

    chance = matches / len(first)
    percent = round(chance, 2) * 100

    print('%.1f' % percent + "% likely to be a paraphrase...")

In [77]:
# Let's test this out with two three-word phrases that differ only in the verb ("love" vs. "like").

detect_paraphrase('I love you!', 'I like you!')

67.0% likely to be a paraphrase...


In [28]:
# Two unrelated sentences of equal length (five words each).
detect_paraphrase("Your parents are bad people.", "I think that you're funny!")

0.0% likely to be a paraphrase...


In [29]:
# Now, what happens when the phrases have different lengths (3 words vs. 7 words)?

detect_paraphrase("I love you!", "I just got back from the gym.")

33.0% likely to be a paraphrase...


In [92]:
# So we're seeing a 33% match here because the first phrase is of length 3 and the first word matches. Probably not a paraphrase.
# It's probably a good idea to make it more mathematical. Let's change up our detect_paraphrase function.

def detect_paraphrase(phrase_1, phrase_2):
    """Print a position-by-position paraphrase score, normalized by the longer phrase.

    Both phrases go through ``normalize`` and one more stemming pass. Tokens are
    then compared index by index: a position matches when the two tokens are
    identical, or when the ``phrase_2`` token is listed under the ``phrase_1``
    token in ``synonyms``. Positions that exist in only one phrase never match.
    The score is the number of matches divided by the length of the longer
    phrase, so extra words lower the score.

    A word missing from ``synonyms`` is treated as having no synonyms. Identical
    words therefore still match even when the dictionary does not know them.

    Args:
        phrase_1: Reference phrase (raw string).
        phrase_2: Candidate phrase (raw string).

    Returns:
        None. The percentage is printed.
    """
    phrase_1 = normalize(phrase_1)
    phrase_2 = normalize(phrase_2)

    # NOTE(review): normalize() already stems; this second pass is kept as-is.
    phrase_1 = [stemmer.stem(i) for i in phrase_1]
    phrase_2 = [stemmer.stem(i) for i in phrase_2]

    longest = max(len(phrase_1), len(phrase_2))

    n = 0
    # zip() stops at the shorter phrase; the unmatched tail still counts in `longest`.
    for left, right in zip(phrase_1, phrase_2):
        if left == right:
            n += 1
            continue
        # Only a missing dictionary entry is tolerated; other errors should surface.
        try:
            alternatives = synonyms[left]
        except KeyError:
            alternatives = ()
        if right in alternatives:
            n += 1

    # Nothing to compare means nothing matched; this also avoids dividing by zero.
    chance = n / longest if longest else 0.0

    print('%.1f' % (round(chance, 2) * 100) + "% likely to be a paraphrase...")

In [93]:
# Re-run the different-length example (3 vs. 7 words) with the redefined function.
detect_paraphrase("I love you!", "I just got back from the gym.")

14.0% likely to be a paraphrase...


In [98]:
# Now we're getting somewhere! Let's keep testing. Here only one word differs ("boy" vs. "kid").

detect_paraphrase("The boy hit the ball", "The kid hit the ball")

80.0% likely to be a paraphrase...


In [96]:
# Same length, two word substitutions ("might"/"may" and "like"/"fancy").
detect_paraphrase("You might like champagne", "You may fancy champagne")

50.0% likely to be a paraphrase...


In [97]:
# Similar meaning expressed with almost entirely different wording and a different word count (6 vs. 7).
detect_paraphrase("I think you're on to something!", "You're definitely headed in the right direction!")

0.0% likely to be a paraphrase...


In [99]:
# "try" appears in both phrases but at different positions, and the phrases differ in length (6 vs. 5 words).
detect_paraphrase("We can try to do it", "We are able to try")

33.0% likely to be a paraphrase...


In [123]:
# So here we see something that we have not yet accounted for. Just because a word is not at the same location as a word in another phrase does not mean that they are not paraphrases.
# Let's try something new and test it out.

# The version below drops the position-by-position comparison: every word of the longer phrase (phrase_2 on a tie) is looked for anywhere in the other phrase.
# The score is still the number of matching words divided by the length of the longer phrase.

def detect_paraphrase(phrase_1, phrase_2):
    """Print an order-insensitive estimate of how likely two phrases are paraphrases.

    Both phrases go through ``normalize`` and one more stemming pass. Every token
    of the longer phrase (``phrase_2`` on a tie) is then looked for in the other
    phrase. A token matches when the other phrase contains the token itself or
    at least one of the entries listed for it in ``synonyms``. The score is the
    number of matching tokens divided by the length of the longer phrase.

    A word missing from ``synonyms`` is treated as having no synonyms.

    Args:
        phrase_1: First phrase (raw string).
        phrase_2: Second phrase (raw string).

    Returns:
        None. The percentage is printed with one decimal place.
    """
    phrase_1 = normalize(phrase_1)
    phrase_2 = normalize(phrase_2)

    # NOTE(review): normalize() already stems; this second pass is kept as-is.
    phrase_1 = [stemmer.stem(i) for i in phrase_1]
    phrase_2 = [stemmer.stem(i) for i in phrase_2]

    # Walk the longer phrase and search the other one, so surplus words lower the score.
    if len(phrase_1) > len(phrase_2):
        longer, other = phrase_1, phrase_2
    else:
        longer, other = phrase_2, phrase_1

    n = 0
    for word in longer:
        if word in other:
            n += 1
            continue
        # Only a missing dictionary entry is tolerated; other errors should surface.
        try:
            alternatives = synonyms[word]
        except KeyError:
            alternatives = ()
        # Test each synonym individually: the list as a whole is never an
        # element of a token list, so `synonyms[word] in other` could not match.
        if any(alternative in other for alternative in alternatives):
            n += 1

    # Nothing to compare means nothing matched; this also avoids dividing by zero.
    chance = n / len(longer) if longer else 0.0

    print('%.1f' % (round(chance, 3) * 100) + "% likely to be a paraphrase...")

In [124]:
# Re-test with the order-insensitive version: both phrases have six words and share "we", "try", "to" and "it".
detect_paraphrase("We can try to do it", "We are able to try it")

66.7% likely to be a paraphrase...


In [125]:
# Phrases of very different length (7 vs. 3 words) whose only identical word is "we".
detect_paraphrase("We might be able to do it", "We are trying")

14.3% likely to be a paraphrase...


In [126]:
# This is acceptable... The first example is not present progressive while phrase_2 is...

In [127]:
# Phrases that differ in the verb ("working on" vs. "attempting"); 4 vs. 3 words.
detect_paraphrase("We're working on it", "We're attempting it")

50.0% likely to be a paraphrase...
