In [1]:
import mmh3
from nltk import ngrams
# from shingles.util import generate_random_seeds, jaccard_similarity

class ShingledText:
    """MinHash signature of a text, built from word-level shingles.

    The text is split on whitespace, every run of ``shingle_length``
    consecutive words becomes one shingle, and for each hash seed the
    smallest ``mmh3.hash`` value over all shingles is recorded.

    Attributes:
        shingles: list of the shingles produced by ``ngrams``, in text order.
        minhash: list with one minimum hash value per seed, in the order the
            seeds are returned by ``generate_random_seeds``.
    """

    def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        """Shingle ``text`` and compute its MinHash signature.

        Args:
            text: input string; it is tokenised with ``str.split()``.
            random_seed: forwarded to ``generate_random_seeds``.
            shingle_length: number of consecutive words per shingle.
            minhash_size: number of hash seeds, i.e. the signature length.

        Raises:
            ValueError: if ``text`` has fewer than ``shingle_length`` words.
        """
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))

        # ngrams() hands back a one-shot iterator (per the NLTK docs), so it is
        # materialised here; otherwise the attribute would be empty after the
        # first time anybody iterates over it.
        self.shingles = list(ngrams(split_text, shingle_length))

        # The joined text of each shingle does not depend on the hash seed,
        # so build it once instead of once per seed.
        shingle_keys = [' '.join(shingle) for shingle in self.shingles]

        # One minimum per seed. The default mirrors the starting value of a
        # running minimum should no shingles be produced at all.
        self.minhash = [
            min((mmh3.hash(key, hash_seed) for key in shingle_keys),
                default=float('inf'))
            for hash_seed in generate_random_seeds(minhash_size, random_seed)
        ]

    def similarity(self, other_shingled_text):
        """Return the Jaccard similarity of the two signatures' value sets.

        Both ``minhash`` lists are converted to sets before being passed to
        ``jaccard_similarity``, so position and duplicates are ignored.

        NOTE(review): the usual MinHash estimate is the fraction of signature
        positions that agree (k / n). Comparing the values as sets yields
        roughly k / (2n - k) instead, which is lower for partial matches.
        Confirm whether this is intended before relying on the absolute value.

        Args:
            other_shingled_text: another object exposing a ``minhash`` list.

        Returns:
            Whatever ``jaccard_similarity`` returns for the two sets.
        """
        return jaccard_similarity(set(self.minhash),
                set(other_shingled_text.minhash))


In [2]:
import random

def generate_random_seeds(n, seed=5):
    """Return the integers 1..n in a reproducible shuffled order.

    Because all ``n`` values of ``range(1, n + 1)`` are sampled without
    replacement, the result is always a permutation of 1..n; ``seed`` only
    decides the order.

    Args:
        n: how many seeds to produce.
        seed: seed for the shuffling generator.

    Returns:
        A list of ``n`` distinct integers between 1 and ``n`` inclusive.
    """
    # A private generator yields the same sequence as seeding the module-level
    # one, but leaves the global random state alone, so calling this helper
    # does not silently reseed every other user of ``random``.
    rng = random.Random(seed)
    return rng.sample(range(1, n + 1), n)

def jaccard_similarity(set_a, set_b):
    """Return |A ∩ B| / |A ∪ B| for two collections of hashable items.

    Args:
        set_a: first collection; any iterable of hashables is accepted and
            duplicates are ignored.
        set_b: second collection, same requirements.

    Returns:
        A float between 0.0 and 1.0. Two empty collections are treated as
        identical and give 1.0 rather than raising ``ZeroDivisionError``.
    """
    # Normalise to real sets so lists, tuples or frozensets work as well.
    items_a, items_b = set(set_a), set(set_b)
    union_size = len(items_a | items_b)
    if union_size == 0:
        # Both inputs are empty: nothing differs, so report full similarity.
        return 1.0
    return len(items_a & items_b) / union_size

In [3]:
# Baseline sample text: a single Lorem ipsum paragraph.
paragraph = u"""
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""

# Same sentences as `paragraph`, with the last two sentences moved to the
# front. It also differs in two words ("aliqual" and a truncated final
# "conseua").
# NOTE(review): unclear whether those two differences are deliberate.
rearranged_paragraph = u"""
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqual. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo conseua
"""

# English prose that shares no sentences with `paragraph`; serves as the
# dissimilar sample.
another_paragraph = u"""
Apparently she had forgotten her age and by force of habit employed all
the old feminine arts. But as soon as the prince had gone her face
resumed its former cold, artificial expression. She returned to the
group where the vicomte was still talking, and again pretended to
listen, while waiting till it would be time to leave. Her task was
accomplished.
"""

In [4]:
# Build a signature for each sample text with identical settings
# (seed 5, 5-word shingles, 20 hash seeds).
shingled_text = ShingledText(
    paragraph, random_seed=5, shingle_length=5, minhash_size=20)
rearranged_shingled_text = ShingledText(
    rearranged_paragraph, random_seed=5, shingle_length=5, minhash_size=20)
shingled_diff_text = ShingledText(
    another_paragraph, random_seed=5, shingle_length=5, minhash_size=20)

In [5]:
# Similarity between the Lorem ipsum paragraph and its reordered variant.
shingled_text.similarity(rearranged_shingled_text)

0.6

In [6]:
# Similarity between the Lorem ipsum paragraph and the unrelated passage.
shingled_text.similarity(shingled_diff_text)

0.0

In [16]:
# First of two differently worded versions of the same passage.
txt="""Long ago, when there was no written history, these islands were the home of millions of happy birds; 
the resort of a hundred times more millions of fishes, sea lions, and other creatures. Here lived 
innumerable creatures predestined from the creation of the world to lay up a store of wealth for the 
British farmer, and a store of quite another sort for an immaculate Republican government."""

In [17]:
# Second version of the passage: the opening and middle are worded
# differently, while the closing clause matches word for word.
txt2="""In ages which have no record these islands were the home of millions of happy birds, the resort of a hundred 
times more millions of fishes, of sea lions, and other creatures whose names are not so common; the marine residence, 
in fact, of innumerable creatures predestined from the creation of the world to lay up a store of wealth for the
British farmer, and a store of quite another sort for an immaculate Republican government"""

In [18]:
# Signature of `txt` (seed 5, 5-word shingles, 20 hash seeds).
shingled_text = ShingledText(
    txt, random_seed=5, shingle_length=5, minhash_size=20)

In [19]:
# Signature of `txt2` (seed 5, 5-word shingles, 20 hash seeds).
rearranged_shingled_text = ShingledText(
    txt2, random_seed=5, shingle_length=5, minhash_size=20)

In [20]:
# Similarity between `txt` and `txt2`. Both names were rebound in the two
# preceding cells, so they no longer refer to the Lorem ipsum signatures.
shingled_text.similarity(rearranged_shingled_text)

0.25