In [65]:
from __future__ import division
import os
import pickle
import codecs
from glob import glob
import re

In [66]:
DATA_SOURCE = './data/source'
SUSPICIOUS_DOCS = './data/suspicious_doc'

### Preprocessing

In [67]:
from datasketch import MinHash, MinHashLSH
import nltk
from nltk.tokenize import wordpunct_tokenize
import pickle
import re

def tokenize_file(file_name: str) -> list:
    return wordpunct_tokenize(codecs.open(file_name).read())

### LSH model

In [68]:
def create_minhash(text, n_gram, is_file=True, num_perm=128):
    """
    Generate minhash from single file or text given ngrams and number of permutation
    """
    if is_file:
        tokens = tokenize_file(text)
    else:
        tokens = wordpunct_tokenize(text)
    minhash = MinHash(num_perm=num_perm)
    for gram in nltk.ngrams(tokens, n_gram):
        minhash.update(" ".join(gram).encode('utf-8')) 
    return minhash

def create_lsh(source_dir, threshold=.3, n_gram=3, num_perm=128, max_num=10) -> MinHashLSH:
    """
    Create an LSH instance from text files in source directory given threshold, n-grams, and number of permutation
    Returns an LSH object and a list of source file numbers 
    """
    lsh = MinHashLSH(num_perm=num_perm, threshold=threshold)#, 
                   #storage_config={'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379, 'db': 1},'name': 1})
    file_names = glob(os.path.join(source_dir, '*.txt'))[:max_num]
    keys = []
    minhashes = []
    for fname in file_names:
        minhash = create_minhash(fname, n_gram, num_perm=num_perm)
        keys.append(re.findall(r'\d+', fname)[0])
        minhashes.append(minhash)
    with lsh.insertion_session() as session:
        for key, minhash in zip(keys, minhashes):
            session.insert(key, minhash)
    return lsh, keys

def save_lsh(lsh, model_dir):
    """
    Dump lsh model to pickle in storage for later use
    """
    pass

def load_lsh(model_dir):
    """
    Load pickled LSH model to memory
    """
    pass


#### Testing

In [74]:
lsh, keys = create_lsh(DATA_SOURCE, threshold=.0, max_num=10)

### Model Evaluation

In [75]:
def eval_single(lsh, text, n_gram, num_perm):
    minhash = create_minhash(text, n_gram, is_file=True, num_perm=128)
    result = lsh.query(minhash)
    return result

def eval_list():
    pass

In [80]:
text = "./data/suspicious/suspicious-document00005.txt"
eval_single(lsh, text, 3, 128) 

['00590', '01842', '01671', '01117']