In [61]:
from __future__ import division
import os
import pickle
import codecs
from glob import glob
import re

In [62]:
DATA_SOURCE = './source'
SUSPICIOUS_DOCS = './test'

In [63]:
glob(os.path.join(DATA_SOURCE, '*.txt'))

['./source/orig_taskd.txt',
 './source/orig_taske.txt',
 './source/orig_taska.txt',
 './source/orig_taskb.txt',
 './source/orig_taskc.txt']

In [64]:
glob(os.path.join(SUSPICIOUS_DOCS, '*.txt'))

['./test/g0pC_taskc.txt',
 './test/g0pE_taskd.txt',
 './test/g0pE_taske.txt',
 './test/g0pC_taskb.txt',
 './test/g1pA_taske.txt',
 './test/g3pB_taskd.txt',
 './test/g3pB_taske.txt',
 './test/g0pC_taska.txt',
 './test/g1pA_taskd.txt',
 './test/g0pC_taske.txt',
 './test/g3pB_taska.txt',
 './test/g0pE_taskb.txt',
 './test/g0pE_taskc.txt',
 './test/g0pC_taskd.txt',
 './test/g1pA_taska.txt',
 './test/g1pA_taskc.txt',
 './test/g3pB_taskb.txt',
 './test/g0pE_taska.txt',
 './test/g3pB_taskc.txt',
 './test/g1pA_taskb.txt',
 './test/g2pA_taskb.txt',
 './test/g0pD_taskd.txt',
 './test/g0pB_taskc.txt',
 './test/g0pB_taskb.txt',
 './test/g0pD_taske.txt',
 './test/g2pA_taskc.txt',
 './test/g3pC_taskd.txt',
 './test/g2pA_taska.txt',
 './test/g0pB_taska.txt',
 './test/g3pC_taske.txt',
 './test/g3pC_taska.txt',
 './test/g2pA_taskd.txt',
 './test/g0pD_taskb.txt',
 './test/g0pB_taske.txt',
 './test/g0pB_taskd.txt',
 './test/g0pD_taskc.txt',
 './test/g2pA_taske.txt',
 './test/g3pC_taskb.txt',
 './test/g0p

### Preprocessing

In [65]:
from datasketch import MinHash, MinHashLSHForest
import nltk
from nltk.tokenize import wordpunct_tokenize
import pickle
import re

def tokenize_file(file_name: str) -> list:
    try:
        text = codecs.open(file_name).read()
    except UnicodeDecodeError:
        text = codecs.open(file_name, encoding='cp1252').read()
    return wordpunct_tokenize(text)

### LSH model

In [66]:
def create_minhash(text, n_gram, is_file=True, num_perm=128):
    """
    Generate minhash from single file or text given ngrams and number of permutation
    """
    if is_file:
        tokens = tokenize_file(text)
    else:
        tokens = wordpunct_tokenize(text)
    minhash = MinHash(num_perm=num_perm)
    for gram in nltk.ngrams(tokens, n_gram):
        minhash.update(" ".join(gram).encode('utf-8')) 
    return minhash

def create_lsh(source_dir, threshold=.3, n_gram=3, num_perm=128, max_num=10) -> MinHashLSH:
    """
    Create an LSH instance from text files in source directory given threshold, n-grams, and number of permutation
    Returns an LSH object and a list of source file numbers 
    """
    lsh = MinHashLSHForest(num_perm=num_perm)
    file_names = glob(os.path.join(source_dir, '*.txt'))[:max_num]
    keys = []
    minhashes = []
    for fname in file_names:
        minhash = create_minhash(fname, n_gram, num_perm=num_perm)
        keys.append(re.findall(r'task\w', fname)[0])
        minhashes.append(minhash)
    for key, minhash in zip(keys, minhashes):
        lsh.add(key, minhash)
    lsh.index()
    return lsh, keys


### Model Evaluation

In [67]:
def eval_single(lsh, text_dir, n_gram, num_perm=128, k=1):
    minhash = create_minhash(text_dir, n_gram, is_file=True, num_perm=num_perm)
    result = lsh.query(minhash, k)
    return result

def eval_list(sus_dir, lsh, n_gram, num_perm, k=1):
    results = {}
    for text_dir in glob(os.path.join(sus_dir, '*.txt')):
        results[re.findall("g[A-Za-z0-9_-]*", text_dir)[0]] = eval_single(lsh, text_dir, n_gram, num_perm=num_perm, k=k)
    return results
        
def find_accuracy(keys, results):
    correct = 0
    for doc in results:
        if len(results[doc])>0 and results[doc][0] in doc:
            correct += 1
    return correct/len(results)

### Testing

In [68]:
n_gram = 3
num_perm = 128
lsh, keys = create_lsh(DATA_SOURCE, n_gram=n_gram, max_num=10, num_perm=num_perm)
results = eval_list(SUSPICIOUS_DOCS, lsh, n_gram, num_perm)
find_accuracy(keys, results)

0.5052631578947369

#### Parameter Tuning

In [69]:

for n_gram in range(10):
    for num_perm in [16, 32, 64, 128, 256]:
        lsh, keys = create_lsh(DATA_SOURCE, n_gram=n_gram, max_num=10, num_perm=num_perm)
        results = eval_list(SUSPICIOUS_DOCS, lsh, n_gram, num_perm)
        acc = find_accuracy(keys, results)
        print(f'{n_gram} grams, accuracy: {acc}')


0 grams, accuracy: 0.2
0 grams, accuracy: 0.2
0 grams, accuracy: 0.2
0 grams, accuracy: 0.2
0 grams, accuracy: 0.2
1 grams, accuracy: 0.5789473684210527
1 grams, accuracy: 0.5052631578947369
1 grams, accuracy: 0.6842105263157895
1 grams, accuracy: 0.6
1 grams, accuracy: 0.6947368421052632
2 grams, accuracy: 0.49473684210526314
2 grams, accuracy: 0.5368421052631579
2 grams, accuracy: 0.5789473684210527
2 grams, accuracy: 0.5789473684210527
2 grams, accuracy: 0.5263157894736842
3 grams, accuracy: 0.49473684210526314
3 grams, accuracy: 0.5157894736842106
3 grams, accuracy: 0.4631578947368421
3 grams, accuracy: 0.5052631578947369
3 grams, accuracy: 0.5263157894736842
4 grams, accuracy: 0.3473684210526316
4 grams, accuracy: 0.43157894736842106
4 grams, accuracy: 0.45263157894736844
4 grams, accuracy: 0.42105263157894735
4 grams, accuracy: 0.42105263157894735
5 grams, accuracy: 0.35789473684210527
5 grams, accuracy: 0.3473684210526316
5 grams, accuracy: 0.37894736842105264
5 grams, accuracy: