In [1]:
from __future__ import division
import os
import pickle
import codecs
from glob import glob
import re

In [23]:
# Directory searched for the source (original) '*.txt' documents.
DATA_SOURCE = './source'
# Directory searched for the suspicious '*.txt' documents that are compared against the sources.
SUSPICIOUS_DOCS = './test'

In [24]:
# Show the source files that will be indexed (glob does not guarantee any ordering).
glob(os.path.join(DATA_SOURCE, '*.txt'))

['./source/orig_taskd.txt',
 './source/orig_taske.txt',
 './source/orig_taska.txt',
 './source/orig_taskb.txt',
 './source/orig_taskc.txt']

In [25]:
# Show the suspicious files that will be queried (glob does not guarantee any ordering).
glob(os.path.join(SUSPICIOUS_DOCS, '*.txt'))

['./test/g0pC_taskc.txt',
 './test/g0pE_taskd.txt',
 './test/g0pE_taske.txt',
 './test/g0pC_taskb.txt',
 './test/g1pA_taske.txt',
 './test/g3pB_taskd.txt',
 './test/g3pB_taske.txt',
 './test/g0pC_taska.txt',
 './test/g1pA_taskd.txt',
 './test/g0pC_taske.txt',
 './test/g3pB_taska.txt',
 './test/g0pE_taskb.txt',
 './test/g0pE_taskc.txt',
 './test/g0pC_taskd.txt',
 './test/g1pA_taska.txt',
 './test/g1pA_taskc.txt',
 './test/g3pB_taskb.txt',
 './test/g0pE_taska.txt',
 './test/g3pB_taskc.txt',
 './test/g1pA_taskb.txt',
 './test/g2pA_taskb.txt',
 './test/g0pD_taskd.txt',
 './test/g0pB_taskc.txt',
 './test/g0pB_taskb.txt',
 './test/g0pD_taske.txt',
 './test/g2pA_taskc.txt',
 './test/g3pC_taskd.txt',
 './test/g2pA_taska.txt',
 './test/g0pB_taska.txt',
 './test/g3pC_taske.txt',
 './test/g3pC_taska.txt',
 './test/g2pA_taskd.txt',
 './test/g0pD_taskb.txt',
 './test/g0pB_taske.txt',
 './test/g0pB_taskd.txt',
 './test/g0pD_taskc.txt',
 './test/g2pA_taske.txt',
 './test/g3pC_taskb.txt',
 './test/g0p

### Preprocessing

In [97]:
from datasketch import MinHash, MinHashLSH
import nltk
from nltk.tokenize import wordpunct_tokenize
import pickle
import re

def tokenize_file(file_name: str) -> list:
    """
    Read a text file and split it into word / punctuation tokens.

    The file is decoded as UTF-8 first; if that fails with a
    UnicodeDecodeError it is re-read as cp1252. Each attempt uses a
    ``with`` block so the file handle is always closed.

    file_name: path of the text file to read
    Returns the list produced by ``wordpunct_tokenize`` on the file content.
    Raises UnicodeDecodeError if the content is valid in neither encoding.
    """
    try:
        # Explicit encoding: do not depend on the platform's default locale.
        with open(file_name, encoding='utf-8') as handle:
            text = handle.read()
    except UnicodeDecodeError:
        with open(file_name, encoding='cp1252') as handle:
            text = handle.read()
    return wordpunct_tokenize(text)

### LSH model

In [104]:
def create_minhash(text, n_gram, is_file=True, num_perm=128):
    """
    Build a MinHash signature from the token n-grams of a file or of a raw string.

    text: path of the file to tokenize when is_file is true, otherwise the text itself
    n_gram: number of consecutive tokens in each n-gram
    is_file: whether `text` is a file path (default) or literal text
    num_perm: number of permutations passed to MinHash
    Returns the MinHash after it has been updated with every n-gram.
    """
    tokens = tokenize_file(text) if is_file else wordpunct_tokenize(text)
    signature = MinHash(num_perm=num_perm)
    # Every n-gram is fed in as its tokens joined by single spaces, UTF-8 encoded.
    shingles = (" ".join(gram) for gram in nltk.ngrams(tokens, n_gram))
    for shingle in shingles:
        signature.update(shingle.encode('utf-8'))
    return signature

def create_lsh(source_dir, threshold=.3, n_gram=3, num_perm=128, max_num=10) -> tuple:
    """
    Create an LSH index from the text files in a source directory.

    source_dir: directory searched for '*.txt' files
    threshold: threshold passed to MinHashLSH
    n_gram: number of tokens per n-gram used when hashing each file
    num_perm: number of permutations used for both the MinHashes and the index
    max_num: at most this many files (the first ones in sorted path order) are indexed

    Returns a tuple ``(lsh, keys)``: the populated MinHashLSH object and the list
    of keys inserted into it. Each key is the first 'task' + one word character
    match found in the file's base name (e.g. 'taska').
    Raises IndexError if a file's base name contains no such match.
    """
    lsh = MinHashLSH(num_perm=num_perm, threshold=threshold)
    # glob returns paths in arbitrary order; sort first so the [:max_num] cut
    # selects the same files on every run.
    file_names = sorted(glob(os.path.join(source_dir, '*.txt')))[:max_num]
    keys = []
    minhashes = []
    for fname in file_names:
        # Match on the base name only, so a directory component containing
        # 'task…' cannot be picked up as the key for every file.
        keys.append(re.findall(r'task\w', os.path.basename(fname))[0])
        minhashes.append(create_minhash(fname, n_gram, num_perm=num_perm))
    # NOTE(review): two files yielding the same key would collide in the index —
    # confirm that source file names are unique per task.
    with lsh.insertion_session() as session:
        for key, minhash in zip(keys, minhashes):
            session.insert(key, minhash)
    return lsh, keys


### Model Evaluation

In [134]:
def eval_single(lsh, text_dir, n_gram, num_perm=128):
    """
    Query the LSH index with the MinHash of a single file.

    lsh: object whose ``query`` method is called with the file's MinHash
    text_dir: path of the file to hash
    n_gram: number of tokens per n-gram
    num_perm: number of permutations for the MinHash
    Returns whatever ``lsh.query`` returns for that MinHash.
    """
    return lsh.query(
        create_minhash(text_dir, n_gram, is_file=True, num_perm=num_perm)
    )

def eval_list(sus_dir, lsh, n_gram, num_perm):
    """
    Query the LSH index with every '*.txt' file in a directory.

    sus_dir: directory searched for '*.txt' files
    lsh: index passed through to eval_single
    n_gram: number of tokens per n-gram
    num_perm: number of permutations for each MinHash

    Returns a dict mapping a document id to the result of eval_single for that
    file. The id is the first match of 'g' followed by letters, digits, '_' or
    '-' in the file's base name (e.g. 'g0pC_taskc' for 'g0pC_taskc.txt').
    Raises IndexError if a file's base name contains no 'g'.
    """
    results = {}
    # Sorted so the resulting dict has a reproducible order across runs.
    for text_dir in sorted(glob(os.path.join(sus_dir, '*.txt'))):
        # Match on the base name only: a 'g' anywhere in the directory part of
        # the path would otherwise become the id of every file.
        doc_id = re.findall("g[A-Za-z0-9_-]*", os.path.basename(text_dir))[0]
        results[doc_id] = eval_single(lsh, text_dir, n_gram, num_perm=num_perm)
    return results
        
def find_accuracy(keys, results):
    """
    Fraction of documents whose first returned match is contained in the document id.

    keys: not used by the computation; kept so existing calls keep working
    results: dict mapping a document id (string) to a list of matched keys

    A document counts as correct when its list is non-empty and its first
    element is a substring of the document id. Only the first element is
    looked at, so the score depends on the order of each list.
    Returns a float in [0, 1]; 0.0 when `results` is empty (instead of
    raising ZeroDivisionError).
    """
    if not results:
        return 0.0
    correct = sum(
        1
        for doc, matches in results.items()
        if len(matches) > 0 and matches[0] in doc
    )
    return correct / len(results)

### Testing

In [157]:
# Baseline configuration: 3-token n-grams, LSH threshold 0.05, 128 permutations.
n_gram = 3
jaccard_th = .05
num_perm = 128
# Index the source documents, query every suspicious document, then score the matches.
lsh, keys = create_lsh(DATA_SOURCE, n_gram=n_gram, threshold=jaccard_th, max_num=10, num_perm=num_perm)
results = eval_list(SUSPICIOUS_DOCS, lsh, n_gram, num_perm)
# Last expression of the cell: the accuracy is displayed as the cell output.
find_accuracy(keys, results)

0.6526315789473685

#### Parameter Tuning

In [159]:
# Grid search over n-gram size (2..9) and LSH threshold; num_perm is not set here
# and keeps whatever value was assigned before this cell runs.
# Note: n_gram, jaccard_th, lsh, keys and results are rebound on every iteration,
# so after this cell they hold the values of the last combination tried.
for n_gram in range(2, 10):
    for jaccard_th in [.0, .01, .02, .05, .1]:
        # The index is rebuilt from the source files for every (n_gram, threshold) pair.
        lsh, keys = create_lsh(DATA_SOURCE, n_gram=n_gram, threshold=jaccard_th, max_num=10, num_perm=num_perm)
        results = eval_list(SUSPICIOUS_DOCS, lsh, n_gram, num_perm)
        acc = find_accuracy(keys, results)
        print(f'{n_gram} grams, {jaccard_th} < jaccard, accuracy: {acc}')


2 grams, 0.0 < jaccard, accuracy: 0.2736842105263158
2 grams, 0.01 < jaccard, accuracy: 0.2736842105263158
2 grams, 0.02 < jaccard, accuracy: 0.28421052631578947
2 grams, 0.05 < jaccard, accuracy: 0.4842105263157895
2 grams, 0.1 < jaccard, accuracy: 0.5684210526315789
3 grams, 0.0 < jaccard, accuracy: 0.5789473684210527
3 grams, 0.01 < jaccard, accuracy: 0.5789473684210527
3 grams, 0.02 < jaccard, accuracy: 0.6631578947368421
3 grams, 0.05 < jaccard, accuracy: 0.6526315789473685
3 grams, 0.1 < jaccard, accuracy: 0.4105263157894737
4 grams, 0.0 < jaccard, accuracy: 0.5789473684210527
4 grams, 0.01 < jaccard, accuracy: 0.5789473684210527
4 grams, 0.02 < jaccard, accuracy: 0.5473684210526316
4 grams, 0.05 < jaccard, accuracy: 0.5263157894736842
4 grams, 0.1 < jaccard, accuracy: 0.3684210526315789
5 grams, 0.0 < jaccard, accuracy: 0.5578947368421052
5 grams, 0.01 < jaccard, accuracy: 0.5578947368421052
5 grams, 0.02 < jaccard, accuracy: 0.5578947368421052
5 grams, 0.05 < jaccard, accuracy: