Skip to content

Commit

Permalink
Added implementation of locality-sensitive hashing for near-duplicate detection
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed Apr 27, 2017
1 parent b2fd786 commit 9e35f55
Show file tree
Hide file tree
Showing 8 changed files with 1,546 additions and 0 deletions.
68 changes: 68 additions & 0 deletions src/main/java/focusedCrawler/minhash/DuplicatePageIndexer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package focusedCrawler.minhash;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import focusedCrawler.util.Tokenizers;
import focusedCrawler.util.Tokenizers.ShingleTokenizer;

/**
 * Indexes page text using minhash signatures and locality-sensitive hashing
 * so that near-duplicate pages (Jaccard similarity above the configured
 * threshold over word shingles) can be detected efficiently.
 */
public class DuplicatePageIndexer {

    public static final Logger logger = LoggerFactory.getLogger(DuplicatePageIndexer.class);

    // Minhash/LSH parameters: 256 hash functions, 9-word shingles, and a
    // 0.9 Jaccard similarity threshold for considering pages near-duplicates.
    private final int numHashes = 256;
    private final double jaccardThreshold = 0.9d;
    private final int numberOfShingles = 9;

    private final ShingleTokenizer shinglesTokenizer = Tokenizers.shingles(numberOfShingles);
    private final MinHasher hasher = new MinHasher(numHashes);
    private final LSH lsh;

    /** Creates an indexer backed by in-memory LSH storage. */
    public DuplicatePageIndexer() {
        this.lsh = new LSH(numHashes, jaccardThreshold);
    }

    /** Creates an indexer whose LSH buckets are persisted under the given path. */
    public DuplicatePageIndexer(String dataPath) {
        this.lsh = new LSH(numHashes, jaccardThreshold, dataPath);
    }

    /**
     * Indexes the given page text under the given id. Texts that fail shingle
     * tokenization are skipped (a warning is logged by the signature step).
     */
    public void insert(String id, String text) throws Exception {
        int[] signatures = computeMinHashSignatures(text);
        // An empty signature means tokenization failed; passing it on would
        // cause an out-of-bounds access inside LSH.insert.
        if (signatures.length > 0) {
            lsh.insert(id, signatures);
        }
    }

    /** Returns true if at least one previously indexed page is a near-duplicate. */
    public boolean isNearDuplicate(String text) {
        return !findNearDuplicates(text).isEmpty();
    }

    /**
     * Returns the ids of all indexed pages that share at least one LSH band
     * with the given text (i.e., candidate near-duplicates).
     */
    public Set<String> findNearDuplicates(String text) {
        int[] signatures = computeMinHashSignatures(text);
        if (signatures.length == 0) {
            // Tokenization failed; report no duplicates rather than querying
            // LSH with an empty signature (which would throw).
            return java.util.Collections.emptySet();
        }
        return lsh.query(signatures);
    }

    /**
     * Computes the minhash signature of the text's hashed shingle set.
     * Returns an empty array when tokenization fails.
     */
    private int[] computeMinHashSignatures(String cleanText) {
        try {
            Set<Integer> hashedShingles = shinglesTokenizer.hashedTokenSet(cleanText);
            return hasher.minHashSignature(hashedShingles);
        } catch (Exception e) {
            logger.warn("Failed to parse clean text into shingles.", e);
            return new int[0];
        }
    }

    /** Ad-hoc manual test; reads two local files and compares them. */
    public static void main(String[] args) throws Exception {
        DuplicatePageIndexer dpi = new DuplicatePageIndexer();
        String content = new String(Files.readAllBytes(Paths.get("/tmp/ache.html")));
        String content2 = new String(Files.readAllBytes(Paths.get("/tmp/ache-dup.html")));
        dpi.insert("1", content);
        System.out.println(dpi.isNearDuplicate(content));
        System.out.println(dpi.isNearDuplicate(content2));
    }

}
142 changes: 142 additions & 0 deletions src/main/java/focusedCrawler/minhash/LSH.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
package focusedCrawler.minhash;

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;

import com.google.common.collect.ArrayListMultimap;

import focusedCrawler.util.persistence.rocksdb.RocksDBHashtable;

/**
 * Locality-sensitive hashing over minhash signatures using the standard
 * banding technique: the signature is split into {@code nBands} bands of
 * {@code nRows} values each, and two items become candidates when they agree
 * on all rows of at least one band.
 */
public class LSH {

    private final int nBands;
    private final int nRows;
    private final LSHStorage bandsStorage;

    /** In-memory LSH sized so that pairs above the given Jaccard threshold collide. */
    public LSH(int nHashes, double jaccardThreshold) {
        this(nHashes, computeNumberOfBandsForThreshold(nHashes, jaccardThreshold));
    }

    /** In-memory LSH with an explicit number of bands. */
    public LSH(int nHashes, int nBands) {
        this(nHashes, nBands, new InMemoryStorage(nBands));
    }

    /** Disk-backed LSH sized for the given Jaccard threshold. */
    public LSH(int nHashes, double jaccardThreshold, String dataPath) {
        this(nHashes, computeNumberOfBandsForThreshold(nHashes, jaccardThreshold),
                new DBStorage(dataPath));
    }

    /** Disk-backed LSH with an explicit number of bands. */
    public LSH(int nHashes, int nBands, String dataPath) {
        this(nHashes, nBands, new DBStorage(dataPath));
    }

    public LSH(int nHashes, int nBands, LSHStorage bandsStorage) {
        if ((nHashes % nBands) != 0) {
            throw new IllegalArgumentException(
                    "Bands must divide nHashes (" + nHashes + ") evenly");
        }
        this.nBands = nBands;
        this.nRows = nHashes / nBands;
        this.bandsStorage = bandsStorage;
    }

    /**
     * Approximate Jaccard threshold implied by a banding scheme: (1/b)^(1/r)
     * where b = nBands and r = nHashes / nBands.
     */
    public double targetThreshold(int nHashes, int nBands) {
        // BUG FIX: the base must be 1/nBands, not 1/nHashes, to match the
        // standard (1/b)^(1/r) estimate also used by
        // computeNumberOfBandsForThreshold below.
        return Math.pow(1.0 / nBands, (double) nBands / nHashes);
    }

    /**
     * Finds the largest number of bands that evenly divides nHashes while
     * keeping the implied threshold (1/b)^(1/r) above the requested one.
     * More bands = more candidate collisions = lower effective threshold.
     */
    private static int computeNumberOfBandsForThreshold(int nHashes, double jaccardThreshold) {
        int bands = nHashes;
        while (bands > 1) {
            if ((nHashes % bands) == 0) {
                double threshold = Math.pow((double) 1.0 / bands, (double) bands / nHashes);
                if (threshold > jaccardThreshold) {
                    break;
                }
            }
            bands--;
        }
        return bands;
    }

    /** Inserts the given minhash signature under the given key, one entry per band. */
    public void insert(String key, int[] hashes) {
        for (int b = 0; b < nBands; b++) {
            bandsStorage.insertToBand(b, bandKey(b, hashes), key);
        }
    }

    /** Returns all keys that agree with the given signature on at least one band. */
    public Set<String> query(int[] hashes) {
        Set<String> candidates = new HashSet<String>();
        for (int b = 0; b < nBands; b++) {
            Collection<String> values = bandsStorage.getValues(b, bandKey(b, hashes));
            if (values != null) {
                candidates.addAll(values);
            }
        }
        return candidates;
    }

    /**
     * Builds the storage key for one band of the signature.
     * BUG FIX: a separator is appended after each hex value; previously,
     * variable-length hex strings were concatenated directly, so different
     * row values could produce identical keys (e.g. [0x1,0x23] and
     * [0x12,0x3] both yielded "123"), causing spurious candidate matches.
     * NOTE(review): this changes the persisted key format; existing RocksDB
     * data written by the old format will not be matched.
     */
    private String bandKey(int band, int[] hashes) {
        // StringBuilder instead of StringBuffer: no synchronization needed here.
        StringBuilder sb = new StringBuilder();
        for (int r = 0; r < nRows; r++) {
            sb.append(Integer.toHexString(hashes[band * nRows + r])).append('-');
        }
        return sb.toString();
    }

    /** Abstraction over where band buckets are stored (memory or disk). */
    interface LSHStorage {
        public void insertToBand(int b, String hh, String key);

        public Collection<String> getValues(int b, String hh);
    }

    /** Keeps one multimap of bucket-key to item-keys per band, in memory. */
    static class InMemoryStorage implements LSHStorage {

        private final ArrayListMultimap<String, String>[] maps;

        @SuppressWarnings("unchecked")
        public InMemoryStorage(int nBands) {
            maps = new ArrayListMultimap[nBands];
            for (int i = 0; i < nBands; i++) {
                maps[i] = ArrayListMultimap.create();
            }
        }

        @Override
        public void insertToBand(int band, String hexHash, String key) {
            maps[band].put(hexHash, key);
        }

        @Override
        public Collection<String> getValues(int band, String hexHash) {
            return maps[band].get(hexHash);
        }
    }

    /** Persists band buckets in a RocksDB-backed hashtable. */
    static class DBStorage implements LSHStorage {

        private final RocksDBHashtable<TreeSet<String>> maps;

        @SuppressWarnings({"unchecked", "rawtypes"})
        public DBStorage(String path) {
            maps = new RocksDBHashtable(path, TreeSet.class);
        }

        @Override
        public void insertToBand(int band, String hexHash, String key) {
            // Separator prevents ambiguity between e.g. band=1/hash="1x"
            // and band=11/hash="x" when nBands >= 10.
            String hashtableKey = band + ":" + hexHash;
            TreeSet<String> keysSet = maps.get(hashtableKey);
            if (keysSet == null) {
                keysSet = new TreeSet<>();
            }
            keysSet.add(key);
            maps.put(hashtableKey, keysSet);
        }

        @Override
        public Collection<String> getValues(int band, String hexHash) {
            String hashtableKey = band + ":" + hexHash;
            return maps.get(hashtableKey);
        }

    }

}
55 changes: 55 additions & 0 deletions src/main/java/focusedCrawler/minhash/MinHasher.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package focusedCrawler.minhash;

import java.util.HashSet;
import java.util.Random;
import java.util.Set;

public class MinHasher {

public final int nextPrime = 2147483587;
public final int maxValue = nextPrime - 1;
public int[] coeffA;
public int[] coeffB;
public int numOfHashes;

public MinHasher(int numOfHashes) {
this.numOfHashes = numOfHashes;
this.coeffA = pickRandCoefficients(numOfHashes);
this.coeffB = pickRandCoefficients(numOfHashes);
}

public int[] minHashSignature(Set<Integer> hashedShingles) {
int[] signatures = new int[numOfHashes];
for (int i = 0; i < numOfHashes; i++) {
int min = nextPrime + 1;
for (int shingle : hashedShingles) {
shingle = shingle % maxValue;
int h = (coeffA[i] * shingle + coeffB[i]) % nextPrime;
if (h < min) {
min = h;
}
}
signatures[i] = min;
}
return signatures;
}

private int[] pickRandCoefficients(int k) {
int[] rands = new int[k];
HashSet<Integer> seen = new HashSet<Integer>(k);
Random random = new Random();
int i = 0;
while (k > 0) {
int randIndex = random.nextInt(maxValue);
while (seen.contains(randIndex)) {
randIndex = random.nextInt(maxValue);
}
rands[i] = randIndex;
seen.add(randIndex);
k = k - 1;
i++;
}
return rands;
}

}
66 changes: 66 additions & 0 deletions src/main/java/focusedCrawler/util/Tokenizers.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package focusedCrawler.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.google.common.base.Charsets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

/**
 * Tokenization helpers. Currently provides a word-shingle (n-gram) tokenizer
 * built on Lucene's ShingleAnalyzerWrapper, with optional murmur3 hashing of
 * the resulting tokens.
 */
public class Tokenizers {

    private static final HashFunction murmur = Hashing.murmur3_32();

    /** Splits text into overlapping word shingles of a fixed size. */
    public static class ShingleTokenizer {

        private final Analyzer analyzer;

        public ShingleTokenizer(int size) {
            this.analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), size);
        }

        /** Returns the list of shingles of the given text, in stream order. */
        public List<String> tokenize(String cleanText) {
            // BUG FIX: try-with-resources closes the TokenStream even when
            // incrementToken() throws (previously it leaked), and end() is
            // called before close() as the Lucene TokenStream contract requires.
            try (TokenStream ts = analyzer.tokenStream("cleanText", cleanText)) {
                CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                List<String> tokens = new ArrayList<String>();
                while (ts.incrementToken()) {
                    tokens.add(cattr.toString());
                }
                ts.end();
                return tokens;
            } catch (IOException e) {
                // BUG FIX: typo "Shigle" -> "Shingle" in the error message.
                throw new RuntimeException("Shingle tokenization failed for string: " + cleanText, e);
            }
        }

        /** Returns the distinct shingles of the given text. */
        public Set<String> tokensSet(String cleanText) {
            return new HashSet<String>(tokenize(cleanText));
        }

        /** Returns the distinct murmur3-32 hashes of the text's shingles. */
        public Set<Integer> hashedTokenSet(String cleanText) {
            HashSet<Integer> hashedTokens = new HashSet<>();
            for (String token : tokenize(cleanText)) {
                hashedTokens.add(murmur.hashString(token, Charsets.UTF_8).asInt());
            }
            return hashedTokens;
        }

    }

    /** Factory for a shingle tokenizer producing n-grams of the given size. */
    public static ShingleTokenizer shingles(int size) {
        return new ShingleTokenizer(size);
    }

}
Loading

0 comments on commit 9e35f55

Please sign in to comment.