Skip to content

Commit

Permalink
Added some documentation and code formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed Mar 8, 2018
1 parent e725561 commit a17d054
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
import focusedCrawler.util.Tokenizers;
import focusedCrawler.util.Tokenizers.ShingleTokenizer;

/**
* Builds an "index" of near-duplicate pages using locality-sensitive hashing.
*
* @author aeciosantos
*
*/
public class DuplicatePageIndexer {

public static Logger logger = LoggerFactory.getLogger(DuplicatePageIndexer.class);
Expand Down
46 changes: 29 additions & 17 deletions src/main/java/focusedCrawler/minhash/LSH.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,42 +9,52 @@

import focusedCrawler.util.persistence.rocksdb.RocksDBHashtable;

/**
* Implementation of the locality-sensitive hashing (LSH) algorithm for finding near-duplicate content.
*
* @author aeciosantos
*
*/
public class LSH {

private final int nBands;
private final int nRows;
private final LSHStorage bandsStorage;

/**
 * Creates an LSH instance whose number of bands is derived from a similarity threshold.
 * Delegates to {@link #LSH(int, int)}, passing the band count returned by
 * {@code computeNumberOfBandsForThreshold(nHashes, jaccardThreshold)}.
 *
 * @param nHashes the total number of hashes that will be split into bands
 * @param jaccardThreshold the similarity threshold used to choose the number of bands
 */
public LSH(int nHashes, double jaccardThreshold) {
this(nHashes, computeNumberOfBandsForThreshold(nHashes, jaccardThreshold));
}

/**
 * Creates an LSH instance with an explicit number of bands, backed by a new
 * {@code InMemoryStorage} constructed for {@code nBands} bands.
 *
 * @param nHashes the total number of hashes that will be split into bands
 * @param nBands the number of bands; must divide {@code nHashes} evenly
 * @throws IllegalArgumentException if {@code nBands} does not divide {@code nHashes} evenly
 */
public LSH(int nHashes, int nBands) {
this(nHashes, nBands, new InMemoryStorage(nBands));
}

/**
 * Creates an LSH instance whose number of bands is derived from a similarity threshold and
 * whose band data is kept in a new {@code DBStorage} created from {@code dataPath}.
 *
 * @param nHashes the total number of hashes that will be split into bands
 * @param jaccardThreshold the similarity threshold used to choose the number of bands
 * @param dataPath the value handed to the {@code DBStorage} constructor
 *        (presumably the directory of the on-disk database; confirm against DBStorage)
 */
public LSH(int nHashes, double jaccardThreshold, String dataPath) {
this(nHashes, computeNumberOfBandsForThreshold(nHashes, jaccardThreshold), new DBStorage(dataPath));
this(nHashes, computeNumberOfBandsForThreshold(nHashes, jaccardThreshold),
new DBStorage(dataPath));
}

/**
 * Creates an LSH instance with an explicit number of bands whose band data is kept in a
 * new {@code DBStorage} created from {@code dataPath}.
 *
 * @param nHashes the total number of hashes that will be split into bands
 * @param nBands the number of bands; must divide {@code nHashes} evenly
 * @param dataPath the value handed to the {@code DBStorage} constructor
 *        (presumably the directory of the on-disk database; confirm against DBStorage)
 * @throws IllegalArgumentException if {@code nBands} does not divide {@code nHashes} evenly
 */
public LSH(int nHashes, int nBands, String dataPath) {
this(nHashes, nBands, new DBStorage(dataPath));
}

/**
 * Creates an LSH instance that splits {@code nHashes} hashes into {@code nBands} bands of
 * {@code nHashes / nBands} rows each and keeps band data in the given storage. All other
 * constructors delegate to this one.
 *
 * @param nHashes the total number of hashes that will be split into bands
 * @param nBands the number of bands; must divide {@code nHashes} evenly
 * @param bandsStorage the storage used to hold the contents of each band
 * @throws IllegalArgumentException if {@code nBands} does not divide {@code nHashes} evenly
 */
public LSH(int nHashes, int nBands, LSHStorage bandsStorage) {
// Every band must contain the same number of rows, so reject uneven splits up front.
if ((nHashes % nBands) != 0) {
throw new IllegalArgumentException("Bands must divide nHashes (" + nHashes + ") evenly");
throw new IllegalArgumentException(
"Bands must divide nHashes (" + nHashes + ") evenly");
}
this.nBands = nBands;
// Exact division: guaranteed by the divisibility check above.
this.nRows = nHashes / nBands;
this.bandsStorage = bandsStorage;
}

/**
 * Estimates the similarity threshold at which the banding scheme starts reporting pairs as
 * candidates, using the standard approximation {@code (1 / b)^(1 / r)}, where {@code b} is
 * the number of bands and {@code r} the number of rows per band.
 *
 * @param nHashes the total number of hashes in a signature
 * @param nBands the number of bands the signature is split into
 * @return the approximate similarity threshold for the given configuration
 * @throws ArithmeticException if {@code nBands} is zero
 */
public double targetThreshold(int nHashes, int nBands) {
    // Rows per band; integer division mirrors how nRows is derived in the constructor.
    int rowsPerBand = nHashes / nBands;
    // The base of the power is 1/bands (not 1/nHashes): threshold ~ (1/b)^(1/r).
    return Math.pow(1.0 / nBands, 1.0 / rowsPerBand);
}


/**
* Finds the number of bands that need to be used for a given similarity threshold.
*
* @param nHashes
* @param jaccardThreshold
* @return
*/
private static int computeNumberOfBandsForThreshold(int nHashes, double jaccardThreshold) {
int bands = nHashes;
while (bands > 1) {
Expand Down Expand Up @@ -86,13 +96,15 @@ public Set<String> query(int[] hashes) {
return candidates;
}

/**
 * Storage abstraction used by {@code LSH} to keep the contents of each band. The visible
 * implementations are {@code InMemoryStorage} and {@code DBStorage}.
 */
interface LSHStorage {
protected interface LSHStorage {

/**
 * Stores {@code key} under the hash {@code hh} in band {@code b}.
 *
 * @param b the band index
 * @param hh the hash of the band's rows (implementations name this {@code hexHash})
 * @param key the value to store (presumably the identifier of the indexed item;
 *        confirm against callers)
 */
public void insertToBand(int b, String hh, String key);

/**
 * Returns the values stored under the hash {@code hh} in band {@code b}.
 *
 * @param b the band index
 * @param hh the hash of the band's rows (implementations name this {@code hexHash})
 * @return the stored values for that band and hash
 */
public Collection<String> getValues(int b, String hh);

}
static class InMemoryStorage implements LSHStorage {

protected static class InMemoryStorage implements LSHStorage {

private final ArrayListMultimap<String, String>[] maps;

Expand All @@ -113,7 +125,7 @@ public Collection<String> getValues(int band, String hexHash) {
}
}

static class DBStorage implements LSHStorage {
protected static class DBStorage implements LSHStorage {

private final RocksDBHashtable<TreeSet<String>> maps;

Expand Down
27 changes: 24 additions & 3 deletions src/main/java/focusedCrawler/minhash/MinHasher.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,37 @@
import java.util.Random;
import java.util.Set;

/**
* Generates a family of hash functions that can be used for locality-sensitive hashing (LSH).
*/
public class MinHasher {

public final int nextPrime = 2147483587;
public final int maxValue = nextPrime - 1;
public int[] coeffA;
public int[] coeffB;
public int numOfHashes;
private int seed;

/**
 * Creates a family of universal hash functions using the fixed default seed {@code 1947}
 * (an arbitrarily chosen constant). Delegates to {@link #MinHasher(int, int)}.
 *
 * @param numOfHashes The number of hash functions to generate.
 */
public MinHasher(int numOfHashes) {
this(numOfHashes, 1947);
}

/**
 * Creates a family of universal hash functions. Stores the number of hashes and the seed,
 * then fills the {@code coeffA} and {@code coeffB} arrays with {@code numOfHashes}
 * coefficients each via {@code pickRandCoefficients}.
 *
 * @param numOfHashes The number of hash functions to generate.
 * @param seed The seed number used to generate the hash functions.
 */
public MinHasher(int numOfHashes, int seed) {
this.numOfHashes = numOfHashes;
// The seed must be assigned before the coefficients are generated, because
// pickRandCoefficients reads this field to seed its random number generator.
this.seed = seed;
// NOTE(review): pickRandCoefficients creates a new Random(seed) on every call, so both
// calls below appear to draw the same sequence and coeffA/coeffB would hold identical
// values -- confirm whether that is intended.
this.coeffA = pickRandCoefficients(numOfHashes);
this.coeffB = pickRandCoefficients(numOfHashes);
}
Expand All @@ -37,7 +58,7 @@ public int[] minHashSignature(Set<Integer> hashedShingles) {
private int[] pickRandCoefficients(int k) {
int[] rands = new int[k];
HashSet<Integer> seen = new HashSet<Integer>(k);
Random random = new Random();
Random random = new Random(seed);
int i = 0;
while (k > 0) {
int randIndex = random.nextInt(maxValue);
Expand All @@ -52,4 +73,4 @@ private int[] pickRandCoefficients(int k) {
return rands;
}

}
}
11 changes: 6 additions & 5 deletions src/main/java/focusedCrawler/util/Tokenizers.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,21 +41,22 @@ public List<String> tokenize(String cleanText) {
ts.close();
return tokens;
} catch (IOException e) {
throw new RuntimeException("Shigle tokenization failed for string: "+cleanText, e);
throw new RuntimeException(
"Shigle tokenization failed for string: " + cleanText, e);
}
}

/**
 * Tokenizes the given text and returns its distinct tokens.
 *
 * @param cleanText the text to tokenize
 * @return a new set containing each token produced by {@code tokenize} exactly once
 */
public Set<String> tokensSet(String cleanText) {
    List<String> allTokens = tokenize(cleanText);
    Set<String> distinctTokens = new HashSet<String>(allTokens);
    return distinctTokens;
}

/**
 * Tokenizes the given text and returns the distinct integer hashes of its tokens.
 *
 * @param cleanText the text to tokenize
 * @return a new set with one {@code int} hash per distinct token hash value
 */
public Set<Integer> hashedTokenSet(String cleanText) {
HashSet<Integer> hashedTokens = new HashSet<>();
for(String token : tokenize(cleanText)) {
for (String token : tokenize(cleanText)) {
// Hash the token's UTF-8 encoding with the `murmur` hash function and keep the result as an int.
hashedTokens.add(murmur.hashString(token, Charsets.UTF_8).asInt());
}
return hashedTokens;
}
}

}

Expand Down

0 comments on commit a17d054

Please sign in to comment.