From 7f19b0adfdbff6e8504f925843741110d4dc8f88 Mon Sep 17 00:00:00 2001 From: Ken Krugler Date: Wed, 27 Nov 2013 15:41:10 -0800 Subject: [PATCH] Minor cleanup - comments --- .../textfeatures/ReusableStringReader.java | 6 ++++ .../textfeatures/SolrAnalyzer.java | 29 +++++++++++++++---- .../stopwords/StopwordsWorkflowTest.java | 2 +- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/main/java/com/scaleunlimited/textfeatures/ReusableStringReader.java b/src/main/java/com/scaleunlimited/textfeatures/ReusableStringReader.java index d5fc015..923658c 100644 --- a/src/main/java/com/scaleunlimited/textfeatures/ReusableStringReader.java +++ b/src/main/java/com/scaleunlimited/textfeatures/ReusableStringReader.java @@ -3,6 +3,12 @@ import java.io.IOException; import java.io.Reader; +/** + * A StringReader that has a reset() method, so that we can re-use it + * in the SolrAnalyzer, versus having to create a new one every time + * we get called with a new String to parse. + * + */ public class ReusableStringReader extends Reader { private char[] _chars; diff --git a/src/main/java/com/scaleunlimited/textfeatures/SolrAnalyzer.java b/src/main/java/com/scaleunlimited/textfeatures/SolrAnalyzer.java index b3fa5cf..12d9596 100644 --- a/src/main/java/com/scaleunlimited/textfeatures/SolrAnalyzer.java +++ b/src/main/java/com/scaleunlimited/textfeatures/SolrAnalyzer.java @@ -32,6 +32,11 @@ import org.apache.solr.schema.IndexSchema; import org.xml.sax.SAXException; +/** + * A text analyzer that parses a string and turns it into a list + * of strings, using the Solr analysis chain defined in schema.xml. + * + */ @SuppressWarnings("serial") public class SolrAnalyzer implements Serializable { @@ -63,9 +68,11 @@ private synchronized void init() { _token = new ThreadLocal(); try { + // We hard-code the analyzer to be the one for the field type "text_en", which must + // be defined in the schema.xml file in src/main/resources/solrparser/ _analyzer = getAnalyzer("text_en"); } catch (Exception e) { - throw new 
RuntimeException("Can't creating Solr-based analyzer", e); + throw new RuntimeException("Can't create Solr-based analyzer", e); } } } @@ -146,11 +153,9 @@ private String filterWord(String curWord) { } /** - * Leverage the Solr schema.xml analysis chain to get the right analyzer for the target language. + * Leverage the Solr schema.xml analysis chain to get the right analyzer for the target field type. * - * @param solrCoreDirName - * @param language target language - * @param modifier field name modifier (e.g. "_raw" for no stemming or special splitting). + * @param fieldTypeName target field type * @return * @throws IOException * @throws ParserConfigurationException @@ -158,7 +163,8 @@ private String filterWord(String curWord) { */ private Analyzer getAnalyzer(String fieldTypeName) throws IOException, ParserConfigurationException, SAXException { // Create a temp location for Solr home, which has a skeleton solr.xml that - // references the Solr core directory. + // references the Solr core directory. Note that as of Solr 4.5, this is no + // longer necessary (we don't need a solr.xml). File tmpSolrHome = new File(FileUtils.getTempDirectory(), UUID.randomUUID().toString()); File solrCoreDir = makeSolrCoreDir(tmpSolrHome); String coreName = solrCoreDir.getName(); @@ -201,6 +207,17 @@ private Analyzer getAnalyzer(String fieldTypeName) throws IOException, ParserCon } } + /** + * We rely on a file (as a resource) that contains a list of Solr-related files + * we need to extract from our jar and write out to a temp location. There's no + * good way to iterate over resources in a jar, unfortunately, so we use this + * somewhat brittle work-around, which means that if you add, remove, or rename + * files in the Solr configuration, you have to remember to edit filelist.txt. + * + * @param solrHomeDir temporary directory where we should build our Solr home dir. + * @return location of the collection directory. 
+ * @throws IOException + */ private File makeSolrCoreDir(File solrHomeDir) throws IOException { List filenames = IOUtils.readLines(SolrAnalyzer.class.getResourceAsStream("/solrparser/filelist.txt")); diff --git a/src/test/java/com/scaleunlimited/stopwords/StopwordsWorkflowTest.java b/src/test/java/com/scaleunlimited/stopwords/StopwordsWorkflowTest.java index eb35b4a..0142f45 100644 --- a/src/test/java/com/scaleunlimited/stopwords/StopwordsWorkflowTest.java +++ b/src/test/java/com/scaleunlimited/stopwords/StopwordsWorkflowTest.java @@ -12,7 +12,7 @@ public class StopwordsWorkflowTest { @Test - public void test() throws Exception { + public void testTopDFTerms() throws Exception { StopwordsOptions options = new StopwordsOptions(); options.setTestMode(true); options.setInput("src/test/resources/mahout-emails-big.tsv");