From 7f19b0adfdbff6e8504f925843741110d4dc8f88 Mon Sep 17 00:00:00 2001
From: Ken Krugler <ken@transpac.com>
Date: Wed, 27 Nov 2013 15:41:10 -0800
Subject: [PATCH] Minor cleanup - comments

---
 .../textfeatures/ReusableStringReader.java    |  6 ++++
 .../textfeatures/SolrAnalyzer.java            | 29 +++++++++++++++----
 .../stopwords/StopwordsWorkflowTest.java      |  2 +-
 3 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/src/main/java/com/scaleunlimited/textfeatures/ReusableStringReader.java b/src/main/java/com/scaleunlimited/textfeatures/ReusableStringReader.java
index d5fc015..923658c 100644
--- a/src/main/java/com/scaleunlimited/textfeatures/ReusableStringReader.java
+++ b/src/main/java/com/scaleunlimited/textfeatures/ReusableStringReader.java
@@ -3,6 +3,12 @@
 import java.io.IOException;
 import java.io.Reader;
 
+/**
+ * A string-backed Reader that has a reset() method, so that we can reuse it
+ * in the SolrAnalyzer, versus having to create a new one every time
+ * we get called with a new String to parse.
+ *
+ */
 public class ReusableStringReader extends Reader {
 
     private char[] _chars;
diff --git a/src/main/java/com/scaleunlimited/textfeatures/SolrAnalyzer.java b/src/main/java/com/scaleunlimited/textfeatures/SolrAnalyzer.java
index b3fa5cf..12d9596 100644
--- a/src/main/java/com/scaleunlimited/textfeatures/SolrAnalyzer.java
+++ b/src/main/java/com/scaleunlimited/textfeatures/SolrAnalyzer.java
@@ -32,6 +32,11 @@
 import org.apache.solr.schema.IndexSchema;
 import org.xml.sax.SAXException;
 
+/**
+ * A text analyzer that parses a string and turns it into a list
+ * of strings, using the analysis chain of a field type defined in the Solr schema.xml.
+ *
+ */
 @SuppressWarnings("serial")
 public class SolrAnalyzer implements Serializable {
 
@@ -63,9 +68,11 @@ private synchronized void init() {
             _token = new ThreadLocal<CharTermAttribute>();
 
             try {
+                // We hard-code the analyzer to be the one for the field type "text_en", which must
+                // be defined in the schema.xml file in src/main/resources/solrparser/
                 _analyzer = getAnalyzer("text_en");
             } catch (Exception e) {
-                throw new RuntimeException("Can't creating Solr-based analyzer", e);
+                throw new RuntimeException("Can't create Solr-based analyzer", e);
             }
         }
     }
@@ -146,11 +153,9 @@ private String filterWord(String curWord) {
     }
 
     /**
-     * Leverage the Solr schema.xml analysis chain to get the right analyzer for the target language.
+     * Leverage the Solr schema.xml analysis chain to get the right analyzer for the target field type.
      * 
-     * @param solrCoreDirName
-     * @param language target language
-     * @param modifier field name modifier (e.g. "_raw" for no stemming or special splitting).
+     * @param fieldTypeName target field type
      * @return
      * @throws IOException
      * @throws ParserConfigurationException
@@ -158,7 +163,8 @@ private String filterWord(String curWord) {
      */
     private Analyzer getAnalyzer(String fieldTypeName) throws IOException, ParserConfigurationException, SAXException {
         // Create a temp location for Solr home, which has a skeleton solr.xml that
-        // references the Solr core directory.
+        // references the Solr core directory. Note that as of Solr 4.5, this is no
+        // longer necessary (we don't need a solr.xml)
         File tmpSolrHome = new File(FileUtils.getTempDirectory(), UUID.randomUUID().toString());
         File solrCoreDir = makeSolrCoreDir(tmpSolrHome);
         String coreName = solrCoreDir.getName();
@@ -201,6 +207,17 @@ private Analyzer getAnalyzer(String fieldTypeName) throws IOException, ParserCon
         }
     }
 
+    /**
+     * We rely on a file (as a resource) that contains a list of Solr-related files
+     * we need to extract from our jar and write out to a temp location. There's no
+     * good way to iterate over resources in a jar, unfortunately, so we use this
+     * somewhat brittle work-around. This means that if you add, remove, or rename
+     * files in the Solr configuration, you have to remember to edit filelist.txt.
+     * 
+     * @param solrHomeDir temporary directory where we should build our Solr home dir.
+     * @return location of the Solr core directory.
+     * @throws IOException
+     */
     private File makeSolrCoreDir(File solrHomeDir) throws IOException {
         List<String> filenames = IOUtils.readLines(SolrAnalyzer.class.getResourceAsStream("/solrparser/filelist.txt"));
         
diff --git a/src/test/java/com/scaleunlimited/stopwords/StopwordsWorkflowTest.java b/src/test/java/com/scaleunlimited/stopwords/StopwordsWorkflowTest.java
index eb35b4a..0142f45 100644
--- a/src/test/java/com/scaleunlimited/stopwords/StopwordsWorkflowTest.java
+++ b/src/test/java/com/scaleunlimited/stopwords/StopwordsWorkflowTest.java
@@ -12,7 +12,7 @@
 public class StopwordsWorkflowTest {
 
     @Test
-    public void test() throws Exception {
+    public void testTopDFTerms() throws Exception {
         StopwordsOptions options = new StopwordsOptions();
         options.setTestMode(true);
         options.setInput("src/test/resources/mahout-emails-big.tsv");