Commit: Minor cleanup - comments

kkrugler committed Nov 27, 2013
1 parent 6495316 commit 7f19b0a
Showing 3 changed files with 30 additions and 7 deletions.
@@ -3,6 +3,12 @@
import java.io.IOException;
import java.io.Reader;

+/**
+ * A StringReader that has a reset() method, so that we can re-use it
+ * in the SolrAnalyzer, versus having to create a new one every time
+ * we get called with a new String to parse.
+ *
+ */
public class ReusableStringReader extends Reader {

private char[] _chars;
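For context, here is a minimal sketch of how a reader like this might be implemented. The diff only shows the class header and the _chars field; the _pos field, the reset() signature, and the method bodies below are assumptions, not code from this commit.

    import java.io.IOException;
    import java.io.Reader;

    public class ReusableStringReader extends Reader {

        private char[] _chars;      // shown in the diff above
        private int _pos;           // assumed position tracker, not shown in the diff

        // Assumed reset() signature: point the reader at a new String without
        // allocating a new Reader object.
        public void reset(String s) {
            _chars = s.toCharArray();
            _pos = 0;
        }

        @Override
        public int read(char[] cbuf, int off, int len) throws IOException {
            if (_pos >= _chars.length) {
                return -1;
            }

            int n = Math.min(len, _chars.length - _pos);
            System.arraycopy(_chars, _pos, cbuf, off, n);
            _pos += n;
            return n;
        }

        @Override
        public void close() throws IOException {
            // Nothing to release; the buffer is simply replaced on the next reset() call.
        }
    }
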
29 changes: 23 additions & 6 deletions src/main/java/com/scaleunlimited/textfeatures/SolrAnalyzer.java
@@ -32,6 +32,11 @@
import org.apache.solr.schema.IndexSchema;
import org.xml.sax.SAXException;

+/**
+ * A text analyzer that parses a string and turns it into a list
+ * of strings, using the Solr analysis chain that schema.xml defines
+ * for a given field type.
+ */
@SuppressWarnings("serial")
public class SolrAnalyzer implements Serializable {

@@ -63,9 +68,11 @@ private synchronized void init() {
_token = new ThreadLocal<CharTermAttribute>();

try {
+// We hard-code the analyzer to be the one for the field type "text_en", which must
+// be defined in the schema.xml file in src/main/resources/solrparser/
_analyzer = getAnalyzer("text_en");
} catch (Exception e) {
throw new RuntimeException("Can't creating Solr-based analyzer", e);
throw new RuntimeException("Can't create Solr-based analyzer", e);
}
}
}
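
To make the init() logic above more concrete, here is a sketch of the standard Lucene 4.x loop for driving an analyzer like the one returned by getAnalyzer("text_en"). The field name "content" and the wrapper class are illustrative assumptions, not code from this commit.

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class AnalyzerUsageSketch {

        // Run <text> through <analyzer> and collect the resulting terms.
        public static List<String> parse(Analyzer analyzer, String text) throws IOException {
            List<String> terms = new ArrayList<String>();
            TokenStream stream = analyzer.tokenStream("content", new StringReader(text));
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

            stream.reset();
            while (stream.incrementToken()) {
                terms.add(termAttr.toString());
            }

            stream.end();
            stream.close();
            return terms;
        }
    }
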
@@ -146,19 +153,18 @@ private String filterWord(String curWord) {
}

/**
- * Leverage the Solr schema.xml analysis chain to get the right analyzer for the target language.
+ * Leverage the Solr schema.xml analysis chain to get the right analyzer for the target field type.
*
- * @param solrCoreDirName
- * @param language target language
- * @param modifier field name modifier (e.g. "_raw" for no stemming or special splitting).
+ * @param fieldTypeName target field type
* @return
* @throws IOException
* @throws ParserConfigurationException
* @throws SAXException
*/
private Analyzer getAnalyzer(String fieldTypeName) throws IOException, ParserConfigurationException, SAXException {
// Create a temp location for Solr home, which has a skeleton solr.xml that
-// references the Solr core directory.
+// references the Solr core directory. Note that as of Solr 4.5, this is no
+// longer necessary (we don't need a solr.xml)
File tmpSolrHome = new File(FileUtils.getTempDirectory(), UUID.randomUUID().toString());
File solrCoreDir = makeSolrCoreDir(tmpSolrHome);
String coreName = solrCoreDir.getName();
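
The rest of getAnalyzer() isn't shown in this diff. Under the assumption that it loads the temporary core and then asks the schema for the field type's analyzer, a continuation might look roughly like the following (Solr 4.x API; tmpSolrHome, coreName, and fieldTypeName come from the code above, while everything else, including the assumed imports of org.apache.solr.core.CoreContainer and org.apache.solr.core.SolrCore, is a guess rather than the commit's actual code):

        // Load the temporary Solr home and grab the analyzer that schema.xml
        // configures for the requested field type (assumed approach).
        CoreContainer container = new CoreContainer(tmpSolrHome.getAbsolutePath());
        container.load();

        SolrCore core = container.getCore(coreName);
        try {
            IndexSchema schema = core.getLatestSchema();
            return schema.getFieldTypeByName(fieldTypeName).getAnalyzer();
        } finally {
            core.close();
        }
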
@@ -201,6 +207,17 @@ private Analyzer getAnalyzer(String fieldTypeName) throws IOException, ParserConfigurationException, SAXException {
}
}

+/**
+ * We rely on a file (as a resource) that contains a list of Solr-related files
+ * we need to extract from our jar and write out to a temp location. There's no
+ * good way to iterate over resources in a jar, unfortunately, so we use this
+ * somewhat brittle work-around, which means that if you add, remove, or rename
+ * files in the Solr configuration, you have to remember to edit filelist.txt.
+ *
+ * @param solrHomeDir temporary directory where we should build our Solr home dir.
+ * @return location of the collection directory.
+ * @throws IOException
+ */
private File makeSolrCoreDir(File solrHomeDir) throws IOException {
List<String> filenames = IOUtils.readLines(SolrAnalyzer.class.getResourceAsStream("/solrparser/filelist.txt"));

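The body of makeSolrCoreDir() is mostly elided above. As a rough sketch of the extract-from-jar approach the javadoc describes, something like the following would work, using what are presumably the Apache commons-io FileUtils and IOUtils classes already referenced by the surrounding code. The core directory name "collection1" and the wrapper class are assumptions; the commit's actual implementation isn't shown.

    import java.io.File;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.List;

    import org.apache.commons.io.FileUtils;
    import org.apache.commons.io.IOUtils;

    public class SolrCoreExtractorSketch {

        // Read filelist.txt from the jar and copy each listed resource into a
        // core directory under the temporary Solr home.
        public static File makeSolrCoreDir(File solrHomeDir) throws IOException {
            List<String> filenames = IOUtils.readLines(
                SolrCoreExtractorSketch.class.getResourceAsStream("/solrparser/filelist.txt"));

            // "collection1" is an assumed core name; the real name isn't visible in the diff.
            File coreDir = new File(solrHomeDir, "collection1");
            for (String filename : filenames) {
                File destFile = new File(coreDir, filename);
                destFile.getParentFile().mkdirs();

                InputStream is = SolrCoreExtractorSketch.class.getResourceAsStream("/solrparser/" + filename);
                try {
                    FileUtils.copyInputStreamToFile(is, destFile);
                } finally {
                    IOUtils.closeQuietly(is);
                }
            }

            return coreDir;
        }
    }
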
@@ -12,7 +12,7 @@
public class StopwordsWorkflowTest {

@Test
-public void test() throws Exception {
+public void testTopDFTerms() throws Exception {
StopwordsOptions options = new StopwordsOptions();
options.setTestMode(true);
options.setInput("src/test/resources/mahout-emails-big.tsv");
