wdk-core-topicmodeling/src/main/java/de/tudarmstadt/ukp/experiments/wdk/topicmodeling/estimation/EstimateTopicModel.java

package de.tudarmstadt.ukp.experiments.wdk.topicmodeling.estimation;

import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasReader;
import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelTrainer;
import de.tudarmstadt.ukp.dkpro.core.textnormalizer.annotations.TrailingCharacterRemover;
import de.tudarmstadt.ukp.dkpro.core.textnormalizer.casfilter.CasFilter_ImplBase;
import de.tudarmstadt.ukp.experiments.wdk.io.util.PipelineUtils;
import de.tudarmstadt.ukp.experiments.wdk.normalization.filter.WdkAnnotationBasedCasFilter;
import org.apache.commons.cli.*;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;

/**
 * Command-line entry point that estimates an LDA topic model from a collection of binary CAS
 * files.
 * <p>
 * The tool parses its options into static fields, creates a {@link BinaryCasReader} over the
 * configured source directory and file patterns, assembles a pre-processing pipeline that ends in
 * a {@link MalletLdaTopicModelTrainer}, and runs that pipeline once.
 * <p>
 * All configuration is kept in static fields that are written once by
 * {@link #setOptions(String[])}; the class is meant to be used through {@link #main(String[])}
 * only.
 */
public class EstimateTopicModel
{
    private static final Logger logger = LoggerFactory.getLogger(EstimateTopicModel.class);

    /* Reader parameters */
    /** Base directory that is searched for binary CAS files (default: current directory). */
    private static File casDir;
    /** File patterns, relative to {@link #casDir}, selecting the CAS files to read. */
    private static String[] casFilePatterns;

    /* LDA parameters */
    /** Target file the trained model is written to (required). */
    private static File modelFile;
    /** Number of topics to estimate (default: 50). */
    private static int nTopics;
    /** Number of training iterations (default: 500). */
    private static int nIterations;
    /** Number of threads handed to the trainer (default: 1). */
    private static int nThreads;
    /** Use sentences rather than whole documents for estimation. */
    private static boolean useSentences;
    /** Alpha value; handed to the trainer as {@code PARAM_ALPHA_SUM} (default: 1.0). */
    private static float alpha;
    /** Beta value handed to the trainer (default: 0.01). */
    private static float beta;

    /* Pre-processing parameters */
    /** Stopwords file; {@code null} disables stopword removal. */
    private static String stopwordsFile;
    /** Fully qualified (or otherwise user-supplied) annotation type name used by the filters. */
    private static String typeName;
    /** Regular expression for the regex filter; {@code null} disables that filter. */
    private static String textFilterRegex;
    /** Use the lemma value instead of the token itself as the trainer's feature. */
    private static boolean useLemma;
    /** POS tags for the POS filter; {@code null} disables that filter. */
    private static String[] pos;
    /** Minimum length passed to the trailing-character remover and the trainer (default: 3). */
    private static int minLength;
    /** File listing the only words to use; {@code null} disables that filter. */
    private static String wordsFile;
    /** Allowed div types; {@code null} disables the div type CAS filter. */
    private static String[] divTypes;

    /**
     * Parses the command line, builds reader and pipeline, and runs the topic model estimation.
     * <p>
     * On an invalid command line the usage help is printed and the JVM exits with status 1.
     *
     * @param args
     *            the command-line arguments
     * @throws IOException
     *             if running the pipeline fails with an I/O error
     * @throws UIMAException
     *             if a UIMA component cannot be created or fails during processing
     */
    public static void main(String[] args)
            throws IOException, UIMAException
    {
        setOptions(args);

        logger.info("Estimating {} topics from '{}' (model file: '{}').", nTopics, casDir,
                modelFile);

        CollectionReaderDescription reader = createReaderDescription(BinaryCasReader.class,
                BinaryCasReader.PARAM_SOURCE_LOCATION, casDir,
                BinaryCasReader.PARAM_PATTERNS, casFilePatterns);

        SimplePipeline.runPipeline(reader, configurePipeline());
    }

    /**
     * Assembles the analysis engines for pre-processing and model training from the static
     * configuration fields.
     * <p>
     * Optional filters (stopwords, regex, POS, word list) are only added when the corresponding
     * option was given. The trailing-character remover and the LDA trainer are always added, in
     * that order, after the filters.
     *
     * @return the engine descriptions to run; a single aggregate when div types were configured,
     *         otherwise one entry per engine
     * @throws ResourceInitializationException
     *             if one of the engine descriptions cannot be created
     */
    private static AnalysisEngineDescription[] configurePipeline()
            throws ResourceInitializationException
    {
        // NOTE(review): the feature path is always derived from Token, even when a different
        // typeName (e.g. NamedEntity) was requested; typeName only reaches the filters below.
        // Confirm whether that is intended.
        String featurePath = useLemma
                ? Token.class.getCanonicalName() + "/lemma/value"
                : Token.class.getCanonicalName();

        List<AnalysisEngineDescription> engines = new ArrayList<>();

        /* add stopwords remover */
        if (stopwordsFile != null) {
            engines.add(PipelineUtils.createStopwordsRemover(stopwordsFile));
        }

        /* add regex token filter */
        if (textFilterRegex != null) {
            engines.add(PipelineUtils.createRegexFilter(textFilterRegex, typeName));
        }

        /* add POS filter */
        if (pos != null) {
            engines.add(PipelineUtils.createPOSFilter(pos, typeName));
        }

        /* add words filter */
        if (wordsFile != null) {
            engines.add(PipelineUtils.createAnnotationByTextFilter(wordsFile, typeName));
        }

        /* remove trailing chars */
        engines.add(createEngineDescription(TrailingCharacterRemover.class,
                TrailingCharacterRemover.PARAM_MIN_TOKEN_LENGTH, minLength));

        /* train the model */
        // NOTE(review): the CLI describes alpha as a per-topic value, but it is passed as
        // PARAM_ALPHA_SUM here; verify against the trainer's documentation.
        engines.add(createEngineDescription(MalletLdaTopicModelTrainer.class,
                MalletLdaTopicModelTrainer.PARAM_N_ITERATIONS, nIterations,
                MalletLdaTopicModelTrainer.PARAM_NUM_THREADS, nThreads,
                MalletLdaTopicModelTrainer.PARAM_N_TOPICS, nTopics,
                MalletLdaTopicModelTrainer.PARAM_TARGET_LOCATION, modelFile,
                MalletLdaTopicModelTrainer.PARAM_TOKEN_FEATURE_PATH, featurePath,
                MalletLdaTopicModelTrainer.PARAM_MIN_TOKEN_LENGTH, minLength,
                MalletLdaTopicModelTrainer.PARAM_ALPHA_SUM, alpha,
                MalletLdaTopicModelTrainer.PARAM_BETA, beta,
                MalletLdaTopicModelTrainer.PARAM_COVERING_ANNOTATION_TYPE,
                useSentences ? Sentence.class.getName() : null));

        if (divTypes == null) {
            return engines.toArray(new AnalysisEngineDescription[0]);
        }

        /* add div type filter */
        // The CAS filter goes to the front of the list and all engines are wrapped into one
        // aggregate built by CasFilter_ImplBase (presumably so that the filter decides whether
        // the remaining engines see a CAS at all -- confirm against CasFilter_ImplBase).
        engines.add(0, createEngineDescription(
                WdkAnnotationBasedCasFilter.class,
                WdkAnnotationBasedCasFilter.PARAM_ALLOWED_DIV_TYPES, divTypes));
        return new AnalysisEngineDescription[] {
                CasFilter_ImplBase.createAggregateBuilderDescription(engines) };
    }

    /**
     * Parses the command line into the static configuration fields.
     * <p>
     * If the arguments cannot be parsed, a numeric option is malformed, or the required model
     * file is missing, the reason and the usage help are printed and the JVM exits with status 1.
     *
     * @param args
     *            the command-line arguments
     */
    private static void setOptions(String[] args)
    {
        Options options = buildOptions();

        try {
            CommandLine line = new DefaultParser().parse(options, args);

            // The model file is the only option without a usable default; check it explicitly
            // instead of relying on a NullPointerException from new File(null).
            String modelPath = line.getOptionValue("modelFile");
            if (modelPath == null) {
                throw new ParseException("Missing required option: -m/--modelFile");
            }

            // Apply the default that the help text advertises for the source directory.
            casDir = new File(line.getOptionValue("sourceDir", "."));
            casFilePatterns = line.hasOption("pattern") ? line.getOptionValues("pattern")
                    : new String[] { "[+]*/ocr/*/*.bin" };

            modelFile = new File(modelPath);
            nTopics = Integer.parseInt(line.getOptionValue("topics", "50"));
            nIterations = Integer.parseInt(line.getOptionValue("iterations", "500"));
            nThreads = Integer.parseInt(line.getOptionValue("CPUs", "1"));

            stopwordsFile = line.getOptionValue("stopWords");
            typeName = resolveTypeName(line.getOptionValue("typeName", Token.class.getName()));
            textFilterRegex = line.getOptionValue("regex");
            useLemma = line.hasOption("lemma");
            pos = line.getOptionValues("pos");
            minLength = Integer.parseInt(line.getOptionValue("minLength", "3"));
            wordsFile = line.getOptionValue("wordsFile");
            divTypes = line.getOptionValues("divType");
            useSentences = line.hasOption("sentences");
            alpha = Float.parseFloat(line.getOptionValue("alpha", "1.0"));
            beta = Float.parseFloat(line.getOptionValue("beta", "0.01"));
        }
        catch (ParseException | NumberFormatException e) {
            // Tell the user what was wrong before showing the generic usage text.
            System.err.println("Invalid command line: " + e.getMessage());
            new HelpFormatter().printHelp("java -jar EstimateTopicModel.jar", options);
            System.exit(1);
        }
    }

    /**
     * Declares all command-line options understood by this tool.
     *
     * @return the option definitions used for parsing and for the usage help
     */
    private static Options buildOptions()
    {
        Options options = new Options();
        options.addOption("s", "sourceDir", true,
                "Base directory containing binary CAS'\n(default: '.').");
        options.addOption("p", "pattern", true,
                "File pattern(s) for binary CAS files\n(default: [+]*/ocr/*/*.bin).");

        options.addOption("m", "modelFile", true, "Target file in which to store model.");
        options.addOption("t", "topics", true, "Number of topics to generate (default: 50).");
        options.addOption("i", "iterations", true,
                "Number of iterations during model generation (default: 500).");
        options.addOption("c", "CPUs", true, "Number of CPUs/threads to use (default: 1).");

        options.addOption("w", "stopWords", true,
                "Stopwords file; if none specified, don't filter stopwords.");
        options.addOption("y", "typeName", true,
                "Type to use for model generation, e.g. NamedEntity (default: Token)");
        options.addOption(
                "r",
                "regex",
                true,
                "Regular expression for filtering: if given, only retain tokens that match this regex, e.g. '[A-Z].{2,}' for tokens that start with a capital letter and have at least a length of three.");
        options.addOption("l", "lemma", false,
                "Use lemma instead of original word form where available.");
        options.addOption("f", "pos", true, "POS tags to use.");
        options.addOption("n", "minLength", true,
                "Minimum token (or other type) length\n(default: 3).");
        options.addOption("W", "wordsFile", true, "Use only words listed in this file.");
        options.addOption("d", "divType", true, "Allowed div type (multiple allowed, e.g. 'Chapter').");
        options.addOption("S", "sentences", false,
                "Use sentences instead of the whole document for model estimation.");
        options.addOption("a", "alpha", true,
                "Alpha value (symmetric for all topics) for Dirichlet process.");
        options.addOption("b", "beta", true, "Beta value for Dirichlet process.");
        return options;
    }

    /**
     * Expands the short type names "NamedEntity" and "Token" (matched case-insensitively and
     * independently of the default locale) to their fully qualified class names.
     *
     * @param name
     *            the type name given on the command line; must not be {@code null}
     * @return the fully qualified class name for a known short name, otherwise {@code name}
     *         unchanged
     */
    private static String resolveTypeName(String name)
    {
        if ("namedentity".equalsIgnoreCase(name)) {
            return NamedEntity.class.getName();
        }
        if ("token".equalsIgnoreCase(name)) {
            return Token.class.getName();
        }
        return name;
    }
}