diff --git a/README.md b/README.md index 7d4abd1c..fa274524 100644 --- a/README.md +++ b/README.md @@ -15,20 +15,30 @@ Build jar from source with ```shell ./gradlew build ``` -and find the output JAR file as `build/libs/restructurehdfs-0.3.1-all.jar`. Then run with: +and find the output JAR file as `build/libs/restructurehdfs-0.3.3-all.jar`. Then run with: ```shell -java -jar restructurehdfs-0.3.1-all.jar +java -jar restructurehdfs-0.3.3-all.jar --hdfs-uri --output-directory [ ...] +``` +or you can use the short form as well like - +```shell +java -jar restructurehdfs-0.3.3-all.jar -u -o [ ...] ``` -By default, this will output the data in CSV format. If JSON format is preferred, use the following instead: +To display the usage and all available options you can use the help option as follows - +```shell +java -jar restructurehdfs-0.3.3-all.jar --help ``` -java -Dorg.radarcns.format=json -jar restructurehdfs-0.3.1-all.jar +Note that the options preceded by the `*` in the above output are required to run the app. Also note that there can be multiple input paths from which to read the files. Eg - `/topicAndroidNew/topic1 /topicAndroidNew/topic2 ...`. At least one input path is required. + +By default, this will output the data in CSV format. If JSON format is preferred, use the following instead: +```shell +java -jar restructurehdfs-0.3.3-all.jar --format json --hdfs-uri --output-directory [ ...] ``` Another option is to output the data in compressed form. All files will get the `gz` suffix, and can be decompressed with a GZIP decoder. Note that for a very small number of records, this may actually increase the file size. ``` -java -Dorg.radarcns.compression=gzip -jar restructurehdfs-0.3.1-all.jar +java -jar restructurehdfs-0.3.3-all.jar --compression gzip --hdfs-uri --output-directory [ ...] ``` -Finally, by default, files records are not deduplicated after writing. To enable this behaviour, specify the option `-Dorg.radarcns.deduplicate=true`. 
This set to false by default because of an issue with Biovotion data. Please see - [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it. +Finally, by default, file records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This is set to false by default because of an issue with Biovotion data. Please see - [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it. diff --git a/build.gradle b/build.gradle index c88bb3e6..ed49280c 100644 --- a/build.gradle +++ b/build.gradle @@ -15,6 +15,7 @@ targetCompatibility = '1.8' ext.avroVersion = '1.8.2' ext.jacksonVersion = '2.8.9' ext.hadoopVersion = '2.7.3' +ext.jCommanderVersion = '1.72' repositories { jcenter() @@ -27,6 +28,8 @@ dependencies { compile group: 'com.fasterxml.jackson.core' , name: 'jackson-databind', version: jacksonVersion compile group: 'com.fasterxml.jackson.dataformat' , name: 'jackson-dataformat-csv', version: jacksonVersion + compile group: 'com.beust', name: 'jcommander', version: jCommanderVersion + runtime group: 'org.apache.hadoop', name: 'hadoop-hdfs', version: hadoopVersion testCompile group: 'junit', name: 'junit', version: '4.12' diff --git a/src/main/java/org/radarcns/RestructureAvroRecords.java b/src/main/java/org/radarcns/RestructureAvroRecords.java index e1855ca6..2c4b6d95 100644 --- a/src/main/java/org/radarcns/RestructureAvroRecords.java +++ b/src/main/java/org/radarcns/RestructureAvroRecords.java @@ -16,6 +16,7 @@ package org.radarcns; +import com.beust.jcommander.JCommander; import com.fasterxml.jackson.databind.JsonMappingException; import org.apache.avro.Schema.Field; import org.apache.avro.file.DataFileReader; @@ -32,6 +33,7 @@ import org.radarcns.util.JsonAvroConverter; import org.radarcns.util.ProgressBar; import org.radarcns.util.RecordConverterFactory; +import org.radarcns.util.commandline.CommandLineArgs; import org.slf4j.Logger; import
org.slf4j.LoggerFactory; @@ -70,27 +72,39 @@ public class RestructureAvroRecords { private long processedFileCount; private long processedRecordsCount; - private static final boolean USE_GZIP = "gzip".equalsIgnoreCase(System.getProperty("org.radarcns.compression")); - - // Default set to false because causes loss of records from Biovotion data. https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16 - private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "false")); + private final boolean useGzip; + private final boolean doDeduplicate; public static void main(String [] args) throws Exception { - if (args.length != 3) { - System.out.println("Usage: hadoop jar restructurehdfs-all-0.2.jar "); - System.exit(1); + + final CommandLineArgs commandLineArgs = new CommandLineArgs(); + final JCommander parser = JCommander.newBuilder().addObject(commandLineArgs).build(); + + parser.setProgramName("hadoop jar restructurehdfs-all-0.3.3.jar"); + parser.parse(args); + + if(commandLineArgs.help) { + parser.usage(); + System.exit(0); } logger.info(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date())); logger.info("Starting..."); - logger.info("In: " + args[0] + args[1]); - logger.info("Out: " + args[2]); long time1 = System.currentTimeMillis(); - RestructureAvroRecords restr = new RestructureAvroRecords(args[0], args[2]); + RestructureAvroRecords restr = new RestructureAvroRecords.Builder(commandLineArgs.hdfsUri, + commandLineArgs.outputDirectory) + .useGzip("gzip".equalsIgnoreCase(commandLineArgs.compression)) + .doDeduplicate(commandLineArgs.deduplicate).format(commandLineArgs.format) + .build(); + try { - restr.start(args[1]); + for(String input : commandLineArgs.inputPaths) { + logger.info("In: " + commandLineArgs.hdfsUri + input); + logger.info("Out: " + commandLineArgs.outputDirectory); + restr.start(input); + } } catch (IOException ex) { logger.error("Processing failed", ex); } @@ -99,12 +113,16 
@@ public static void main(String [] args) throws Exception { logger.info("Time taken: {} seconds", (System.currentTimeMillis() - time1)/1000d); } - public RestructureAvroRecords(String inputPath, String outputPath) { - this.setInputWebHdfsURL(inputPath); - this.setOutputPath(outputPath); + private RestructureAvroRecords(RestructureAvroRecords.Builder builder) { + this.setInputWebHdfsURL(builder.hdfsUri); + this.setOutputPath(builder.outputPath); + + this.useGzip = builder.useGzip; + this.doDeduplicate = builder.doDeduplicate; + logger.info("Deduplicate set to {}", doDeduplicate); String extension; - if (System.getProperty("org.radarcns.format", "csv").equalsIgnoreCase("json")) { + if (builder.format.equalsIgnoreCase("json")) { logger.info("Writing output files in JSON format"); converterFactory = JsonAvroConverter.getFactory(); extension = "json"; @@ -113,7 +131,7 @@ public RestructureAvroRecords(String inputPath, String outputPath) { converterFactory = CsvAvroConverter.getFactory(); extension = "csv"; } - if (USE_GZIP) { + if (this.useGzip) { logger.info("Compressing output files in GZIP format"); extension += ".gz"; } @@ -179,7 +197,7 @@ public void start(String directoryName) throws IOException { // Actually process the files for (Map.Entry> entry : topicPaths.entrySet()) { - try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP, DO_DEDUPLICATE)) { + try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, useGzip, doDeduplicate)) { for (Path filePath : entry.getValue()) { // If JsonMappingException occurs, log the error and continue with other files try { @@ -347,4 +365,37 @@ public static Date getDate(GenericRecord keyField, GenericRecord valueField) { long time = (Long) keyField.get("start"); return new Date(time); } + + public static class Builder { + private boolean useGzip; + private boolean doDeduplicate; + private String hdfsUri; + private String outputPath; + private String format; + + public Builder(final String 
uri, final String outputPath) { + this.hdfsUri = uri; + this.outputPath = outputPath; + } + + public Builder useGzip(final boolean gzip) { + this.useGzip = gzip; + return this; + } + + public Builder doDeduplicate(final boolean dedup) { + this.doDeduplicate = dedup; + return this; + } + + public Builder format(final String format) { + this.format = format; + return this; + } + + public RestructureAvroRecords build() { + return new RestructureAvroRecords(this); + } + + } } diff --git a/src/main/java/org/radarcns/util/commandline/CommandLineArgs.java b/src/main/java/org/radarcns/util/commandline/CommandLineArgs.java new file mode 100644 index 00000000..63a89a5e --- /dev/null +++ b/src/main/java/org/radarcns/util/commandline/CommandLineArgs.java @@ -0,0 +1,31 @@ +package org.radarcns.util.commandline; + +import com.beust.jcommander.Parameter; + +import java.util.ArrayList; +import java.util.List; + +public class CommandLineArgs { + + @Parameter(description = "<input-path> [<input-path> ...]", variableArity = true, required = true) + public List<String> inputPaths = new ArrayList<>(); + + @Parameter(names = { "-f", "--format" }, description = "Format to use when converting the files. JSON and CSV are available.") + public String format = "csv"; + + @Parameter(names = { "-c", "--compression" }, description = "Compression to use when converting the files. Gzip is available.") + public String compression = "none"; + + // Default set to false because causes loss of records from Biovotion data. https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16 + @Parameter(names = { "-d", "--deduplicate" }, description = "Boolean to define whether to use deduplication or not.") + public boolean deduplicate; + + @Parameter(names = { "-u", "--hdfs-uri" }, description = "The HDFS uri to connect to. 
Eg - 'hdfs://<hostname>:<port>/'.", required = true, validateWith = { HdfsUriValidator.class, PathValidator.class }) + public String hdfsUri; + + @Parameter(names = { "-o", "--output-directory"}, description = "The output folder where the files are to be extracted.", required = true, validateWith = PathValidator.class) + public String outputDirectory; + + @Parameter(names = { "-h", "--help"}, help = true, description = "Display the usage of the program with available options.") + public boolean help; +} \ No newline at end of file diff --git a/src/main/java/org/radarcns/util/commandline/HdfsUriValidator.java b/src/main/java/org/radarcns/util/commandline/HdfsUriValidator.java new file mode 100644 index 00000000..3a625a5a --- /dev/null +++ b/src/main/java/org/radarcns/util/commandline/HdfsUriValidator.java @@ -0,0 +1,16 @@ +package org.radarcns.util.commandline; + + +import com.beust.jcommander.ParameterException; +import com.beust.jcommander.IParameterValidator; + +public class HdfsUriValidator implements IParameterValidator{ + @Override + public void validate(String name, String value) throws ParameterException { + if (! value.matches("((hdfs)|(webhdfs)):(/?/?)[^\\s]+")) { + throw new ParameterException("Parameter " + name + " should be a valid HDFS or WebHDFS URI. " + + "Eg - hdfs://<hostname>:<port>/. (found " + value + + "). 
Please run with --help or -h for more information."); + } +} diff --git a/src/main/java/org/radarcns/util/commandline/PathValidator.java b/src/main/java/org/radarcns/util/commandline/PathValidator.java new file mode 100644 index 00000000..d3d3935d --- /dev/null +++ b/src/main/java/org/radarcns/util/commandline/PathValidator.java @@ -0,0 +1,15 @@ +package org.radarcns.util.commandline; + +import com.beust.jcommander.ParameterException; +import com.beust.jcommander.IParameterValidator; + +public class PathValidator implements IParameterValidator{ + @Override + public void validate(String name, String value) throws ParameterException { + if (value == null || value.isEmpty()) { + throw new ParameterException("Parameter " + name + " should be supplied. " + "It cannot be empty or null. (found " + value + "). " + "Please run with --help or -h for more information."); + } + } +}