Permalink
Browse files

merge

  • Loading branch information...
2 parents 82b889c + 94b2ec0 commit a717bbf2fc28f26e84f8374f6ffce4d30d8c618c @lukasnalezenec lukasnalezenec committed Dec 26, 2013
Showing with 2,232 additions and 1 deletion.
  1. +2 −1 .gitignore
  2. +4 −0 parquet-protobuf/README.md
  3. +157 −0 parquet-protobuf/pom.xml
  4. +34 −0 parquet-protobuf/src/main/java/parquet/proto/ProtoParquetInputFormat.java
  5. +54 −0 parquet-protobuf/src/main/java/parquet/proto/ProtoParquetOutputFormat.java
  6. +40 −0 parquet-protobuf/src/main/java/parquet/proto/ProtoParquetReader.java
  7. +78 −0 parquet-protobuf/src/main/java/parquet/proto/ProtoParquetWriter.java
  8. +66 −0 parquet-protobuf/src/main/java/parquet/proto/ProtoReadSupport.java
  9. +41 −0 parquet-protobuf/src/main/java/parquet/proto/ProtoRecordMaterializer.java
  10. +136 −0 parquet-protobuf/src/main/java/parquet/proto/ProtoSchemaConverter.java
  11. +193 −0 parquet-protobuf/src/main/java/parquet/proto/ProtoWriteSupport.java
  12. +72 −0 parquet-protobuf/src/main/java/parquet/proto/ProtobufferRecordConverter.java
  13. +26 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ParentValueContainer.java
  14. +44 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtoArrayConverter.java
  15. +36 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtoBinaryConverter.java
  16. +34 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtoBooleanConverter.java
  17. +33 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtoDoubleConverter.java
  18. +69 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtoEnumConverter.java
  19. +33 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtoFloatConverter.java
  20. +33 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtoIntConverter.java
  21. +33 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtoLongConverter.java
  22. +183 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtoMessageConverter.java
  23. +37 −0 parquet-protobuf/src/main/java/parquet/proto/converters/ProtobufStringConverter.java
  24. +102 −0 parquet-protobuf/src/test/java/parquet/proto/ProtoInputOutputFormatTest.java
  25. +99 −0 parquet-protobuf/src/test/java/parquet/proto/ProtoSchemaConverterTest.java
  26. +134 −0 parquet-protobuf/src/test/java/parquet/proto/ProtobufferRecordConverterTest.java
  27. +174 −0 parquet-protobuf/src/test/java/parquet/proto/TestUtils.java
  28. +78 −0 parquet-protobuf/src/test/java/parquet/proto/utils/ReadUsingMR.java
  29. +113 −0 parquet-protobuf/src/test/java/parquet/proto/utils/WriteUsingMR.java
  30. +94 −0 parquet-protobuf/src/test/resources/TestProtobuf.proto
View
@@ -13,4 +13,5 @@ target
*.orig
*.rej
dependency-reduced-pom.xml
-.idea/*
+.idea/*
+target/
@@ -0,0 +1,4 @@
+parquet-protobuf
+================
+
+Protocol Buffers (protobuf) support for the Parquet columnar format
@@ -0,0 +1,157 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <groupId>com.twitter</groupId>
+ <artifactId>parquet</artifactId>
+ <relativePath>../pom.xml</relativePath>
+ <version>1.2.10-SNAPSHOT</version>
+ </parent>
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <artifactId>parquet-protobuf</artifactId>
+ <packaging>jar</packaging>
+
+ <properties>
+ <elephant-bird.version>3.0.8</elephant-bird.version>
+ </properties>
+
+
+ <name>Parquet Protobuf</name>
+ <url>https://github.com/lukasnalezenec/parquet-protobuf.git</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>com.google.protobuf</groupId>
+ <artifactId>protobuf-java</artifactId>
+ <version>2.4.1</version>
+ </dependency>
+ <dependency>
+ <groupId>com.twitter</groupId>
+ <artifactId>parquet-hadoop</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.twitter.elephantbird</groupId>
+ <artifactId>elephant-bird-core</artifactId>
+ <version>${elephant-bird.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-core</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+
+ </dependencies>
+
+ <developers>
+ <developer>
+ <id>lukasnalezenec</id>
+ <name>Lukas Nalezenec</name>
+ </developer>
+ </developers>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <version>${maven-jar-plugin.version}</version>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <git-SHA-1>${buildNumber}</git-SHA-1>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-shade-plugin</artifactId>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <artifactSet>
+ <includes>
+ <include>org.codehaus.jackson:jackson-mapper-asl</include>
+ <include>org.codehaus.jackson:jackson-core-asl</include>
+ </includes>
+ </artifactSet>
+ <relocations>
+ <relocation>
+ <pattern>org.codehaus.jackson</pattern>
+ <shadedPattern>parquet.org.codehaus.jackson</shadedPattern>
+ </relocation>
+ </relocations>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <!-- Ensure that the specific classes are available during test compile but not included in jar -->
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId>
+ <version>1.8</version>
+ <executions>
+ <execution>
+ <id>add-test-sources</id>
+ <phase>generate-test-sources</phase>
+ <goals>
+ <goal>add-test-source</goal>
+ </goals>
+ <configuration>
+ <sources>
+ <source>${project.build.directory}/generated-test-sources</source>
+ </sources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- Generates test Java sources from TestProtobuf.proto by invoking the local protoc binary
+      before test compilation. Requires protoc on the PATH; the build fails if it is missing
+      (failonerror="true"). -->
+ <plugin>
+ <artifactId>maven-antrun-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>generate-sources</id>
+ <phase>generate-test-sources</phase>
+ <configuration>
+ <tasks>
+ <mkdir dir="${project.build.directory}/generated-test-sources"/>
+ <mkdir dir="${project.build.directory}/generated-test-sources/java"/>
+ <exec failonerror="true" executable="protoc">
+ <arg value="--java_out=${project.build.directory}/generated-test-sources/java"/>
+ <arg value="src/test/resources/TestProtobuf.proto"/>
+ <arg value="-I."/>
+ </exec>
+ </tasks>
+ <!-- NOTE(review): these <sourceRoot> entries reference main sources and generated-sources,
+      but this execution emits *test* sources into generated-test-sources (registered by the
+      build-helper plugin above). Confirm whether these two lines are needed at all. -->
+ <sourceRoot>src/main/java</sourceRoot>
+ <sourceRoot>target/generated-sources/java</sourceRoot>
+ </configuration>
+ <goals>
+ <goal>run</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ </plugins>
+ </build>
+
+</project>
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2013 Lukas Nalezenec
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package parquet.proto;
+
+import com.google.protobuf.MessageOrBuilder;
+import org.apache.hadoop.conf.Configuration;
+import parquet.hadoop.ParquetInputFormat;
+
+/**
+ * A Hadoop {@link org.apache.hadoop.mapreduce.InputFormat} for Parquet files.
+ */
+public class ProtoParquetInputFormat<T extends MessageOrBuilder> extends ParquetInputFormat<T> {
+ public ProtoParquetInputFormat() {
+ super(ProtoReadSupport.class);
+ }
+
+ public static void setRequestedProjection(Configuration configuration, String requestedProjection) {
+ ProtoReadSupport.setRequestedProjection(configuration, requestedProjection);
+ }
+
+}
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2013 Lukas Nalezenec
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package parquet.proto;
+
+import com.google.protobuf.Message;
+import com.google.protobuf.MessageOrBuilder;
+import org.apache.hadoop.mapreduce.Job;
+import parquet.hadoop.ParquetOutputFormat;
+import parquet.hadoop.util.ContextUtil;
+
+/**
+ * A Hadoop {@link org.apache.hadoop.mapreduce.OutputFormat} for Protobuffer Parquet files.
+ * <p/>
+ * Usage:
+ * <p/>
+ * <pre>
+ * {@code
+ * final Job job = new Job(conf, "Parquet writing job");
+ * job.setOutputFormatClass(ProtoParquetOutputFormat.class);
+ * ProtoParquetOutputFormat.setOutputPath(job, parquetPath);
+ * ProtoParquetOutputFormat.setProtobufferClass(job, YourProtobuffer.class);
+ * }
+ * </pre>
+ *
+ * @author Lukas Nalezenec
+ */
+public class ProtoParquetOutputFormat<T extends MessageOrBuilder> extends ParquetOutputFormat<T> {
+
+ public static void setProtobufferClass(Job job, Class<? extends Message> protoClass) {
+ ProtoWriteSupport.setSchema(ContextUtil.getConfiguration(job), protoClass);
+ }
+
+ public ProtoParquetOutputFormat(Class<? extends Message> msg) {
+ super(new ProtoWriteSupport(msg));
+ }
+
+ public ProtoParquetOutputFormat() {
+ super(new ProtoWriteSupport());
+ }
+
+}
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2012 Twitter, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package parquet.proto;
+
+import com.google.protobuf.MessageOrBuilder;
+import org.apache.hadoop.fs.Path;
+import parquet.filter.UnboundRecordFilter;
+import parquet.hadoop.ParquetReader;
+import parquet.hadoop.api.ReadSupport;
+
+import java.io.IOException;
+
+/**
+ * Read Avro records from a Parquet file.
+ */
+public class ProtoParquetReader<T extends MessageOrBuilder> extends ParquetReader<T> {
+
+ public ProtoParquetReader(Path file) throws IOException {
+ super(file, (ReadSupport<T>) new ProtoReadSupport());
+ }
+
+ public ProtoParquetReader(Path file, UnboundRecordFilter recordFilter) throws IOException {
+ super(file, (ReadSupport<T>) new ProtoReadSupport(), recordFilter);
+ }
+
+ //TODO here should be option to override pb from file
+}
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2012 Twitter, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package parquet.proto;
+
+import com.google.protobuf.Message;
+import com.google.protobuf.MessageOrBuilder;
+import org.apache.hadoop.fs.Path;
+import parquet.hadoop.ParquetWriter;
+import parquet.hadoop.api.WriteSupport;
+import parquet.hadoop.metadata.CompressionCodecName;
+
+import java.io.IOException;
+
+/**
+ * Write Protobuffer records to a Parquet file.
+ */
+public class ProtoParquetWriter<T extends MessageOrBuilder> extends ParquetWriter<T> {
+
+ /**
+ * Create a new {@link ProtoParquetWriter}.
+ *
+ * @param file
+ * @param compressionCodecName
+ * @param blockSize
+ * @param pageSize
+ * @throws IOException
+ */
+ public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage,
+ CompressionCodecName compressionCodecName, int blockSize,
+ int pageSize) throws IOException {
+ super(file, (WriteSupport<T>) new ProtoWriteSupport(protoMessage),
+ compressionCodecName, blockSize, pageSize);
+ }
+
+ /**
+ * Create a new {@link ProtoParquetWriter}.
+ *
+ * @param file The file name to write to.
+ * @param compressionCodecName Compression code to use, or CompressionCodecName.UNCOMPRESSED
+ * @param blockSize HDFS block size
+ * @param pageSize See parquet write up. Blocks are subdivided into pages for alignment and other purposes.
+ * @param enableDictionary Whether to use a dictionary to compress columns.
+ * @throws IOException
+ */
+ public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage,
+ CompressionCodecName compressionCodecName, int blockSize,
+ int pageSize, boolean enableDictionary) throws IOException {
+ super(file, (WriteSupport<T>)
+ new ProtoWriteSupport(protoMessage),
+ compressionCodecName, blockSize, pageSize, enableDictionary, false);
+ }
+
+ /**
+ * Create a new {@link ProtoParquetWriter}. The default block size is 50 MB.The default
+ * page size is 1 MB. Default compression is no compression. (Inherited from {@link ParquetWriter})
+ *
+ * @param file The file name to write to.
+ * @throws IOException
+ */
+ public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage) throws IOException {
+ this(file, protoMessage, CompressionCodecName.UNCOMPRESSED,
+ DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
+ }
+
+}
Oops, something went wrong. Retry.

0 comments on commit a717bbf

Please sign in to comment.