Skip to content

Commit

Permalink
Initial commit of code.
Browse files Browse the repository at this point in the history
build.xml still uses my.properties, which can be
eliminated.

README docs for running labs are needed.

Processing code isn't yet parsing XML.
  • Loading branch information
Ken Krugler committed Nov 14, 2011
0 parents commit 5968995
Show file tree
Hide file tree
Showing 23 changed files with 3,452 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .gitignore
@@ -0,0 +1,11 @@
.DS_Store
.classpath
.project
.settings/
build
target
build-eclipse
*.dot
*.hprof
junit*.properties

8 changes: 8 additions & 0 deletions README
@@ -0,0 +1,8 @@
This project contains

- A stand-alone tool to convert the Wikipedia article dump (as XML) into multiple
text files, each consisting of one <page>xxxx</page> record per line. This is
then suitable for input to Hadoop.
- A Hadoop-based workflow that processes the dump, extracts ngrams, and
generates counts.

28 changes: 28 additions & 0 deletions build.properties
@@ -0,0 +1,28 @@
name=wikipedia-ngrams

job.name=wikipedia-ngrams-job.jar

main.src.dir=src/main/java
test.src.dir=src/test/java

main.res.dir=src/main/resources
test.res.dir=src/test/resources

lib.dir=lib

build.dir=build
build.dir.main-classes=${build.dir}/classes-main
build.dir.test-classes=${build.dir}/classes-test

build.dir.main-classes-eclipse=${build.dir}/classes-main-eclipse
build.dir.test-classes-eclipse=${build.dir}/classes-test-eclipse

build.dir.test-reports=${build.dir}/test

javac.debug=on
javac.optimize=on
javac.deprecation=off
javac.version=1.6
javac.args=
javac.args.warnings=-Xlint:none
build.encoding=UTF-8
219 changes: 219 additions & 0 deletions build.xml
@@ -0,0 +1,219 @@
<!--
Copyright (c) 2011 Amazon, Inc.
All rights reserved.
-->

<project name="wikipedia-ngrams" default="test">

<property name="root.dir" value="${basedir}" />
<property file="${root.dir}/build.properties" />
<property file="${root.dir}/my.properties" />

<fail unless="hadoop.home">The "hadoop.home" property must be set on the command line or in my.properties!</fail>

<!-- ================================================================== -->
<!-- General cleaning sources -->
<!-- ================================================================== -->

<target name="clean" description="--> clean the project">
<echo>cleaning ${ant.project.name}</echo>
<delete includeemptydirs="true" failonerror="false">
<fileset dir="${build.dir}" excludes="classes-*-eclipse/" />
</delete>
</target>


<!-- ================================================================== -->
<!-- Maven -->
<!-- ================================================================== -->

<target name="mvn-init" unless="compile.classpath" xmlns:artifact="urn:maven-artifact-ant">
<path id="maven.ant.tasks.classpath" path="${lib.dir}/maven-ant-tasks-2.0.10.jar" />

<typedef resource="org/apache/maven/artifact/ant/antlib.xml" uri="urn:maven-artifact-ant"
classpathref="maven.ant.tasks.classpath"/>

<condition property="maven.repo.local" value="${maven.repo.local}" else="${user.home}/.m2/repository">
<isset property="maven.repo.local"/>
</condition>
<artifact:localRepository id="local.repository" path="${maven.repo.local}"/>
<artifact:pom file="pom.xml" id="maven.project"/>
<artifact:dependencies pathId="compile.classpath" filesetId="compile.fileset" useScope="compile">
<pom refid="maven.project"/>
<localRepository refid="local.repository"/>
</artifact:dependencies>
<artifact:dependencies pathId="test.classpath" filesetId="test.fileset" useScope="test">
<pom refid="maven.project"/>
<localRepository refid="local.repository"/>
</artifact:dependencies>
<artifact:dependencies pathId="runtime.classpath" filesetId="runtime.fileset" useScope="runtime">
<pom refid="maven.project"/>
<localRepository refid="local.repository"/>
</artifact:dependencies>
</target>

<!-- ================================================================== -->
<!-- Build sources -->
<!-- ================================================================== -->

<target name="compile"
depends="mvn-init"
description="--> compile main classes">
<mkdir dir="${build.dir.main-classes}" />
<javac encoding="${build.encoding}"
srcdir="${main.src.dir}"
includes="**/*.java"
destdir="${build.dir.main-classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
target="${javac.version}"
source="${javac.version}"
deprecation="${javac.deprecation}">
<compilerarg line="${javac.args} ${javac.args.warnings}" />
<classpath refid="compile.classpath" />
</javac>
</target>


<!-- ================================================================== -->
<!-- Unit Tests -->
<!-- ================================================================== -->

<target name="compile-test" depends="compile">
<echo>*** Building Unit Tests Sources ***</echo>
<mkdir dir="${build.dir.test-classes}" />
<path id="test.path">
<pathelement location="${build.dir.main-classes}" />
</path>

<javac encoding="${build.encoding}"
srcdir="${test.src.dir}"
includes="**/*.java"
destdir="${build.dir.test-classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
target="${javac.version}"
source="${javac.version}"
deprecation="${javac.deprecation}">
<compilerarg line="${javac.args} ${javac.args.warnings}" />
<classpath refid="test.classpath" />
<classpath refid="test.path" />
</javac>
</target>

<target name="test" depends="compile-test" description="--> run unit tests">
<delete dir="${build.dir.test-reports}" />
<mkdir dir="${build.dir.test-reports}" />

<junit showoutput="false"
printsummary="yes"
haltonfailure="no"
fork="yes"
maxmemory="256m"
dir="${basedir}"
errorProperty="tests.failed"
failureProperty="tests.failed">
<classpath>
<pathelement location="${build.dir.main-classes}" />
<pathelement location="${build.dir.test-classes}" />
<pathelement location="${test.res.dir}" />
<pathelement location="${main.res.dir}" />
<path refid="test.classpath" />
</classpath>
<formatter type="plain" />
<batchtest fork="yes" todir="${build.dir.test-reports}" unless="testcase">
<fileset dir="${test.src.dir}">
<include name="**/*Test.java" unless="testcase" />
<exclude name="**/Abstract*.java" unless="testcase" />
<include name="${testcase}" if="testcase" />
</fileset>
</batchtest>
<batchtest fork="yes" todir="${build.dir.test-reports}" if="testcase">
<fileset dir="${test.src.dir}" includes="**/${testcase}.java" />
</batchtest>
</junit>
<fail if="tests.failed">Tests failed!</fail>
</target>


<!-- ================================================================== -->
<!-- Hadoop job jar -->
<!-- ================================================================== -->

<target name="job"
depends="compile"
description="--> create a Hadoop ready jar with all dependencies">

<!-- Make sure lib/ dir starts out empty, so we don't get multiple
copies of jars with slightly different versions
-->
<delete dir="${build.dir}/lib" />
<mkdir dir="${build.dir}/lib" />

<copy todir="${build.dir}/lib" flatten="true">
<path refid="runtime.classpath" />
</copy>

<jar destfile="${build.dir}/${job.name}" compress="true">
<fileset dir="${build.dir.main-classes}" />
<fileset dir="${main.res.dir}" />
<fileset dir="${build.dir}" includes="lib/" />

<manifest>
<attribute name="Main-Class" value="com.amazon.aws.training.emr.wikipedia.NgramJob"/>
</manifest>

</jar>
</target>


<!-- ================================================================== -->
<!-- Generating eclipse file -->
<!-- ================================================================== -->

<target name="eclipse"
depends="mvn-init, clean-eclipse"
description="--> create the Eclipse project files">

<taskdef name="eclipse"
classname="prantl.ant.eclipse.EclipseTask"
classpathref="compile.classpath" />
<mkdir dir="${build.dir.main-classes-eclipse}" />
<mkdir dir="${build.dir.test-classes-eclipse}" />
<eclipse>
<settings>
<jdtcore compilercompliance="6.0" />
<resources encoding="UTF-8" />
</settings>
<project name="${ant.project.name}" />
<classpath>
<container path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6" />

<source path="${basedir}/src/main/java"
output="${build.dir.main-classes-eclipse}" />
<source path="${basedir}/src/main/resources"
output="${build.dir.main-classes-eclipse}" />
<source path="${basedir}/src/test/java"
output="${build.dir.test-classes-eclipse}" />
<source path="${basedir}/src/test/resources"
output="${build.dir.test-classes-eclipse}" />

<output path="${build.dir.main-classes-eclipse}" />
<library pathref="test.classpath" exported="false" />
</classpath>
</eclipse>
<concat destfile="${root.dir}/.settings/org.eclipse.jdt.core.prefs" append="true">
<filelist dir="${root.dir}/doc/" files="eclipse-formatter.properties" />
</concat>
</target>

<target name="clean-eclipse" description="--> clean the Eclipse project files">
<delete file=".classpath" />
<delete file=".eclipse" />
<delete file=".project" />
<delete dir=".settings" />
</target>


</project>

0 comments on commit 5968995

Please sign in to comment.