Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
build.xml still uses my.properties, which can be eliminated. README docs for running labs are needed. Processing code isn't yet parsing XML.
- Loading branch information
Ken Krugler
committed
Nov 14, 2011
0 parents
commit 5968995
Showing
23 changed files
with
3,452 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
.DS_Store | ||
.classpath | ||
.project | ||
.settings/ | ||
build | ||
target | ||
build-eclipse | ||
*.dot | ||
*.hprof | ||
junit*.properties | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
This project contains: | ||
|
||
- A stand-alone tool to convert the Wikipedia article dump (as XML) into multiple | ||
text files, each consisting of one <page>xxxx</page> record per line. This is | ||
then suitable for input to Hadoop. | ||
- A Hadoop-based workflow that processes the dump, extracts ngrams, and | ||
generates counts. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
name=wikipedia-ngrams | ||
|
||
job.name=wikipedia-ngrams-job.jar | ||
|
||
main.src.dir=src/main/java | ||
test.src.dir=src/test/java | ||
|
||
main.res.dir=src/main/resources | ||
test.res.dir=src/test/resources | ||
|
||
lib.dir=lib | ||
|
||
build.dir=build | ||
build.dir.main-classes=${build.dir}/classes-main | ||
build.dir.test-classes=${build.dir}/classes-test | ||
|
||
build.dir.main-classes-eclipse=${build.dir}/classes-main-eclipse | ||
build.dir.test-classes-eclipse=${build.dir}/classes-test-eclipse | ||
|
||
build.dir.test-reports=${build.dir}/test | ||
|
||
javac.debug=on | ||
javac.optimize=on | ||
javac.deprecation=off | ||
javac.version=1.6 | ||
javac.args= | ||
javac.args.warnings=-Xlint:none | ||
build.encoding=UTF-8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,219 @@ | ||
<!-- | ||
Copyright (c) 2011 Amazon, Inc. | ||
All rights reserved. | ||
--> | ||
|
||
<project name="wikipedia-ngrams" default="test"> | ||
|
||
<!-- Project-wide properties. root.dir aliases Ant's basedir; build.properties and then my.properties are loaded from it. -->
<!-- Ant properties are immutable, so for a duplicated key the first definition (command line, then build.properties) wins. -->
<property name="root.dir" value="${basedir}" /> | ||
<property file="${root.dir}/build.properties" /> | ||
<property file="${root.dir}/my.properties" /> | ||
|
||
<!-- Abort before any target runs unless hadoop.home is defined. -->
<!-- NOTE(review): no target in this build file reads hadoop.home; confirm whether this check and my.properties are still needed. -->
<fail unless="hadoop.home">The "hadoop.home" property must be set on the command line or in my.properties!</fail> | ||
|
||
<!-- ================================================================== --> | ||
<!-- General cleaning sources --> | ||
<!-- ================================================================== --> | ||
|
||
<!-- Deletes the contents of ${build.dir} while keeping the classes-*-eclipse output directories. -->
<target name="clean" description="--> clean the project"> | ||
<echo>cleaning ${ant.project.name}</echo> | ||
<!-- failonerror="false": a delete error is reported but does not fail the build. -->
<delete includeemptydirs="true" failonerror="false"> | ||
<fileset dir="${build.dir}" excludes="classes-*-eclipse/" /> | ||
</delete> | ||
</target> | ||
|
||
|
||
<!-- ================================================================== --> | ||
<!-- Maven --> | ||
<!-- ================================================================== --> | ||
|
||
<!-- Resolves the dependencies declared in pom.xml through the Maven Ant Tasks and publishes them as -->
<!-- the compile.classpath, test.classpath and runtime.classpath paths (with matching *.fileset filesets). -->
<!-- NOTE(review): "unless" tests an Ant property, but compile.classpath is created below as a path id; -->
<!-- confirm whether this guard can ever skip the target. -->
<target name="mvn-init" unless="compile.classpath" xmlns:artifact="urn:maven-artifact-ant"> | ||
<!-- The Maven Ant Tasks jar is loaded from ${lib.dir}. -->
<path id="maven.ant.tasks.classpath" path="${lib.dir}/maven-ant-tasks-2.0.10.jar" /> | ||
|
||
<typedef resource="org/apache/maven/artifact/ant/antlib.xml" uri="urn:maven-artifact-ant" | ||
classpathref="maven.ant.tasks.classpath"/> | ||
|
||
<!-- Fall back to ${user.home}/.m2/repository when maven.repo.local has not been set by the caller. -->
<condition property="maven.repo.local" value="${maven.repo.local}" else="${user.home}/.m2/repository"> | ||
<isset property="maven.repo.local"/> | ||
</condition> | ||
<artifact:localRepository id="local.repository" path="${maven.repo.local}"/> | ||
<artifact:pom file="pom.xml" id="maven.project"/> | ||
<!-- One resolution per Maven scope: compile, test, runtime. -->
<artifact:dependencies pathId="compile.classpath" filesetId="compile.fileset" useScope="compile"> | ||
<pom refid="maven.project"/> | ||
<localRepository refid="local.repository"/> | ||
</artifact:dependencies> | ||
<artifact:dependencies pathId="test.classpath" filesetId="test.fileset" useScope="test"> | ||
<pom refid="maven.project"/> | ||
<localRepository refid="local.repository"/> | ||
</artifact:dependencies> | ||
<artifact:dependencies pathId="runtime.classpath" filesetId="runtime.fileset" useScope="runtime"> | ||
<pom refid="maven.project"/> | ||
<localRepository refid="local.repository"/> | ||
</artifact:dependencies> | ||
</target> | ||
|
||
<!-- ================================================================== --> | ||
<!-- Build sources --> | ||
<!-- ================================================================== --> | ||
|
||
<!-- Compiles every .java file under ${main.src.dir} into ${build.dir.main-classes}, -->
<!-- using the compile.classpath produced by mvn-init. -->
<target name="compile" | ||
depends="mvn-init" | ||
description="--> compile main classes"> | ||
<mkdir dir="${build.dir.main-classes}" /> | ||
<!-- Encoding, debug, optimize, source/target level and deprecation come from the loaded property files. -->
<javac encoding="${build.encoding}" | ||
srcdir="${main.src.dir}" | ||
includes="**/*.java" | ||
destdir="${build.dir.main-classes}" | ||
debug="${javac.debug}" | ||
optimize="${javac.optimize}" | ||
target="${javac.version}" | ||
source="${javac.version}" | ||
deprecation="${javac.deprecation}"> | ||
<!-- Extra compiler flags: free-form javac.args followed by the warning switches. -->
<compilerarg line="${javac.args} ${javac.args.warnings}" /> | ||
<classpath refid="compile.classpath" /> | ||
</javac> | ||
</target> | ||
|
||
|
||
<!-- ================================================================== --> | ||
<!-- Unit Tests --> | ||
<!-- ================================================================== --> | ||
|
||
<!-- Compiles every .java file under ${test.src.dir} into ${build.dir.test-classes}. -->
<!-- The classpath is the Maven test.classpath plus the freshly compiled main classes. -->
<target name="compile-test" depends="compile"> | ||
<echo>*** Building Unit Tests Sources ***</echo> | ||
<mkdir dir="${build.dir.test-classes}" /> | ||
<!-- test.path exposes the main class output directory to the test compiler. -->
<path id="test.path"> | ||
<pathelement location="${build.dir.main-classes}" /> | ||
</path> | ||
|
||
<javac encoding="${build.encoding}" | ||
srcdir="${test.src.dir}" | ||
includes="**/*.java" | ||
destdir="${build.dir.test-classes}" | ||
debug="${javac.debug}" | ||
optimize="${javac.optimize}" | ||
target="${javac.version}" | ||
source="${javac.version}" | ||
deprecation="${javac.deprecation}"> | ||
<compilerarg line="${javac.args} ${javac.args.warnings}" /> | ||
<classpath refid="test.classpath" /> | ||
<classpath refid="test.path" /> | ||
</javac> | ||
</target> | ||
|
||
<!-- Runs the JUnit tests in a forked JVM and fails the build if any test errors or fails. -->
<!-- With -Dtestcase=Name only **/Name.java is run; otherwise every *Test.java except Abstract*.java is run. -->
<target name="test" depends="compile-test" description="--> run unit tests"> | ||
<!-- Start from an empty report directory so results from an earlier run are not mixed in. -->
<delete dir="${build.dir.test-reports}" /> | ||
<mkdir dir="${build.dir.test-reports}" /> | ||
|
||
<!-- haltonfailure="no" lets the remaining tests run; errors and failures both set tests.failed, -->
<!-- which is checked once junit has finished. -->
<junit showoutput="false" | ||
printsummary="yes" | ||
haltonfailure="no" | ||
fork="yes" | ||
maxmemory="256m" | ||
dir="${basedir}" | ||
errorProperty="tests.failed" | ||
failureProperty="tests.failed"> | ||
<classpath> | ||
<pathelement location="${build.dir.main-classes}" /> | ||
<pathelement location="${build.dir.test-classes}" /> | ||
<pathelement location="${test.res.dir}" /> | ||
<pathelement location="${main.res.dir}" /> | ||
<path refid="test.classpath" /> | ||
</classpath> | ||
<formatter type="plain" /> | ||
<!-- Default run, used only when testcase is not set. The batchtest-level guard already covers -->
<!-- the patterns, so they need no conditions of their own. -->
<batchtest fork="yes" todir="${build.dir.test-reports}" unless="testcase"> | ||
<fileset dir="${test.src.dir}"> | ||
<include name="**/*Test.java" /> | ||
<exclude name="**/Abstract*.java" /> | ||
</fileset> | ||
</batchtest> | ||
<!-- Single-test run, used only when testcase is set. -->
<batchtest fork="yes" todir="${build.dir.test-reports}" if="testcase"> | ||
<fileset dir="${test.src.dir}" includes="**/${testcase}.java" /> | ||
</batchtest> | ||
</junit> | ||
<fail if="tests.failed">Tests failed!</fail> | ||
</target> | ||
|
||
|
||
<!-- ================================================================== --> | ||
<!-- Hadoop job jar --> | ||
<!-- ================================================================== --> | ||
|
||
<!-- Builds ${build.dir}/${job.name}: the compiled main classes, the main resources and the -->
<!-- runtime dependency jars under lib/, with a Main-Class manifest entry. -->
<target name="job" | ||
depends="compile" | ||
description="--> create a Hadoop ready jar with all dependencies"> | ||
|
||
<!-- Make sure lib/ dir starts out empty, so we don't get multiple | ||
copies of jars with slightly different versions | ||
--> | ||
<delete dir="${build.dir}/lib" /> | ||
<mkdir dir="${build.dir}/lib" /> | ||
|
||
<!-- flatten="true" copies every runtime jar directly into lib/, dropping its source directory structure. -->
<copy todir="${build.dir}/lib" flatten="true"> | ||
<path refid="runtime.classpath" /> | ||
</copy> | ||
|
||
<jar destfile="${build.dir}/${job.name}" compress="true"> | ||
<fileset dir="${build.dir.main-classes}" /> | ||
<fileset dir="${main.res.dir}" /> | ||
<!-- Rooted at ${build.dir} so the jars are stored under a lib/ prefix inside the archive. -->
<fileset dir="${build.dir}" includes="lib/" /> | ||
|
||
<manifest> | ||
<attribute name="Main-Class" value="com.amazon.aws.training.emr.wikipedia.NgramJob"/> | ||
</manifest> | ||
|
||
</jar> | ||
</target> | ||
|
||
|
||
<!-- ================================================================== --> | ||
<!-- Generating eclipse file --> | ||
<!-- ================================================================== --> | ||
|
||
<!-- Regenerates the Eclipse project files: removes the old ones (clean-eclipse), runs the eclipse task, -->
<!-- then appends the formatter settings to the JDT preferences file. -->
<target name="eclipse" | ||
depends="mvn-init, clean-eclipse" | ||
description="--> create the Eclipse project files"> | ||
|
||
<!-- NOTE(review): EclipseTask is loaded from compile.classpath, so it presumably has to be a -->
<!-- compile-scope dependency in pom.xml; confirm. -->
<taskdef name="eclipse" | ||
classname="prantl.ant.eclipse.EclipseTask" | ||
classpathref="compile.classpath" /> | ||
<mkdir dir="${build.dir.main-classes-eclipse}" /> | ||
<mkdir dir="${build.dir.test-classes-eclipse}" /> | ||
<eclipse> | ||
<settings> | ||
<jdtcore compilercompliance="6.0" /> | ||
<resources encoding="UTF-8" /> | ||
</settings> | ||
<project name="${ant.project.name}" /> | ||
<classpath> | ||
<container path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6" /> | ||
|
||
<!-- Main and test source folders write to separate Eclipse-only output directories. -->
<source path="${basedir}/src/main/java" | ||
output="${build.dir.main-classes-eclipse}" /> | ||
<source path="${basedir}/src/main/resources" | ||
output="${build.dir.main-classes-eclipse}" /> | ||
<source path="${basedir}/src/test/java" | ||
output="${build.dir.test-classes-eclipse}" /> | ||
<source path="${basedir}/src/test/resources" | ||
output="${build.dir.test-classes-eclipse}" /> | ||
|
||
<output path="${build.dir.main-classes-eclipse}" /> | ||
<library pathref="test.classpath" exported="false" /> | ||
</classpath> | ||
</eclipse> | ||
<!-- append="true": doc/eclipse-formatter.properties is added to the end of the existing prefs file. -->
<concat destfile="${root.dir}/.settings/org.eclipse.jdt.core.prefs" append="true"> | ||
<filelist dir="${root.dir}/doc/" files="eclipse-formatter.properties" /> | ||
</concat> | ||
</target> | ||
|
||
<!-- Removes the Eclipse metadata (.classpath, .eclipse, .project and the .settings directory). -->
<!-- The paths are relative, so Ant resolves them against the project basedir. -->
<target name="clean-eclipse" description="--> clean the Eclipse project files"> | ||
<delete file=".classpath" /> | ||
<delete file=".eclipse" /> | ||
<delete file=".project" /> | ||
<delete dir=".settings" /> | ||
</target> | ||
|
||
|
||
</project> |
Oops, something went wrong.