Merge pull request #136 from VertNet/develop

Merge the active develop branch back into master
VertNet · Sep 16, 2016 · 0122992 · 0122992
2 parents d93b367 + cf1813c
commit 0122992
Show file tree

Hide file tree

Showing 116 changed files with 40,744 additions and 169 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,19 @@
+vertnet.pem
 pom.xml
 *jar
 /lib/
 /classes/
 .lein-deps-sum
 .lein-plugins
 rm-dwca-reader-clj-jars.sh
+creds.json
+s3.json
+aws.json
+target/
+#*.*
+*.*~
+*sublime*
+\#*.*\#
+.lein*
+.nrepl*
+.DS_Store
diff --git a/README.md b/README.md
@@ -1,4 +1,54 @@
-gulo
-====
+# What is Gulo?
 
-Shredding Darwin Core Archives with ferocity, strength, and Cascalog.
+![](http://3.bp.blogspot.com/-s1vAPdg_zZM/TZ3bnzUZgVI/AAAAAAAACKo/Mk-Tu-Nil74/s1600/animalangry.jpg)
+
+Gulo is the genus for wolverine, the biggest land-dwelling species of weasel on the planet. It is a stocky and muscular carnivore, resembling a small bear. The wolverine has a reputation for endurance, ferocity, and strength out of proportion to its size, with the capacity to battle with competitors many times its size.
+
+Gulo is also a VertNet project designed for harvesting Darwin Core Archives, shredding them into small pieces, and loading them into [CartoDB](http://cartodb.com). It's written in the [Clojure](http://clojure.org) programming language and rides on [Cascading](http://www.cascading.org) and [Cascalog](https://github.com/nathanmarz/cascalog) for processing "Big Data" on top of [Hadoop](http://hadoop.apache.org) using [MapReduce](http://research.google.com/archive/mapreduce.html).
+
+# Developing
+## AWS credentials
+
+Running Gulo queries with Elastic MapReduce requires adding the following to the file `credentials.json` in the project root:
+
+```json
+{
+   "access-id": "your_aws_access_id",
+   "private-key":"your_aws_private_key",
+   "key-pair-file":"~/.ssh/vertnet.pem",
+   "key-pair":"vertnet"
+}
+```
+
+Working with the `gulo.cdb` namespace requires this to be stored in `resources/aws.json`:
+
+```json
+{
+    "access-id": "your_aws_access_id",
+    "secret-key": "your_aws_private_key"
+}
+```
+
+## CartoDB OAuth credentials
+
+Gulo depends on an authenticated connection to CartoDB. This requires storing the following in the file `resources/creds.json`:
+
+```json
+{
+  "key": "your_cartodb_oauth_key",
+  "secret": "your_cartodb_oauth_secret",
+  "user": "your_cartodb_username",
+  "password": "your_cartodb_password"
+}
+```
+
+## Dependencies
+
+To add a byte order mark (BOM) to UTF-8 files so that CartoDB can detect their encoding, we use the `uconv` program, which can be installed on Ubuntu like this:
+
+```bash
+$ sudo apt-get install apt-file
+$ sudo apt-file update
+$ apt-file search bin/uconv
+$ sudo apt-get install libicu-dev
+```
diff --git a/dev/bootstrap.sh b/dev/bootstrap.sh
@@ -0,0 +1,30 @@
+# configure EMR cluster for use with VertNet projects
+# put on S3 at s3://vnproject/bootstrap-actions/gulo/bootstrap.sh
+
+# install some helpful utilities
+sudo apt-get update
+sudo apt-get install -y  screen s3cmd zip unzip
+
+# Setup for git
+git config --global user.name "Whizbang Systems"
+git config --global user.email "admin@whizbangsystems.net"
+
+# generate ssh key
+ssh-keygen -t rsa -N "" -f /home/hadoop/.ssh/id_rsa -C "admin@whizbangsystems.net"
+sudo chmod 644 /home/hadoop/.ssh/id_rsa
+
+# Add github to known_hosts
+echo "github.com,207.97.227.239 ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==" >> /home/hadoop/.ssh/known_hosts
+
+
+# simple leiningen install via 'li'
+echo "alias li='cd /home/hadoop/bin; wget https://raw.github.com/technomancy/leiningen/stable/bin/lein; chmod u+x lein; ./lein; cd /home/hadoop;'" >> /home/hadoop/.bashrc
+
+# simple uberjarring
+echo "alias uj='lein do deps, compile :all, uberjar'" >> /home/hadoop/.bashrc
+
+# simple installs & configs
+echo "alias gulo='git clone git://github.com/VertNet/gulo.git'" >> /home/hadoop/.bashrc
+echo "alias teratorn='git clone git://github.com/MapofLife/teratorn.git'" >> /home/hadoop/.bashrc
+
+echo "alias dl='wget https://gist.github.com/robinkraft/5666682/download'" >> /home/hadoop/.bashrc
diff --git a/dev/ec2-bootstrap.sh b/dev/ec2-bootstrap.sh
@@ -0,0 +1,111 @@
+# Run this script to configure an instance for harvesting and bulkloading.
+
+# install a few things; s3cmd 1.0.0 is downloaded as a source tarball and installed with setup.py
+
+sudo apt-get update
+sudo apt-get -y install screen zip unzip git sqlite3
+wget http://s3tools.org/repo/deb-all/stable/s3cmd_1.0.0.orig.tar.gz
+tar -xvf s3cmd_1.0.0.orig.tar.gz
+cd s3cmd-1.0.0
+sudo python setup.py install
+cd
+
+# Setup for git
+git config --global user.name "David Bloom"
+git config --global user.email "dbloom@vertnet.org"
+
+# generate ssh key
+ssh-keygen -t rsa -N "" -f /home/$USER/.ssh/id_rsa -C "dbloom@vertnet.org"
+sudo chmod 644 /home/$USER/.ssh/id_rsa
+
+# Add github to known_hosts
+echo "github.com,207.97.227.239 ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==" >> /home/$USER/.ssh/known_hosts
+
+# install Java
+sudo apt-get -y install openjdk-7-jre
+sudo apt-get -y install openjdk-7-jdk
+
+# make ~/bin directory, add to PATH
+mkdir ~/bin
+echo "export PATH=/home/$USER/bin:${PATH}" >> ~/.bashrc
+
+# install lein
+cd ~/bin
+wget https://raw.github.com/technomancy/leiningen/stable/bin/lein
+chmod u+x lein
+./lein
+cd ~/
+
+# install app engine sdk
+cd bin
+wget http://googleappengine.googlecode.com/files/google_appengine_1.8.0.zip
+unzip google_appengine_1.8.0.zip
+echo "export PATH=/home/$USER/bin/google_appengine:${PATH}" >> ~/.bashrc
+cd
+
+# simple uberjarring via uj command
+echo "alias uj='lein do deps, compile :all, uberjar'" >> /home/$USER/.bashrc
+
+# clone projects
+git clone git://github.com/VertNet/gulo.git
+git clone git://github.com/VertNet/webapp.git
+
+# configure EBS volume
+sudo mkfs -t ext3 /dev/xvdb
+sudo mkdir /mnt/beast
+sudo mount /dev/xvdb /mnt/beast
+sudo chown $USER:$USER /mnt/beast
+
+# configure credentials
+
+echo "Configuring CartoDB. Please have your credentials ready and press 'enter' to continue."
+read na
+echo "Oauth key:"
+read OAUTH_KEY
+echo "Oauth secret:"
+read OAUTH_SECRET
+echo "Username:"
+read USERNAME
+echo "Password:"
+read CDB_PASSWORD
+echo "API key:"
+read API_KEY
+
+echo "{
+  \"key\": \"$OAUTH_KEY\",
+  \"secret\": \"$OAUTH_SECRET\",
+  \"user\": \"$USERNAME\",
+  \"password\": \"$CDB_PASSWORD\",
+  \"api_key\": \"$API_KEY\"
+}" > ~/gulo/resources/creds.json
+
+echo "Configuring AWS. Please have your credentials ready and press 'enter' to continue. Note that backslashes in your AWS credentials may cause errors."
+read na
+echo "Access key:"
+read ACCESS_ID
+echo
+echo "Secret key:"
+read SECRET_KEY
+echo
+
+echo "{
+        \"access-id\": \"$ACCESS_ID\",
+        \"secret-key\": \"$SECRET_KEY\"
+}" > ~/gulo/resources/aws.json
+
+echo "Keep those AWS credentials handy for configuring s3cmd. Press 'enter' to continue"
+
+s3cmd --configure
+
+# configure app engine credentials
+
+echo "Please enter your App Engine email address: "
+read EMAIL
+echo "export EMAIL=$EMAIL" >> ~/.bashrc
+
+echo "Please enter your App Engine password: "
+read GAE_PASSWORD
+echo "export GAE_PASSWORD=$GAE_PASSWORD" >> ~/.bashrc
+echo "Credentials are now set up."
+
+echo "Instance configured - go have a beer to celebrate!"
diff --git a/dev/genthrift.sh b/dev/genthrift.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+# Generates Java code from the gulo.thrift DSL. Depends on the Apache Thrift compiler. Paths are relative to the current directory.
+set -e  # abort on the first failure so a failed thrift run never removes the existing generated schema
+
+rm -rf ../src/jvm/gen-java
+thrift -o "../src/jvm" -r --gen java:hashcode gulo.thrift
+rm -rf ../src/jvm/gulo/schema
+mv ../src/jvm/gen-java/gulo/schema ../src/jvm/gulo
+rm -rf ../src/jvm/gen-java