Skip to content

Commit

Permalink
Merge pull request #2 from VertNet/feature/build-tables
Browse files Browse the repository at this point in the history
Add Cascalog queries to build tax, occ, loc, tax_loc tables.
  • Loading branch information
eightysteele committed Jun 14, 2012
2 parents 62cdee4 + 5f7ce12 commit d93b367
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 62 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pom.xml
/classes/
.lein-deps-sum
.lein-plugins
rm-dwca-reader-clj-jars.sh
5 changes: 3 additions & 2 deletions project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
"-Xms1024M" "-Xmx1048M" "-server"]
:plugins [[swank-clojure "1.4.0-SNAPSHOT"]]
:dependencies [[org.clojure/clojure "1.4.0"]
[cascalog "1.9.0-wip8"]
[dwca-reader-clj "0.1.0-SNAPSHOT"]
[cascalog "1.8.7"]
[cascalog-more-taps-eighty "0.2.0"]
[dwca-reader-clj "0.3.0-SNAPSHOT"]
[cartodb-clj "1.0.0-SNAPSHOT"]]
:dev-dependencies [[org.apache.hadoop/hadoop-core "0.20.2-dev"]
[midje-cascalog "0.4.0"]
Expand Down
154 changes: 99 additions & 55 deletions src/clj/gulo/core.clj
Original file line number Diff line number Diff line change
@@ -1,66 +1,110 @@
(ns gulo.core
"This namespace downloads and harvests a set of Darwin Core Archives using
Cascalog and unicorn magic."
(:use [cascalog.api]
[dwca.core]
[cartodb.client :only (query)]
[clojure.string :only (join split)])
(:require [clojure.java.io :as io])
(:import [org.gbif.dwc.record DarwinCoreRecord]
[java.lang.reflect Field]
[com.google.common.io Files]))
Cascalog and unicorn magic."
(:use [gulo.util :as util :only (latlon-valid? gen-uuid)]
[cascalog.api]
[clojure.contrib.string :as s :only (grep)]
[cascalog.more-taps :as taps :only (hfs-delimited)]
[dwca.core :as dwca]
[cartodb.client :as cdb :only (query)]
[clojure.string :only (join split lower-case)]))

(defn dwca-urls
  "Run the publishers SQL against \"vertnet\" through cartodb.client/query and
  return a vector with one entry per result row, each entry being the seq of
  that row's values (here, just its dwca_url)."
  []
  (let [rows (query "vertnet" "SELECT dwca_url FROM publishers")]
    (vec (for [row rows]
           (vals row)))))

(defn archive-name
  "Build the archive name for an IPT archive URL: the prefix \"dwca-\" followed
  by the second piece obtained when the URL is split on \"=\". Throws
  IndexOutOfBoundsException when the split yields fewer than two pieces
  (for example, a URL without any \"=\")."
  [url]
  (let [pieces   (split url #"=")
        resource (nth pieces 1)]
    (str "dwca-" resource)))
;; ([?kingdom ?phylum ?class ?order ?family ?genus ?species ?sciname]
;; (source :#> 183 {151 ?kingdom 152 ?phylum 153 ?class 154 ?order
;; 155 ?family 156 ?genus 157 ?species 160 ?sciname}))]

(defn field-val
  "Reflectively read `field` from `rec` and return the value with surrounding
  whitespace trimmed, or nil when the field holds no value. The field is made
  accessible first so that non-public fields can be read as well."
  [^Field field ^DarwinCoreRecord rec]
  {:pre [(instance? Field field)
         (instance? DarwinCoreRecord rec)]}
  (.setAccessible field true)
  (if-let [raw (.get field rec)]
    (.trim raw)))
(defn taxon-location-table
  "Run a Cascalog query that writes [taxon-id loc-id occ-id] tuples to a
  delimited tap at sink-path (opened with :sinkmode :replace). A tuple is
  produced for each occurrence row (183 fields wide; field 0 is the occurrence
  id, 22/23 the lat/lon, 160 the name) whose name matches a `taxon` row and
  whose lat/lon matches a `location` row."
  [taxon location occurrence sink-path]
  (let [out-tap (taps/hfs-delimited sink-path :sinkmode :replace)]
    (?<- out-tap
         [?taxon-id ?loc-id ?occ-id]
         (taxon ?taxon-id ?name)
         (location ?loc-id ?lat ?lon)
         (occurrence :#> 183 {0   ?occ-id
                              22  ?lat
                              23  ?lon
                              160 ?name}))))

(defn rec->lines
  "Return one tab-delimited string holding the value (via field-val) of every
  field declared on the class of the supplied DarwinCoreRecord, in the order
  getDeclaredFields returns them."
  [^DarwinCoreRecord rec]
  {:pre [(instance? DarwinCoreRecord rec)]}
  (->> (.getDeclaredFields (.getClass rec))
       (map #(field-val % rec))
       (join "\t")))
(defmapcatop explode-names
  "Emit each of the eight supplied taxon names as its own 1-tuple, in argument
  order."
  [kingdom phylum class order family genus species sciname]
  (mapv vector
        [kingdom phylum class order family genus species sciname]))

(defn grab
  "Download the Darwin Core Archive at `url` into a fresh temporary directory,
  unzip it there, and return the path of the expanded archive directory.

  The zip is saved as <tmp>/<archive-name>.zip and expanded into
  <tmp>/<archive-name>."
  [url]
  (let [workdir  (.getPath (Files/createTempDir))
        expanded (str workdir "/" (archive-name url))
        zip-file (str expanded ".zip")]
    (download url zip-file)
    (unzip zip-file expanded)
    expanded))

(defmapcatop url->recs
"Emit records as tab delineated lines from archive located at URL."
(defn taxon-table
  "Write [uuid name] tuples to a delimited tap at sink-path (opened with
  :sinkmode :replace).

  Names are gathered by a subquery that reads fields 151-157 and 160 of each
  183-field `source` row and passes them through explode-names; each name the
  subquery yields is then paired with a UUID from util/gen-uuid."
  [source sink-path]
  (let [out-tap (taps/hfs-delimited sink-path :sinkmode :replace)
        names   (<- [?name]
                    (source :#> 183 {151 ?kingdom
                                     152 ?phylum
                                     153 ?class
                                     154 ?order
                                     155 ?family
                                     156 ?genus
                                     157 ?species
                                     160 ?sciname})
                    (explode-names ?kingdom ?phylum ?class ?order
                                   ?family ?genus ?species ?sciname
                                   :> ?name))]
    (?<- out-tap
         [?uuid ?name]
         (names ?name)
         (util/gen-uuid :> ?uuid))))

(defn location-table
  "Write [uuid lat lon] tuples to a delimited tap at sink-path (opened with
  :sinkmode :replace).

  A subquery reads lat/lon from fields 22 and 23 of each 183-field `source`
  row and keeps only pairs accepted by util/latlon-valid?; each pair the
  subquery yields is then paired with a UUID from util/gen-uuid."
  [source sink-path]
  (let [out-tap (taps/hfs-delimited sink-path :sinkmode :replace)
        coords  (<- [?lat ?lon]
                    (source :#> 183 {22 ?lat
                                     23 ?lon})
                    (util/latlon-valid? ?lat ?lon))]
    (?<- out-tap
         [?uuid ?lat ?lon]
         (coords ?lat ?lon)
         (util/gen-uuid :> ?uuid))))

(defn explode
  "Return a vector consisting of a freshly generated UUID followed by the
  values that field-vals returns for rec."
  [rec]
  (into [(util/gen-uuid)] (field-vals rec)))

(defmapcatop explode-lines
"Emit records as tab delineated lines from archive located at URL. A UUID is
prepended to each line for use by Cascalog joins when building other tables."
[url]
(for [rec (get-records (grab url))]
[(rec->lines rec)]))
(vec (map explode (dwca/open url))))

(defn harvest
(defn occurrence-table
"Download and store records from many Darwin Core Archive URLs to CSV file."
[sink-path]
(let [source (dwca-urls)
sink (hfs-delimited sink-path :delimiter "\t" :sinkmode :replace)]
[source sink-path]
(let [sink (taps/hfs-delimited sink-path :sinkmode :replace)]
(?<- sink
[?line]
[?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?10 ?11 ?12 ?13 ?14 ?15 ?16 ?17 ?18 ?19
?20 ?21 ?22 ?23 ?24 ?25 ?26 ?27 ?28 ?29 ?30 ?31 ?32 ?33 ?34 ?35 ?36
?37 ?38 ?39 ?40 ?41 ?42 ?43 ?44 ?45 ?46 ?47 ?48 ?49 ?50 ?51 ?52 ?53
?54 ?55 ?56 ?57 ?58 ?59 ?60 ?61 ?62 ?63 ?64 ?65 ?66 ?67 ?68 ?69 ?70
?71 ?72 ?73 ?74 ?75 ?76 ?77 ?78 ?79 ?80 ?81 ?82 ?83 ?84 ?85 ?86 ?87
?88 ?89 ?90 ?91 ?92 ?93 ?94 ?95 ?96 ?97 ?98 ?99 ?100 ?101 ?102 ?103
?104 ?105 ?106 ?107 ?108 ?109 ?110 ?111 ?112 ?113 ?114 ?115 ?116 ?117
?118 ?119 ?120 ?121 ?122 ?123 ?124 ?125 ?126 ?127 ?128 ?129 ?130 ?131
?132 ?133 ?134 ?135 ?136 ?137 ?138 ?139 ?140 ?141 ?142 ?143 ?144 ?145
?146 ?147 ?148 ?149 ?150 ?151 ?152 ?153 ?154 ?155 ?156 ?157 ?158 ?159
?160 ?161 ?162 ?163 ?164 ?165 ?166 ?167 ?168 ?169 ?170 ?171 ?172 ?173
?174 ?175 ?176 ?177 ?178 ?179 ?180 ?181 ?182]
(source ?url)
(url->recs ?url :> ?line))))
(explode-lines ?url :> ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?10 ?11 ?12 ?13
?14 ?15 ?16 ?17 ?18 ?19 ?20 ?21 ?22 ?23 ?24 ?25 ?26 ?27
?28 ?29 ?30 ?31 ?32 ?33 ?34 ?35 ?36 ?37 ?38 ?39 ?40 ?41
?42 ?43 ?44 ?45 ?46 ?47 ?48 ?49 ?50 ?51 ?52 ?53 ?54 ?55
?56 ?57 ?58 ?59 ?60 ?61 ?62 ?63 ?64 ?65 ?66 ?67 ?68 ?69
?70 ?71 ?72 ?73 ?74 ?75 ?76 ?77 ?78 ?79 ?80 ?81 ?82 ?83
?84 ?85 ?86 ?87 ?88 ?89 ?90 ?91 ?92 ?93 ?94 ?95 ?96 ?97
?98 ?99 ?100 ?101 ?102 ?103 ?104 ?105 ?106 ?107 ?108 ?109
?110 ?111 ?112 ?113 ?114 ?115 ?116 ?117 ?118 ?119 ?120
?121 ?122 ?123 ?124 ?125 ?126 ?127 ?128 ?129 ?130 ?131
?132 ?133 ?134 ?135 ?136 ?137 ?138 ?139 ?140 ?141 ?142
?143 ?144 ?145 ?146 ?147 ?148 ?149 ?150 ?151 ?152 ?153
?154 ?155 ?156 ?157 ?158 ?159 ?160 ?161 ?162 ?163 ?164
?165 ?166 ?167 ?168 ?169 ?170 ?171 ?172 ?173 ?174 ?175
?176 ?177 ?178 ?179 ?180 ?181 ?182))))

(defn harvest
  "Build all four tables in sequence: the occurrence table from `source` into
  occ-path, then the location and taxon tables from the occurrence output into
  loc-path and taxon-path, and finally the taxon-location table (joining the
  three) into taxon-loc-path."
  [source occ-path loc-path taxon-path taxon-loc-path]
  (occurrence-table source occ-path)
  (let [occurrences (taps/hfs-delimited occ-path :sinkmode :replace)]
    (location-table occurrences loc-path)
    (taxon-table occurrences taxon-path)
    ;; Taps over the freshly written taxon and location tables are created only
    ;; after both tables have been built, as in the original call order.
    (let [taxa      (taps/hfs-delimited taxon-path :sinkmode :replace)
          locations (taps/hfs-delimited loc-path :sinkmode :replace)]
      (taxon-location-table taxa locations occurrences taxon-loc-path))))
42 changes: 42 additions & 0 deletions src/clj/gulo/util.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
(ns gulo.util
"This namespace contains utility functions."
(:use [cartodb.client :as cdb :only (query)]))

(defn dwca-urls
  "Run the publishers SQL against \"vertnet\" through cdb/query and return a
  vector with one entry per result row, each entry being the seq of that row's
  values (here, just its dwca_url)."
  []
  (let [rows (cdb/query "vertnet" "SELECT dwca_url FROM publishers")]
    (vec (for [row rows]
           (vals row)))))

(defn gen-uuid
  "Return the string form of a freshly generated random UUID. Any arguments
  passed in are accepted and ignored."
  [& x] ;; variadic on purpose: without [& x] Cascalog hits an ArityException
  (.toString (java.util.UUID/randomUUID)))

;; Valid ranges for latitude and longitude, as minimum/maximum pairs:
;; latitude from :lat-min to :lat-max, longitude from :lon-min to :lon-max.
;; latlon-valid? compares against these with <= / >=, so the bounds themselves
;; are accepted. Values are presumably decimal degrees — confirm against the data.
(def latlon-range {:lat-min -90 :lat-max 90 :lon-min -180 :lon-max 180})

(defn read-latlon
  "Convert lat and lon from strings to numbers.

  Returns a two-element vector [lat lon] of doubles. Both arguments must be
  strings (enforced by the precondition, which raises AssertionError
  otherwise). Throws NumberFormatException when either string is not a
  decimal number; surrounding whitespace is tolerated.

  Parsing is done with Double/parseDouble instead of read-string because the
  values come from harvested archive data: the Clojure reader can evaluate
  #=(...) forms, ignores anything after the first form (so \"45 oops\" would
  read as 45), and returns symbols for non-numeric text."
  [lat lon]
  {:pre [(instance? java.lang.String lat)
         (instance? java.lang.String lon)]}
  [(Double/parseDouble ^String lat)
   (Double/parseDouble ^String lon)])

(defn latlon-valid?
  "Return true if lat and lon are valid, otherwise return false.

  lat and lon are expected to be strings. They are parsed with read-latlon
  and are valid when the latitude lies within [:lat-min, :lat-max] and the
  longitude within [:lon-min, :lon-max] of latlon-range (bounds inclusive).

  Never throws for bad input: nil or non-string arguments, unparseable text
  and non-numeric parse results all yield false."
  [lat lon]
  ;; Guard the types up front: read-latlon enforces string arguments with a
  ;; :pre condition, and a failed :pre raises AssertionError — an Error, not an
  ;; Exception — which the catch below would not intercept.
  (if (and (string? lat) (string? lon))
    (try
      (let [{:keys [lat-min lat-max lon-min lon-max]} latlon-range
            [lat lon] (read-latlon lat lon)]
        (and (<= lat-min lat lat-max)
             (<= lon-min lon lon-max)))
      (catch Exception e false))
    false))

(defn occurrence-table-header
"Return the occurrence table header: a single string in which each key
returned by (field-keys rec) is lower-cased, stripped of every \":\" and
joined to the others with a single space."
[]
;; NOTE(review): `join`, `lower-case`, `field-keys` and `rec` are not brought
;; into scope by the ns form visible for this namespace, and `rec` is not a
;; parameter of this function — as written this looks like it cannot compile.
;; Confirm where these names were meant to come from before relying on it.
(join " "
(for [x (field-keys rec)]
(symbol (clojure.string/replace (lower-case (str x)) ":" "")))))
17 changes: 12 additions & 5 deletions test/gulo/core_test.clj
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
(ns gulo.core-test
(:use clojure.test
gulo.core))
(:use gulo.core
[midje sweet]
[clojure.string :only (split)])
(:import [com.google.common.io Files]))

(deftest a-test
(testing "FIXME, I fail."
(is (= 0 1))))
(fact
"Check harvesting."
(let [source [["http://vertnet.nhm.ku.edu:8080/ipt/archive.do?r=ttrs_mammals"]]
temp-dir (Files/createTempDir)
sink-path (.getPath temp-dir)]
(harvest source sink-path)
(println sink-path)
(count (split (slurp (str sink-path "/part-00000")) #"\n")) => 968))

0 comments on commit d93b367

Please sign in to comment.