-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from VertNet/feature/build-tables
Add Cascalog queries to build tax, occ, loc, tax_loc tables.
- Loading branch information
Showing
5 changed files
with
157 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,3 +4,4 @@ pom.xml | |
/classes/ | ||
.lein-deps-sum | ||
.lein-plugins | ||
rm-dwca-reader-clj-jars.sh |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,66 +1,110 @@ | ||
(ns gulo.core | ||
"This namespace downloads and harvests a set of Darwin Core Archives using | ||
Cascalog and unicorn magic." | ||
(:use [cascalog.api] | ||
[dwca.core] | ||
[cartodb.client :only (query)] | ||
[clojure.string :only (join split)]) | ||
(:require [clojure.java.io :as io]) | ||
(:import [org.gbif.dwc.record DarwinCoreRecord] | ||
[java.lang.reflect Field] | ||
[com.google.common.io Files])) | ||
Cascalog and unicorn magic." | ||
(:use [gulo.util :as util :only (latlon-valid? gen-uuid)] | ||
[cascalog.api] | ||
[clojure.contrib.string :as s :only (grep)] | ||
[cascalog.more-taps :as taps :only (hfs-delimited)] | ||
[dwca.core :as dwca] | ||
[cartodb.client :as cdb :only (query)] | ||
[clojure.string :only (join split lower-case)])) | ||
|
||
(defn dwca-urls
  "Run the dwca_url query against the publishers table and return a vector
  holding, for every row of the result, the values of that row."
  []
  (let [rows (query "vertnet" "SELECT dwca_url FROM publishers")]
    (vec (map vals rows))))
|
||
(defn archive-name
  "Return the IPT-style archive name for url: the prefix \"dwca-\" followed by
  the second piece obtained when url is split on \"=\". Throws when the split
  yields fewer than two pieces."
  [url]
  (let [pieces   (split url #"=")
        resource (nth pieces 1)]
    (str "dwca-" resource)))
;; ([?kingdom ?phylum ?class ?order ?family ?genus ?species ?sciname] | ||
;; (source :#> 183 {151 ?kingdom 152 ?phylum 153 ?class 154 ?order | ||
;; 155 ?family 156 ?genus 157 ?species 160 ?sciname}))] | ||
|
||
(defn field-val
  "Read field from rec via reflection and return the value with surrounding
  whitespace trimmed. Returns nil when the field holds nil (or false)."
  [^Field field ^DarwinCoreRecord rec]
  {:pre [(instance? Field field)
         (instance? DarwinCoreRecord rec)]}
  ;; Make the field readable before fetching its value.
  (.setAccessible field true)
  (when-let [raw (.get field rec)]
    (.trim raw)))
;; Writes [taxon-id loc-id occ-id] tuples to a delimited tap at sink-path, one
;; for each combination of taxon, location and occurrence tuples that agree on
;; the shared ?name, ?lat and ?lon variables.
(defn taxon-location-table | ||
"Create taxon location table." | ||
[taxon location occurrence sink-path] | ||
;; The tap is created with :sinkmode :replace -- presumably this overwrites any
;; existing output at sink-path; confirm against the tap documentation.
(let [sink (taps/hfs-delimited sink-path :sinkmode :replace)] | ||
(?<- sink | ||
[?taxon-id ?loc-id ?occ-id] | ||
(taxon ?taxon-id ?name) | ||
(location ?loc-id ?lat ?lon) | ||
;; Positional selection from 183-field occurrence tuples: field 0 is bound to
;; ?occ-id, 22 and 23 to ?lat and ?lon, and 160 to ?name.
(occurrence :#> 183 {0 ?occ-id 22 ?lat 23 ?lon 160 ?name})))) | ||
|
||
(defn rec->lines
  "Return one tab-delimited string holding the value of every field declared
  on the class of the supplied DarwinCoreRecord, in declaration order."
  [^DarwinCoreRecord rec]
  {:pre [(instance? DarwinCoreRecord rec)]}
  (let [declared (.getDeclaredFields (.getClass rec))]
    (join "\t" (map (fn [fld] (field-val fld rec)) declared))))
(defmapcatop explode-names
  "Emit each of the eight supplied taxon names as its own 1-tuple."
  [kingdom phylum class order family genus species sciname]
  (vec (for [taxon-name [kingdom phylum class order family genus species sciname]]
         [taxon-name])))
|
||
(defn grab
  "Download the Darwin Core Archive at url into a fresh temporary directory,
  unzip it there, and return the path of the expanded archive."
  [url]
  (let [base     (.getPath (Files/createTempDir))
        ;; Expanded archive lives at <tmp>/<archive-name>; the zip sits next
        ;; to it with a .zip suffix.
        expanded (str base "/" (archive-name url))
        zip-file (str expanded ".zip")]
    (download url zip-file)
    (unzip zip-file expanded)
    expanded))
|
||
(defmapcatop url->recs | ||
"Emit records as tab delineated lines from archive located at URL." | ||
;; Builds the taxon table: each name found in the eight taxonomy fields of
;; source is paired with a UUID from util/gen-uuid and written to a delimited
;; tap at sink-path.
(defn taxon-table | ||
"Create taxon table of unique names with generated UUIDs." | ||
[source sink-path] | ||
(let [sink (taps/hfs-delimited sink-path :sinkmode :replace) | ||
;; Subquery: select fields 151-157 and 160 by position from 183-field tuples
;; and emit each of them as its own ?name tuple through explode-names.
;; NOTE(review): uniqueness presumably relies on Cascalog de-duplicating the
;; subquery's output -- confirm.
unique-names (<- [?name] | ||
(source :#> 183 {151 ?kingdom 152 ?phylum 153 ?class | ||
154 ?order 155 ?family 156 ?genus | ||
157 ?species 160 ?sciname}) | ||
(explode-names ?kingdom ?phylum ?class ?order ?family | ||
?genus ?species ?sciname :> ?name))] | ||
;; Outer query: attach a generated UUID to every ?name and write the pair out.
(?<- sink | ||
[?uuid ?name] | ||
(unique-names ?name) | ||
(util/gen-uuid :> ?uuid)))) | ||
|
||
;; Builds the location table: lat/lon pairs taken from source that pass
;; util/latlon-valid?, each paired with a UUID from util/gen-uuid and written
;; to a delimited tap at sink-path.
(defn location-table | ||
"Create location table of unique and valid lat/lon with generated UUIDs." | ||
[source sink-path] | ||
(let [sink (taps/hfs-delimited sink-path :sinkmode :replace) | ||
;; Subquery: fields 22 and 23 of 183-field tuples are bound to ?lat and ?lon;
;; util/latlon-valid? acts as a filter on the pair.
;; NOTE(review): uniqueness presumably relies on Cascalog de-duplicating the
;; subquery's output -- confirm.
unique-latlons (<- [?lat ?lon] | ||
(source :#> 183 {22 ?lat 23 ?lon}) | ||
(util/latlon-valid? ?lat ?lon))] | ||
;; Outer query: attach a generated UUID to every surviving lat/lon pair.
(?<- sink | ||
[?uuid ?lat ?lon] | ||
(unique-latlons ?lat ?lon) | ||
(util/gen-uuid :> ?uuid)))) | ||
|
||
(defn explode
  "Return a vector made of a freshly generated UUID followed by the field
  values of rec."
  [rec]
  (into [(util/gen-uuid)] (field-vals rec)))
|
||
(defmapcatop explode-lines | ||
"Emit records as tab delineated lines from archive located at URL. A UUID is | ||
prepended to each line for use by Cascalog joins when building other tables." | ||
[url] | ||
(for [rec (get-records (grab url))] | ||
[(rec->lines rec)])) | ||
(vec (map explode (dwca/open url)))) | ||
|
||
(defn harvest | ||
(defn occurrence-table | ||
"Download and store records from many Darwin Core Archive URLs to CSV file." | ||
[sink-path] | ||
(let [source (dwca-urls) | ||
sink (hfs-delimited sink-path :delimiter "\t" :sinkmode :replace)] | ||
[source sink-path] | ||
(let [sink (taps/hfs-delimited sink-path :sinkmode :replace)] | ||
(?<- sink | ||
[?line] | ||
[?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?10 ?11 ?12 ?13 ?14 ?15 ?16 ?17 ?18 ?19 | ||
?20 ?21 ?22 ?23 ?24 ?25 ?26 ?27 ?28 ?29 ?30 ?31 ?32 ?33 ?34 ?35 ?36 | ||
?37 ?38 ?39 ?40 ?41 ?42 ?43 ?44 ?45 ?46 ?47 ?48 ?49 ?50 ?51 ?52 ?53 | ||
?54 ?55 ?56 ?57 ?58 ?59 ?60 ?61 ?62 ?63 ?64 ?65 ?66 ?67 ?68 ?69 ?70 | ||
?71 ?72 ?73 ?74 ?75 ?76 ?77 ?78 ?79 ?80 ?81 ?82 ?83 ?84 ?85 ?86 ?87 | ||
?88 ?89 ?90 ?91 ?92 ?93 ?94 ?95 ?96 ?97 ?98 ?99 ?100 ?101 ?102 ?103 | ||
?104 ?105 ?106 ?107 ?108 ?109 ?110 ?111 ?112 ?113 ?114 ?115 ?116 ?117 | ||
?118 ?119 ?120 ?121 ?122 ?123 ?124 ?125 ?126 ?127 ?128 ?129 ?130 ?131 | ||
?132 ?133 ?134 ?135 ?136 ?137 ?138 ?139 ?140 ?141 ?142 ?143 ?144 ?145 | ||
?146 ?147 ?148 ?149 ?150 ?151 ?152 ?153 ?154 ?155 ?156 ?157 ?158 ?159 | ||
?160 ?161 ?162 ?163 ?164 ?165 ?166 ?167 ?168 ?169 ?170 ?171 ?172 ?173 | ||
?174 ?175 ?176 ?177 ?178 ?179 ?180 ?181 ?182] | ||
(source ?url) | ||
(url->recs ?url :> ?line)))) | ||
(explode-lines ?url :> ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?10 ?11 ?12 ?13 | ||
?14 ?15 ?16 ?17 ?18 ?19 ?20 ?21 ?22 ?23 ?24 ?25 ?26 ?27 | ||
?28 ?29 ?30 ?31 ?32 ?33 ?34 ?35 ?36 ?37 ?38 ?39 ?40 ?41 | ||
?42 ?43 ?44 ?45 ?46 ?47 ?48 ?49 ?50 ?51 ?52 ?53 ?54 ?55 | ||
?56 ?57 ?58 ?59 ?60 ?61 ?62 ?63 ?64 ?65 ?66 ?67 ?68 ?69 | ||
?70 ?71 ?72 ?73 ?74 ?75 ?76 ?77 ?78 ?79 ?80 ?81 ?82 ?83 | ||
?84 ?85 ?86 ?87 ?88 ?89 ?90 ?91 ?92 ?93 ?94 ?95 ?96 ?97 | ||
?98 ?99 ?100 ?101 ?102 ?103 ?104 ?105 ?106 ?107 ?108 ?109 | ||
?110 ?111 ?112 ?113 ?114 ?115 ?116 ?117 ?118 ?119 ?120 | ||
?121 ?122 ?123 ?124 ?125 ?126 ?127 ?128 ?129 ?130 ?131 | ||
?132 ?133 ?134 ?135 ?136 ?137 ?138 ?139 ?140 ?141 ?142 | ||
?143 ?144 ?145 ?146 ?147 ?148 ?149 ?150 ?151 ?152 ?153 | ||
?154 ?155 ?156 ?157 ?158 ?159 ?160 ?161 ?162 ?163 ?164 | ||
?165 ?166 ?167 ?168 ?169 ?170 ?171 ?172 ?173 ?174 ?175 | ||
?176 ?177 ?178 ?179 ?180 ?181 ?182)))) | ||
|
||
;; Runs the whole table-building pipeline: writes the occurrence table to
;; occ-path, reads it back as the source for the location and taxon tables,
;; then combines the three into the taxon-location table at taxon-loc-path.
(defn harvest | ||
[source occ-path loc-path taxon-path taxon-loc-path ] | ||
;; Must run first: every later step reads what this writes to occ-path.
(occurrence-table source occ-path) | ||
(let [occ-source (taps/hfs-delimited occ-path :sinkmode :replace)] | ||
(location-table occ-source loc-path) | ||
(taxon-table occ-source taxon-path) | ||
;; NOTE(review): the taps below (and occ-source) are only read from here, yet
;; are built with :sinkmode :replace -- confirm the option is harmless on a
;; tap used as a source.
(taxon-location-table (taps/hfs-delimited taxon-path :sinkmode :replace) | ||
(taps/hfs-delimited loc-path :sinkmode :replace) | ||
occ-source taxon-loc-path))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
(ns gulo.util | ||
"This namespace contains utility functions." | ||
(:use [cartodb.client :as cdb :only (query)])) | ||
|
||
(defn dwca-urls
  "Query the publishers table for dwca_url and return a vector with the
  values of each result row."
  []
  (->> (cdb/query "vertnet" "SELECT dwca_url FROM publishers")
       (map vals)
       vec))
|
||
(defn gen-uuid
  "Generate a random UUID and return it as a string. Any arguments passed
  are ignored."
  ;; Variadic on purpose: with a fixed empty arglist Cascalog fails with an
  ;; ArityException (wrong number of args).
  [& x]
  (let [uuid (java.util.UUID/randomUUID)]
    (.toString uuid)))
|
||
;; Valid ranges for latitude and longitude (bounds are inclusive).
(def latlon-range {:lat-min -90 :lat-max 90 :lon-min -180 :lon-max 180})

(defn read-latlon
  "Converts lat and lon values from string to number.

  Returns a vector [lat lon] of doubles. The values are parsed with
  Double/parseDouble instead of the Clojure reader: the strings come from
  harvested archives, and read-string can evaluate reader forms and silently
  accepts input such as \"12 34\" by reading only the first token.

  Throws AssertionError when lat or lon is not a string, and
  NumberFormatException when either string is not a number."
  [lat lon]
  {:pre [(instance? java.lang.String lat)
         (instance? java.lang.String lon)]}
  [(Double/parseDouble lat) (Double/parseDouble lon)])

(defn latlon-valid?
  "Return true if lat and lon are valid, otherwise return false.

  Valid means both are strings that parse as numbers lying inside
  latlon-range. Non-string input (including nil) and unparseable strings
  yield false instead of throwing."
  [lat lon]
  ;; Guard here: the :pre check in read-latlon raises AssertionError, which is
  ;; an Error and would slip past the (catch Exception ...) below.
  (if (and (string? lat) (string? lon))
    (try
      (let [{:keys [lat-min lat-max lon-min lon-max]} latlon-range
            [lat lon] (read-latlon lat lon)]
        (and (<= lat lat-max)
             (>= lat lat-min)
             (<= lon lon-max)
             (>= lon lon-min)))
      (catch Exception e false))
    false))
|
||
;; Joins, with the given separator string, one symbol per key returned by
;; (field-keys rec); each key is stringified, lower-cased and stripped of ":".
;; NOTE(review): join, lower-case and field-keys are not referred by the ns
;; form visible for gulo.util (it only uses cartodb.client), and rec is
;; neither a parameter nor a local here -- as written this looks like it
;; cannot compile; confirm the intended requires and argument list.
(defn occurrence-table-header | ||
"Return the occurrence table header." | ||
[] | ||
(join " " | ||
(for [x (field-keys rec)] | ||
(symbol (clojure.string/replace (lower-case (str x)) ":" ""))))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,14 @@ | ||
(ns gulo.core-test | ||
(:use clojure.test | ||
gulo.core)) | ||
(:use gulo.core | ||
[midje sweet] | ||
[clojure.string :only (split)]) | ||
(:import [com.google.common.io Files])) | ||
|
||
;; Placeholder test: the assertion compares 0 with 1, so it always fails.
(deftest a-test | ||
(testing "FIXME, I fail." | ||
(is (= 0 1)))) | ||
;; Harvests one archive into a temporary directory and checks the number of
;; lines in the first part file of the occurrence table.
(fact
  "Check harvesting."
  (let [source [["http://vertnet.nhm.ku.edu:8080/ipt/archive.do?r=ttrs_mammals"]]
        temp-dir (Files/createTempDir)
        sink-path (.getPath temp-dir)
        ;; harvest takes one output path per table (occurrence, location,
        ;; taxon, taxon-location); give each its own directory under the
        ;; temporary directory instead of passing a single sink path.
        occ-path (str sink-path "/occ")
        loc-path (str sink-path "/loc")
        taxon-path (str sink-path "/tax")
        taxon-loc-path (str sink-path "/tax_loc")]
    (harvest source occ-path loc-path taxon-path taxon-loc-path)
    (println sink-path)
    ;; The record count is checked against the occurrence table output.
    (count (split (slurp (str occ-path "/part-00000")) #"\n")) => 968))