Permalink
Browse files

Guardian datasets loading with a sample.

  • Loading branch information...
1 parent 7c93e31 commit da79025507c18460bad385b728d7d053e5c054b9 @mohawkjohn mohawkjohn committed Sep 22, 2011
Showing with 187 additions and 14 deletions.
  1. +22 −0 lib/ext/csv.rb
  2. +26 −0 lib/ext/string.rb
  3. +13 −1 lib/sciruby.rb
  4. +75 −13 lib/sciruby/data.rb
  5. +51 −0 samples/guardian1.rb
View
@@ -0,0 +1,22 @@
+require 'csv'
+require 'statsample'
+
# Extension of Ruby's standard CSV reader: lets a reader be converted to a
# dataset by way of CSV::Table#to_dataset.
class CSV
  # Read the rows of this reader into a CSV::Table, switch the table to the
  # requested access mode and convert it with CSV::Table#to_dataset.
  #
  # mode:: suffix of the CSV::Table mode switcher to call; +:col+ (the
  #        default) calls +by_col+, +:row+ calls +by_row+.
  #
  # Returns whatever CSV::Table#to_dataset returns.
  def to_dataset mode=:col
    # The selector used to be built as "by_#{mode}.to_s", which put the
    # literal text ".to_s" into the method name (e.g. :"by_col.to_s") and so
    # always raised NoMethodError. CSV::Table.new expects an array of rows,
    # so the rows are read with to_a instead of handing over the reader.
    CSV::Table.new(to_a).public_send("by_#{mode}").to_dataset
  end
end
+
# Conversion of a parsed CSV table into a dataset.
class CSV::Table
  # Turn every column into a scale vector (+to_scale+) and convert the
  # resulting collection with +to_dataset+.
  #
  # With headers the columns are collected in a Hash keyed by header. If
  # that attempt raises NoMethodError the table is treated as header-less
  # and the entries are collected positionally in an Array instead.
  def to_dataset
    columns =
      begin
        headers.each_with_object({}) do |name, by_header|
          by_header[name] = self[name].to_scale
        end
      rescue NoMethodError # Table has no headers. Try a different way.
        (0...size).map { |index| self[index].to_scale }
      end
    columns.to_dataset
  end
end
View
@@ -0,0 +1,26 @@
# Small inflection helpers on String. Each one is only defined when String
# does not already respond to it (for example because ActiveSupport is loaded).
class String
  unless method_defined?(:constantize)
    # Based on constantize from ActiveSupport::Inflector
    #
    # Resolve the receiver (e.g. "Foo::Bar" or "::Foo::Bar") to the constant it
    # names, starting the lookup at Object. A name that is not defined directly
    # on the current scope is passed to that scope's +const_missing+, which
    # raises NameError by default.
    def constantize
      names = split('::')
      # Drop the empty leading segment produced by a "::Foo" style name.
      names.shift if names.empty? || names.first.empty?

      names.inject(Object) do |scope, name|
        scope.const_defined?(name, false) ? scope.const_get(name) : scope.const_missing(name)
      end
    end
  end

  unless method_defined?(:camelize)
    # Adapted from camelize from ActiveSupport::Inflector
    #
    # Convert an underscored, slash-separated name to CamelCase:
    # "active_record/errors" becomes "ActiveRecord::Errors".
    #
    # first_letter_in_uppercase:: when false, only the very first character is
    #                             lower-cased ("foo_bar" becomes "fooBar").
    def camelize first_letter_in_uppercase = true
      camelized = to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
      return camelized if first_letter_in_uppercase || camelized.empty?

      # Lower-case the first character of the finished result. Camelizing the
      # tail on its own upper-cased its first letter as well ("fOoBar"), and
      # indexing into an empty string raised NoMethodError.
      camelized[0, 1].downcase + camelized[1..-1]
    end
  end
end
View
@@ -26,6 +26,7 @@
require "rubygems"
require "bundler/setup"
+require "./lib/ext/string"
module SciRuby
VERSION = '0.1.3'
@@ -40,6 +41,16 @@ def integrate *args, &block
require "integration"
::Integration.integrate(*args, &block)
end
+
# List the source ids of the datasets that the searcher for +database+ finds;
# these ids are what the +dataset+ method accepts.
#
# +database+ names a class under SciRuby::Data (e.g. :guardian resolves to
# SciRuby::Data::Guardian); +args+ is handed to that class's constructor.
def dataset_search database, args = {}
  searcher_class = "SciRuby::Data::#{database.to_s.camelize}".constantize
  searcher = searcher_class.new(args)
  searcher.datasets.keys
end
+
# Fetch one dataset, identified by +source_id+, from the given +database+.
# To see which source ids exist, call e.g. `dataset_search(:guardian)`.
#
# +database+ names a class under SciRuby::Data; the class is instantiated
# without arguments and asked for the dataset.
def dataset database, source_id
  searcher_class = "SciRuby::Data::#{database.to_s.camelize}".constantize
  searcher_class.new.dataset(source_id)
end
end
autoload(:Plotter, File.join(DIR, 'sciruby', 'plotter'))
@@ -49,4 +60,5 @@ def integrate *args, &block
autoload(:Data, File.join(DIR, 'sciruby', 'data'))
end
-autoload(:Shoes, File.join(SciRuby::DIR, 'ext', 'shoes'))
+autoload(:Shoes, File.join(SciRuby::DIR, 'ext', 'shoes'))
+autoload(:CSV, File.join(SciRuby::DIR, 'ext', 'csv'))
View
@@ -3,7 +3,6 @@
require "uri"
require "cgi"
require "ostruct"
-require "csv"
module SciRuby
module Data
@@ -38,7 +37,6 @@ def http_get params={} #:nodoc:
# Issue a GET for +path+ on +domain+ via Net::HTTP.get and return its result.
# When +params+ is non-empty it is appended as a query string, with each value
# passed through CGI::escape (keys are used as given).
#
# Adapted from: http://stackoverflow.com/questions/1252210/parametrized-get-request-in-ruby/1252305#1252305
def http_get_internal domain, path, params = {} #:nodoc:
  return Net::HTTP.get(domain, path) if params.empty?

  query = params.map { |key, value| "#{key}=#{CGI::escape(value.to_s)}" }.join('&')
  Net::HTTP.get(domain, "#{path}?#{query}")
end
@@ -49,6 +47,16 @@ class Guardian < SearcherBase
QUERY_DOMAIN = %q{www.guardian.co.uk}
QUERY_PATH = %q{/world-government-data/search.json}
# Meta-data of one search result, exposed through OpenStruct accessors.
# Each entry of +download_links+ is wrapped in its own OpenStruct, so a
# link's fields can be read as methods too.
class DatasetInfo < ::OpenStruct
  # h:: Hash describing one search result.
  def initialize h
    super h
    # Build a new array rather than overwriting the entries in place: the
    # array is shared with the hash the caller passed in, so in-place
    # replacement rewrote the caller's data. Array() also lets a result
    # without download links end up with an empty list instead of raising
    # NoMethodError on nil.
    self.download_links = Array(download_links).map { |link| ::OpenStruct.new(link) }
  end
end
+
+
# Search the site or database using some set of parameters.
#
# This function is the one that you should redefine if you want to require certain parameters, or if there are
@@ -60,31 +68,85 @@ class Guardian < SearcherBase
# * facet_source_title: e.g., data from Australian government would be data.nsw.org.au
# * facet_format: e.g., csv, excel, xml, shapefile, kml
# Set up the searcher. +args+ are the search parameters; they are passed on
# to the superclass constructor after the format default has been filled in.
def initialize args={}
  # Ask for CSV unless the caller picked a format (this writes into +args+).
  args[:facet_format] = :csv unless args[:facet_format]
  # This should be removed when we can interpret other formats.
  @require_format = args[:facet_format] unless @require_format

  super(args)
end
attr_reader :search_result
# Meta-data of every dataset found by the search, as a Hash from source_id to
# DatasetInfo. Call datasets.keys for the list of source_ids. The Hash is
# built on first use and memoized.
def datasets
  @datasets ||= search_result["results"].each_with_object({}) do |entry, by_source_id|
    by_source_id[entry['source_id']] = DatasetInfo.new(entry)
  end
end
- def dataset source_id=nil
+
# The content downloaded from +link+ for the dataset +source_id+. The first
# request for a given pair fetches it with http_get_internal (using the
# link's host and path); the result is kept per source_id and link and
# returned from that cache afterwards.
def raw_dataset source_id, link
  @raw_dataset ||= {}
  per_source = (@raw_dataset[source_id] ||= {})

  cached = per_source[link]
  return cached if cached

  uri = URI.parse(link)
  per_source[link] = http_get_internal(uri.host, uri.path)
end
+
# After a call to dataset(source_id), from what links did we download?
#
# Without an argument, returns the whole cache: a Hash from source_id to a
# Hash of link => downloaded content. With a +source_id+, returns the links
# downloaded for that dataset.
#
# This is a debugging aid for failed downloads, so it must not fail itself:
# when nothing has been downloaded yet, or +source_id+ is unknown, it returns
# an empty collection instead of raising NoMethodError on nil.
def raw_dataset_links_cached source_id=nil
  @raw_dataset ||= {}
  return @raw_dataset if source_id.nil?
  (@raw_dataset[source_id] || {}).keys
end
+
# Download a specific dataset by +source_id+ and cache it in the searcher.
# Returns the result of calling +to_dataset+ on the parsed CSV table, with its
# +name+ set to the dataset's title.
#
# The download links are tried in order. Links whose format is not the
# required one are skipped; a link that yields malformed CSV is skipped as
# well, and the next link is tried.
#
# Raises TypeError when every link in the required format returned malformed
# CSV, and NameError when no link has the required format at all.
#
# If this raises an exception, you can try this:
#
#   links = raw_dataset_links_cached(source_id)
#
# And then for each of +links+, do `raw_dataset(source_id, link)` to see what the actual downloaded data was.
# This is good for debugging -- e.g., did the page move? or is there something wrong with Ruby's CSV interpreter?
# Or is it in some other format altogether?
#
# Right now, this function only handles CSV. TODO: Add more format handlers!
def dataset source_id
  @dataset ||= {}
  @dataset[source_id] ||= begin # Datasets are stored by source ID
    info = datasets[source_id]
    malformed_seen = false
    parsed = nil

    info.download_links.each do |link_info|
      next unless link_info.format == @require_format.to_s # Format is incorrect.

      # Format appears to be correct, prior to actually downloading. Proceed.
      raw = raw_dataset(source_id, link_info.link)

      begin
        parsed = CSV.parse(raw, :headers => true, :converters => :all).to_dataset
        parsed.name = info.title
        break # First usable link wins.
      rescue CSV::MalformedCSVError
        # Remember the failure and fall through to the next link.
        malformed_seen = true
        parsed = nil
      end
    end

    # These checks have to run after ALL links were tried. Inside the loop
    # they aborted on the first malformed link and were never reached when no
    # link matched, so the links array itself got cached as the "dataset".
    if parsed.nil?
      raise(TypeError, "All dataset sources returned malformed CSV data; dataset has probably moved") if malformed_seen
      raise(NameError, "Couldn't find any dataset sources in the correct format (CSV)")
    end

    # Being the value of the begin block (not an early return) is what lets
    # the dataset be stored in @dataset.
    parsed
  end
end
View
@@ -0,0 +1,51 @@
# Sample: load one Guardian dataset through SciRuby.dataset and describe a
# Rubyvis panel that draws one line per dataset field against the "Year" field.
# NOTE(review): no require of sciruby/rubyvis and no render call are visible
# here; presumably the host that evaluates this sample provides both -- confirm.
+dat = SciRuby.dataset :guardian, 'gla-sector-employment-projections-2009'
+
# Layout values: w and h are the panel size; label_width and label_height are
# the lower ends of the scale ranges and the offsets of the two rule sets.
+label_width = 30
+w = 1000
+label_height = 40
+h = 600
+
# t: linear scale from the min..max of "Year" onto label_width..w.
# y: linear scale from 0..max of "Business services" onto label_height..h.
# f: ordinal scale from the dataset's field names onto seven colour names.
+t = Rubyvis::Scale.linear(dat["Year"].min, dat["Year"].max).range(label_width, w)
+y = Rubyvis::Scale.linear(0, dat["Business services"].max).range(label_height, h)
+f = Rubyvis::Scale.ordinal(dat.fields).range('red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'violet')
+
# NOTE(review): coords is assigned again inside the fields loop below before
# anything reads it, so this first assignment looks like a leftover -- confirm.
+coords = dat["Year"].zip(dat["Business services"])
+
+::Rubyvis::Panel.new do
+ width w
+ height h
+
# One rule per "Year" value, placed at t.scale(value) from the left and
# labelled at its bottom anchor. Values not greater than 1971 are drawn red,
# the others '#CCC'.
+ rule do
+ data dat["Year"]
+ stroke_style { |d| d>1971 ? '#CCC' : 'red' }
+ bottom label_height
+ left { |d| t.scale(d) }
+ end.anchor('bottom').add(::Rubyvis::Label).
+ text_angle(-Math::PI / 2.0).
+ text_baseline('middle').text_align('right')
+
# One rule per fixed tick value, placed at y.scale(value) from the bottom and
# labelled at its left anchor. The zero rule is drawn red.
+ rule do
+ data [0,500000,1000000,1500000]
+ stroke_style { |d| d>0 ? '#CCC' : 'red'}
+ bottom { |d| y.scale(d) }
+ left label_width
+ right 0
+ end.anchor('left').add(::Rubyvis::Label).
+ text_angle(-Math::PI / 2.0).
+ text_align('center').text_baseline('right').left(15)
+
+
# One line per field other than "Year". Each datum is a [year, value] pair;
# the label carrying the field name is visible only on the datum whose year
# is 1971.
+ dat.fields.each do |field|
+ next if field == "Year"
+ coords = dat["Year"].zip(dat[field])
+ line do
+ data coords
+ stroke_style lambda { f.scale(field) }
+ left { |d| t.scale(d[0]) }
+ bottom { |d| y.scale(d[1]) }
+ end.anchor('right').add(::Rubyvis::Label).
+ visible{ |d| d[0] == 1971 }.
+ text_style('#666').
+ text(lambda { field })
+ end
+end

0 comments on commit da79025

Please sign in to comment.