Skip to content

Commit

Permalink
add export service
Browse files Browse the repository at this point in the history
  • Loading branch information
Greg Luis Ramirez committed Jun 18, 2020
1 parent 237af84 commit aa77ccd
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 77 deletions.
119 changes: 119 additions & 0 deletions lib/hyrax/migrator/export.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# frozen_string_literal:true

module Hyrax::Migrator
  ##
  # To use during the export step in OD2 migration.
  #
  # Given a base directory, a batch name and a pid list file, this service
  # writes the datastreams of every listed asset to
  # <export_dir>/<export_name>/ and then packs the files belonging to each pid
  # into a BagIt bag under <export_dir>/<export_name>_bags/. Messages collected
  # along the way are written to a timestamped report file in export_dir.
  class Export
    # Values of +verbose+ treated as "off". Needed because the rake task passes
    # the raw ENV string, and the String 'false' is truthy in Ruby.
    NON_VERBOSE_VALUES = [nil, false, '', '0', 'false', 'no'].freeze

    ##
    # @param export_dir [String] base directory; the pid list is read from it
    #   and the report file is written to it
    # @param export_name [String] batch name; used as the name of the
    #   datastreams sub-directory and in the report file name
    # @param pidlist [String] file name of the pid list (one pid per line),
    #   relative to export_dir
    # @param verbose [Boolean, String, nil] print progress to stdout unless the
    #   value is one of NON_VERBOSE_VALUES
    def initialize(export_dir, export_name, pidlist, verbose = false)
      @export_dir = export_dir
      @export_name = export_name
      @datastreams_dir = File.join(export_dir, export_name)
      @pidlist = pidlist
      @verbose = verbose_enabled?(verbose)
      @keylist = keylist
      # The report is opened inside export_dir, so it has to exist already.
      ensure_directory(export_dir)
      datetime_today = Time.zone.now.strftime('%Y-%m-%d-%H%M%S') # "2017-10-21-125903"
      report_filename = File.join(export_dir, "report_#{export_name}_#{datetime_today}.txt")
      @report = File.open(report_filename, 'w')
      @errors = []
    end

    ##
    # Runs the whole export: datastreams first, then bags. Any StandardError is
    # recorded in the report instead of being raised. The report is written and
    # closed even when the run is aborted by a non-StandardError exception.
    #
    # @return [nil]
    def export
      @errors << "Exporting #{@export_name}"
      ensure_directory(@export_dir)
      ensure_directory(@datastreams_dir)
      export_datastreams
      make_bags
      nil
    rescue StandardError => e
      @errors << "Error #{e.message}:#{e.backtrace.join("\n")}"
      nil
    ensure
      write_errors
      @report.close
    end

    ##
    # Reads the pid list and exports the datastreams of every asset found.
    # Blank lines in the pid list are ignored.
    def export_datastreams
      File.foreach(File.join(@export_dir, @pidlist)) do |line|
        pid = line.strip
        next if pid.empty?

        puts "exporting content for #{pid}" if @verbose
        item = GenericAsset.find(pid)
        next unless item.present?

        add_content_to_keylist(item)
        # nil content: each datastream's own content is written (see export_data).
        export_data(item, nil, line)
      end
    end

    ##
    # @return [Hash{String => String}] datastream id => file extension for the
    #   metadata datastreams exported for every asset (a fresh Hash per call)
    def keylist
      {
        'DC' => 'xml',
        'RELS-EXT' => 'xml',
        'rightsMetadata' => 'xml',
        'workflowMetadata' => 'yml',
        'descMetadata' => 'nt',
        'leafMetadata' => 'yml'
      }
    end

    ##
    # Registers the 'content' datastream in @keylist with an extension derived
    # from the item's mime type, when the item has a content datastream.
    def add_content_to_keylist(item)
      @keylist['content'] = asset_mimetype(item) unless item.datastreams['content'].blank?
    end

    ##
    # @return [String] the subtype part of the content datastream's mime type
    #   (the text after the last '/')
    def asset_mimetype(item)
      item.datastreams['content'].mimeType.split('/').last
    end

    ##
    # Writes one file per non-blank datastream listed in @keylist into the
    # datastreams directory, named <pid>_<datastream id>.<ext>, where <pid> is
    # the line with surrounding whitespace and the 'oregondigital:' prefix
    # removed. The files must land in @datastreams_dir because that is the
    # directory make_bags reads from.
    #
    # @param item [Object] asset responding to #datastreams
    # @param content [String, nil] when given, written to every file (legacy
    #   behaviour); when nil, each datastream's own content is written
    # @param line [String] pid line as read from the pid list
    def export_data(item, content, line)
      cleanpid = line.strip.gsub('oregondigital:', '')
      @keylist.each do |key, ext|
        datastream = item.datastreams[key]
        next if datastream.blank?

        path = File.join(@datastreams_dir, "#{cleanpid}_#{key}.#{ext}")
        # Block form closes the handle even if reading or writing raises.
        File.open(path, 'wb') { |f| f.write(content || datastream.content) }
      end
    end

    ##
    # Creates one bag per exported '*.nt' file in the datastreams directory,
    # containing every file whose name includes the same pid.
    def make_bags
      puts "Bagging from #{@datastreams_dir}..." if @verbose
      # Absolute paths keep working while the working directory is switched.
      source_dir = File.expand_path(@datastreams_dir)
      bag_dir = "#{source_dir}_bags"
      ensure_directory(bag_dir)

      # Block form restores the previous working directory afterwards.
      Dir.chdir(source_dir) do
        Dir.glob('*.nt').each { |item| bag_item(bag_dir, source_dir, item) }
      end
      puts 'Completed bagging.'
    end

    ##
    # Builds a bag at <dir>/<pid> holding the given files.
    #
    # @param dir [String] directory that receives the bag
    # @param source_dir [String] directory the listed files are read from
    # @param list [Array<String>] file names relative to source_dir
    # @param pid [String] name of the bag directory
    def make_bag(dir, source_dir, list, pid)
      bag = BagIt::Bag.new(File.join(dir, pid))

      list.each do |item|
        bag.add_file(item, File.join(source_dir, item))
      end
      bag.tagmanifest!
      bag.manifest!
    end

    ##
    # @return [String, nil] the first run of nine lowercase letters or digits
    #   in filename, or nil when there is none
    def get_short_pid(filename)
      # A match of [a-z0-9]{9} can never contain 'oregondigital-', so the
      # prefix stripping the original performed on the match was a no-op.
      pid = /[a-z0-9]{9}/.match(filename)
      pid && pid[0]
    end

    ##
    # @return [Boolean] true when at least one file name contains 'content'
    def validate_list(list)
      # Checked per name so a match cannot straddle two joined file names.
      list.any? { |name| name.include?('content') }
    end

    ##
    # Writes every collected message to the report file, one per line.
    def write_errors
      @errors.each do |e|
        @report.puts e
      end
    end

    private

    # Bags all files that share the pid found in the given '*.nt' file name.
    def bag_item(bag_dir, source_dir, item)
      pid = get_short_pid(item)
      if pid.nil?
        # Without a pid the glob below would match every file in the directory.
        @errors << "Could not determine pid for #{item}"
        return
      end

      list = Dir.glob("*#{pid}*")
      make_bag(bag_dir, source_dir, list, pid)
      if validate_list(list)
        puts "bagged #{pid}"
      else
        puts "no content file included for #{pid}"
      end
    end

    # Creates the directory unless it already exists.
    def ensure_directory(path)
      Dir.mkdir(path) unless Dir.exist?(path)
    end

    # Interprets the verbose argument, which may be a boolean or an ENV string.
    def verbose_enabled?(value)
      normalized = value.is_a?(String) ? value.strip.downcase : value
      !NON_VERBOSE_VALUES.include?(normalized)
    end
  end
end
93 changes: 16 additions & 77 deletions lib/tasks/export_bags.rake
Original file line number Diff line number Diff line change
Expand Up @@ -6,85 +6,24 @@ require 'bagit'
#
# task export_datastreams
# args
# input_tsv: full path to the tsv file
# batch: name of a batch or group of assets (or collections)
# export_dir:
# base directory for exported bags
# name:
# name of the batch (a directory with this name will be added to the
# base directory)
# pidlist:
# path to the pidlist txt file (one pid per line) expected within export_dir
#
# example:
#
# RAILS_ENV=production bundle exec rake export_datastreams input_tsv="/data1/batch/exports/od2_seed_data_pids_Baseball_jpegs.tsv"
# RAILS_ENV=production bundle exec rake export_bags export_dir=/data1/batch/exports name='baseball' pidlist=pidlist.txt verbose=true
#
# Exports the datastreams of every asset listed in the TSV file named by
# ENV['input_tsv']. Column 1 of each row is the pid; column 2 is the name of
# the sub-directory of /data1/batch/exports/ that receives the files. Each
# non-blank datastream is written as <pid>_<datastream id>.<ext>, where <pid>
# has ':' replaced by '-' and the 'oregondigital-' prefix removed.
desc 'Export datastreams given a tsv file'
task export_datastreams: :environment do
  path_to_tsv_file = ENV['input_tsv']
  path_to_export_dir = '/data1/batch/exports/'
  # Metadata datastreams exported for every asset: datastream id => extension.
  base_keylist = {
    'DC' => 'xml',
    'RELS-EXT' => 'xml',
    'rightsMetadata' => 'xml',
    'workflowMetadata' => 'yml',
    'descMetadata' => 'nt',
    'leafMetadata' => 'yml'
  }.freeze

  File.readlines(path_to_tsv_file).each do |line|
    line_cols = line.split("\t")
    pid = line_cols[0].strip
    item = GenericAsset.find(pid)
    target_path = File.join(path_to_export_dir, line_cols[1].strip)
    # Dir.exist? replaces Dir.exists?, which no longer exists in Ruby 3.2+.
    Dir.mkdir(target_path) unless Dir.exist?(target_path)
    next if item.nil?

    # Per-asset copy, so one asset's content extension never leaks to the next.
    keylist = base_keylist.dup
    content = item.datastreams['content']
    keylist['content'] = content.mimeType.split('/').last unless content.blank?

    onlypid = pid.gsub(':', '-').gsub('oregondigital-', '')
    puts "exporting #{onlypid}"
    keylist.each do |key, ext|
      datastream = item.datastreams[key]
      next if datastream.blank?

      path = File.join(target_path, "#{onlypid}_#{key}.#{ext}")
      # Block form closes the handle even if the write raises.
      File.open(path, 'wb') { |f| f.write(datastream.content) }
    end
  end
  puts 'export_datastreams done'
end

# Usage:
#
# task make_bags
# args
# batch: name of a batch or group of assets (or collections)
#
# example:
#
# RAILS_ENV=production bundle exec rake make_bags batch="Baseball_jpegs"
#
# Builds one bag per '*.nt' file found in /data1/batch/exports/<batch>,
# collecting every file in that directory whose name contains the same pid.
# Bags are written to /data1/batch/exports/<batch>_datastreams. The batch name
# is read from ENV['batch'].
desc 'Make bags given re-exported datastreams'
task make_bags: :environment do
  batch = ENV['batch']
  bag_dir = "/data1/batch/exports/#{batch}_datastreams"
  source_dir = "/data1/batch/exports/#{batch}"
  puts "Bagging from #{source_dir}..."
  Dir.mkdir(bag_dir) unless Dir.exist?(bag_dir)

  # Block form restores the working directory, so tasks that run after this
  # one in the same rake process are not affected by the directory switch.
  Dir.chdir(source_dir) do
    Dir.glob('*.nt').each do |item|
      pid = get_short_pid(item)
      list = Dir.glob("*#{pid}*")
      make_bag(bag_dir, source_dir, list, pid)
      if validate_list(list)
        puts "bagged #{pid}"
      else
        puts "no content file included for #{pid}"
      end
    end
  end
  puts 'Completed bagging.'
end

# Creates a bag at <dir>/<pid>, adds each listed file to it (read from
# source_dir and stored under the same relative name), then generates the tag
# manifest followed by the manifest. Returns whatever manifest! returns.
def make_bag(dir, source_dir, list, pid)
  bag_path = File.join(dir, pid)
  new_bag = BagIt::Bag.new(bag_path)

  # add_file arguments: path inside the bag, path of the source file
  list.each { |name| new_bag.add_file(name, File.join(source_dir, name)) }

  new_bag.tagmanifest!
  new_bag.manifest!
end

# Returns the first run of nine lowercase letters or digits found in filename
# (with any "oregondigital-" text removed from that run), or nil when the
# filename contains no such run.
def get_short_pid(filename)
  found = /[a-z0-9]{9}/.match(filename)
  return nil if found.nil?

  found[0].gsub("oregondigital-", "")
end

def validate_list(list)
list.join.include? "content"
# Exports datastreams and builds bags for the pids listed in a pidlist file.
# Reads its arguments from the environment: export_dir, name and pidlist are
# required; verbose is optional.
desc 'Export bags given an export path, a batch name, and a pidlist text file'
task export_bags: :environment do
  # Loaded lazily so the service is only required when this task runs.
  require 'hyrax/migrator/export'
  # fetch fails immediately with the name of the missing variable instead of
  # passing nil into the service.
  export_dir = ENV.fetch('export_dir')
  name = ENV.fetch('name')
  pidlist = ENV.fetch('pidlist')
  # ENV values are Strings and every String is truthy, so "verbose=false" has
  # to be recognised explicitly. Any other non-empty value still enables it.
  verbose = !['', '0', 'false', 'no'].include?(ENV.fetch('verbose', '').strip.downcase)
  service = Hyrax::Migrator::Export.new(export_dir, name, pidlist, verbose)
  service.export
end

0 comments on commit aa77ccd

Please sign in to comment.