Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Refactored split routines into more appropriate modules.

  • Loading branch information...
commit 69bb0b3807832e2ed9c6fca8031e6dff348c46dc 1 parent 8286aa6
@RayRacine authored
View
11 aws/s3/s3-uri.rkt
@@ -20,15 +20,20 @@
#| Helpers for S3 URIs: extracting buckets, prefixes, etc. |#
-(provide
- s3-uri-path->prefix)
+(provide:
+ [new-s3-uri (String String -> Uri)]
+ [s3-uri-path->prefix (String -> String)])
(require
(only-in httpclient/uri
- Uri Uri-scheme Uri-path)
+ Uri Uri-scheme Uri-path make-uri)
(only-in httpclient/uri/path
uri-path-split uri-build-path))
+;; Build the Uri that names PREFIX inside BUCKET.
+;; The scheme is always "s3", BUCKET is passed in the third make-uri slot and
+;; "/" + PREFIX in the fifth; every other slot is #f.
+;; NOTE(review): slot order presumably scheme/user/host/port/path/query/fragment
+;; -- confirm against httpclient/uri.
+(: new-s3-uri (String String -> Uri))
+(define (new-s3-uri bucket prefix)
+  (let ((rooted-path (string-append "/" prefix)))
+    (make-uri "s3" #f bucket #f rooted-path #f #f)))
+
(: s3-uri-path->prefix (String -> String))
(define (s3-uri-path->prefix path)
(if (string=? path "")
View
73 mapred/input/s3-split.rkt
@@ -0,0 +1,73 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Ray Racine's Munger Library
+;; Copyright (C) 2007-2013 Raymond Paul Racine
+;;
+;; This program is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+;;
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with this program. If not, see <http://www.gnu.org/licenses/>.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#lang typed/racket/base
+
+(provide:
+ [s3-split-bucket-prefix (String String String Natural Natural -> (Values BlockSet String))])
+
+(require
+ racket/pretty
+ (only-in httpclient/uri
+ Uri)
+ (only-in aws/s3/types
+ Key Key-key Key-size
+ Keys Keys-objects)
+ (only-in aws/s3/s3-uri
+ new-s3-uri)
+ (only-in aws/s3/objects
+ s3-list-bucket-objects)
+ (only-in "../types.rkt"
+ Block BlockSet)
+ (only-in "split.rkt"
+ n-block))
+
+#| Basic Splitting |#
+#|
+Focus on 3 Storage Options.
+1) A set of zipfiles.
+2) A set of LZO files.
+3) A set of textual files.
+|#
+
+;; Maximum number of objects to fetch in a single S3 listing call.
+(define max-listing-size 10)
+
+;; List objects under BUCKET/PREFIX (one listing call, at most
+;; max-listing-size entries, starting from MARKER) and cut them into Blocks.
+;;
+;; Keys are walked in listing order.  Every key whose size is > 0 is split with
+;; n-block using MAX-SPLIT-SIZE and its blocks are prepended to the
+;; accumulator; zero-sized keys add no blocks.  The walk stops when the listing
+;; is exhausted or once at least MIN-SPLITS blocks have been accumulated.
+;;
+;; Returns two values:
+;;   1. a BlockSet whose uri is (new-s3-uri bucket prefix) holding the blocks
+;;   2. the name of the last key examined ("" when no key was examined)
+(: s3-split-bucket-prefix (String String String Natural Natural -> (Values BlockSet String)))
+(define (s3-split-bucket-prefix bucket prefix marker min-splits max-split-size)
+  (define listing
+    (s3-list-bucket-objects bucket prefix "" marker max-listing-size))
+  (let: scan : (Values BlockSet String)
+    ((remaining : (Listof Key) (Keys-objects listing))
+     (acc : (Listof Block) '())
+     (last-seen : String ""))
+    (cond
+      ((or (null? remaining)
+           (>= (length acc) min-splits))
+       (values (BlockSet (new-s3-uri bucket prefix) acc)
+               last-seen))
+      (else
+       (let* ((entry (car remaining))
+              (name (Key-key entry))
+              (size (Key-size entry)))
+         ;; The key name is recorded even when the object is empty.
+         (scan (cdr remaining)
+               (if (> size 0)
+                   (append (n-block name size max-split-size) acc)
+                   acc)
+               name))))))
+
+
View
70 mapred/input/split.rkt
@@ -0,0 +1,70 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Ray Racine's Munger Library
+;; Copyright (C) 2007-2013 Raymond Paul Racine
+;;
+;; This program is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+;;
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with this program. If not, see <http://www.gnu.org/licenses/>.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#lang typed/racket/base
+
+#| Utilities to split large data sources into small Blocks. |#
+
+(provide:
+ [n-block (String Nonnegative-Integer Nonnegative-Integer -> (Listof Block))]
+ [n-rdd (RDD Natural -> RDD)]
+ [n-blockset (BlockSet Natural -> (Listof BlockSet))])
+
+(require
+ (only-in "../types.rkt"
+ RDD RDD-blocksets
+ Block BlockSet BlockSet-uri BlockSet-blocks
+ Range))
+
+;; Cut an object of OBJECT-SZ located at LOC into Blocks of BLOCK-SZ.
+;;
+;; One Block is produced per whole multiple of BLOCK-SZ, covering
+;; (Range (* i BLOCK-SZ) (* (add1 i) BLOCK-SZ)) for i = 0, 1, ...  When
+;; OBJECT-SZ is not an exact multiple, one extra Block covers the remainder;
+;; it is consed onto the FRONT of the result, ahead of the whole blocks.
+;; BLOCK-SZ must be non-zero: quotient/remainder raises on a zero divisor.
+(: n-block (String Nonnegative-Integer Nonnegative-Integer -> (Listof Block)))
+(define (n-block loc object-sz block-sz)
+  (let-values (((whole-count tail-sz) (quotient/remainder object-sz block-sz)))
+    (define: whole-blocks : (Listof Block)
+      (for/list ((idx (in-range whole-count))
+                 #:when (>= idx 0)) ;; for type-checker
+        (Block loc (Range (* idx block-sz)
+                          (* (add1 idx) block-sz)))))
+    (cond
+      ((> tail-sz 0) ;; partial block
+       (let ((tail-start (* whole-count block-sz)))
+         (cons (Block loc (Range tail-start (+ tail-start tail-sz)))
+               whole-blocks)))
+      (else whole-blocks))))
+
+;; Split a BlockSet into smaller BlockSets holding at most N Blocks each.
+;; Individual Blocks are never split.
+;;
+;; Every resulting BlockSet reuses the uri of the input BlockSet.  Blocks are
+;; accumulated by consing, so the blocks inside each chunk, and the chunks
+;; themselves, come out in reverse of the input order.  An input with no blocks
+;; yields a single BlockSet with an empty block list.
+;;
+;; N must be positive.  With N = 0 no block could ever be placed in a chunk and
+;; the loop would emit empty BlockSets forever, so that case raises an error.
+(: n-blockset (BlockSet Natural -> (Listof BlockSet)))
+(define (n-blockset blockset n)
+  (when (zero? n)
+    (error 'n-blockset "n must be a positive integer, given: ~a" n))
+  (define uri (BlockSet-uri blockset))
+  (let: loop : (Listof BlockSet) ((blocks : (Listof Block) '())
+                                  (blocksets : (Listof BlockSet) '())
+                                  (counter : Natural n)
+                                  (all-blocks : (Listof Block) (BlockSet-blocks blockset)))
+    (if (null? all-blocks)
+        ;; Input exhausted: flush whatever is in the current chunk.
+        (cons (BlockSet uri blocks) blocksets)
+        (if (zero? counter)
+            ;; Current chunk is full: emit it and start a fresh one.
+            (loop '() (cons (BlockSet uri blocks) blocksets) n all-blocks)
+            (loop (cons (car all-blocks) blocks) blocksets (sub1 counter) (cdr all-blocks))))))
+
+;; Rebuild an RDD so that none of its BlockSets holds more than N Blocks.
+;; Each BlockSet of the input is split with n-blockset and the resulting
+;; lists are appended, in input order, into the new RDD.
+(: n-rdd (RDD Natural -> RDD))
+(define (n-rdd rdd n)
+  (: split-one (BlockSet -> (Listof BlockSet)))
+  (define (split-one blockset)
+    (n-blockset blockset n))
+  (RDD (apply append (map split-one (RDD-blocksets rdd)))))
+
View
45 mapred/input/text-split.rkt
@@ -0,0 +1,45 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Ray Racine's Munger Library
+;; Copyright (C) 2007-2013 Raymond Paul Racine
+;;
+;; This program is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+;;
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with this program. If not, see <http://www.gnu.org/licenses/>.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#lang typed/racket/base
+
+(provide:
+ [rdd-text (case-> (Path -> (RDD Text))
+ (Path Natural -> (RDD Text)))])
+
+(require
+ (only-in httpclient/uri/filescheme
+ local-path->uri)
+ (only-in "../config.rkt"
+ DEFAULT-BLOCK-SIZE)
+ (only-in "../types.rkt"
+ Text RDD BlockSet)
+(only-in "split.rkt"
+ n-block))
+
+;; Build an RDD from the files directly inside BASE-DIR-PATH.
+;;
+;; The result holds a single BlockSet whose uri is
+;; (local-path->uri base-dir-path).  Each regular file in the directory is cut
+;; into Blocks of BLOCK-SIZE (default DEFAULT-BLOCK-SIZE) with n-block; the
+;; Block location is the file name relative to BASE-DIR-PATH.
+;;
+;; directory-list also returns subdirectories, on which file-size raises, so
+;; entries that are not regular files are skipped rather than aborting the
+;; whole build.  Subdirectories are not descended into.
+(: rdd-text (case-> (Path -> (RDD Text))
+                    (Path Natural -> (RDD Text))))
+(define (rdd-text base-dir-path [block-size DEFAULT-BLOCK-SIZE])
+  (RDD (list (BlockSet (local-path->uri base-dir-path)
+                       (apply append
+                              (map (λ: ((file-name : Path))
+                                     (let ((full-path (path->complete-path file-name base-dir-path)))
+                                       ;; file-exists? is #f for directories.
+                                       (if (file-exists? full-path)
+                                           (n-block (path->string file-name)
+                                                    (file-size full-path)
+                                                    block-size)
+                                           '())))
+                                   (directory-list base-dir-path)))))))
+
+
Please sign in to comment.
Something went wrong with that request. Please try again.