forked from haskell-hvr/cassava
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Encoding.hs
296 lines (260 loc) · 10.3 KB
/
Encoding.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
{-# LANGUAGE BangPatterns, OverloadedStrings #-}
-- Module: Data.Csv.Encoding
-- Copyright: (c) 2011 MailRank, Inc.
-- (c) 2012 Johan Tibell
-- License: BSD3
-- Maintainer: Johan Tibell <johan.tibell@gmail.com>
-- Stability: experimental
-- Portability: portable
--
-- Encoding and decoding of data types into CSV.
module Data.Csv.Encoding
(
-- * Encoding and decoding
decode
, decodeByName
, encode
, encodeByName
-- ** Encoding and decoding options
, DecodeOptions(..)
, defaultDecodeOptions
, spaceDecodeOptions
, decodeWith
, decodeByNameWith
, EncodeOptions(..)
, defaultEncodeOptions
, spaceEncodeOptions
, encodeWith
, encodeByNameWith
) where
import Blaze.ByteString.Builder (Builder, fromByteString, fromWord8,
toLazyByteString)
import Blaze.ByteString.Builder.Char8 (fromString)
import Control.Applicative ((*>), (<|>), optional, pure)
import Data.Attoparsec.Char8 (endOfInput, endOfLine)
import qualified Data.Attoparsec.ByteString.Lazy as AL
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as BL8
import qualified Data.HashMap.Strict as HM
import Data.Monoid (mconcat, mempty)
import Data.Vector (Vector)
import qualified Data.Vector as V
import Data.Word (Word8)
import Prelude hiding (unlines)
import Data.Csv.Compat.Monoid ((<>))
import Data.Csv.Conversion (FromNamedRecord, FromRecord, ToNamedRecord,
ToRecord, parseNamedRecord, parseRecord, runParser,
toNamedRecord, toRecord)
import Data.Csv.Parser hiding (csv, csvWithHeader)
import qualified Data.Csv.Parser as Parser
import Data.Csv.Types hiding (toNamedRecord)
import qualified Data.Csv.Types as Types
import Data.Csv.Util (blankLine)
-- TODO: 'encode' isn't as efficient as it could be.
------------------------------------------------------------------------
-- * Encoding and decoding
-- | Efficiently deserialize CSV records from a lazy 'L.ByteString'.
-- If this fails due to incomplete or invalid input, @'Left' msg@ is
-- returned. Equivalent to @'decodeWith' 'defaultDecodeOptions'@.
decode :: FromRecord a
=> Bool -- ^ Data contains header that should be
-- skipped
-> L.ByteString -- ^ CSV data
-> Either String (Vector a)
decode = decodeWith defaultDecodeOptions
{-# INLINE decode #-}
-- | Efficiently deserialize CSV records from a lazy 'L.ByteString'.
-- If this fails due to incomplete or invalid input, @'Left' msg@ is
-- returned. The data is assumed to be preceeded by a header.
-- Equivalent to @'decodeByNameWith' 'defaultDecodeOptions'@.
decodeByName :: FromNamedRecord a
=> L.ByteString -- ^ CSV data
-> Either String (Header, Vector a)
decodeByName = decodeByNameWith defaultDecodeOptions
{-# INLINE decodeByName #-}
-- | Efficiently serialize CSV records as a lazy 'L.ByteString'.
encode :: ToRecord a => V.Vector a -> L.ByteString
encode = encodeWith defaultEncodeOptions
{-# INLINE encode #-}
-- | Efficiently serialize CSV records as a lazy 'L.ByteString'. The
-- header is written before any records and dictates the field order.
encodeByName :: ToNamedRecord a => Header -> V.Vector a -> L.ByteString
encodeByName = encodeByNameWith defaultEncodeOptions
{-# INLINE encodeByName #-}
------------------------------------------------------------------------
-- ** Encoding and decoding options
-- | Like 'decode', but lets you customize how the CSV data is parsed.
decodeWith :: FromRecord a
=> DecodeOptions -- ^ Decoding options
-> Bool -- ^ Data contains header that should be
-- skipped
-> L.ByteString -- ^ CSV data
-> Either String (Vector a)
decodeWith = decodeWithC csv
{-# INLINE [1] decodeWith #-}
{-# RULES
"idDecodeWith" decodeWith = idDecodeWith
#-}
-- | Same as 'decodeWith', but more efficient as no type
-- conversion is performed.
idDecodeWith :: DecodeOptions -> Bool -> L.ByteString
-> Either String (Vector (Vector B.ByteString))
idDecodeWith = decodeWithC Parser.csv
-- | Decode CSV data using the provided parser, skipping a leading
-- header if 'skipHeader' is 'True'. Returns 'Left' @errMsg@ on
-- failure.
decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> Bool
-> BL8.ByteString -> Either String a
decodeWithC p !opts skipHeader = decodeWithP parser
where parser
| skipHeader = header opts *> p opts
| otherwise = p opts
{-# INLINE decodeWithC #-}
-- | Like 'decodeByName', but lets you customize how the CSV data is
-- parsed.
decodeByNameWith :: FromNamedRecord a
=> DecodeOptions -- ^ Decoding options
-> L.ByteString -- ^ CSV data
-> Either String (Header, Vector a)
decodeByNameWith !opts = decodeWithP (csvWithHeader opts)
-- | Options that controls how data is encoded. These options can be
-- used to e.g. encode data in a tab-separated format instead of in a
-- comma-separated format.
--
-- To avoid having your program stop compiling when new fields are
-- added to 'EncodeOptions', create option records by overriding
-- values in 'defaultEncodeOptions'. Example:
--
-- > myOptions = defaultEncodeOptions {
-- > encDelimiter = fromIntegral (ord '\t')
-- > }
data EncodeOptions = EncodeOptions
{ -- | Field delimiter.
encDelimiter :: {-# UNPACK #-} !Word8
} deriving (Eq, Show)
-- | Encoding options for CSV files. Comma is used as separator.
defaultEncodeOptions :: EncodeOptions
defaultEncodeOptions = EncodeOptions
{ encDelimiter = 44 -- comma
}
-- | Encode options for space-delimited files. Tab is used as separator.
spaceEncodeOptions :: EncodeOptions
spaceEncodeOptions = EncodeOptions
{ encDelimiter = 9 -- tab
}
-- | Like 'encode', but lets you customize how the CSV data is
-- encoded.
encodeWith :: ToRecord a => EncodeOptions -> V.Vector a -> L.ByteString
encodeWith opts = toLazyByteString
. unlines
. map (encodeRecord (encDelimiter opts) . toRecord)
. V.toList
{-# INLINE encodeWith #-}
encodeRecord :: Word8 -> Record -> Builder
encodeRecord delim = mconcat . intersperse (fromWord8 delim)
. map fromByteString . map (escape delim) . V.toList
{-# INLINE encodeRecord #-}
-- TODO: Optimize
escape :: Word8 -> B.ByteString -> B.ByteString
escape delim s
| B.find (\ b -> b == dquote || b == delim || b == nl || b == cr ||
b == sp) s == Nothing = s
| otherwise =
B.concat ["\"",
B.concatMap
(\ b -> if b == dquote then "\"\"" else B.singleton b) s,
"\""]
where
sp = 32
dquote = 34
nl = 10
cr = 13
-- | Like 'encodeByName', but lets you customize how the CSV data is
-- encoded.
encodeByNameWith :: ToNamedRecord a => EncodeOptions -> Header -> V.Vector a
-> L.ByteString
encodeByNameWith opts hdr v =
toLazyByteString ((encodeRecord (encDelimiter opts) hdr) <>
fromByteString "\r\n" <> records)
where
records = unlines
. map (encodeRecord (encDelimiter opts)
. namedRecordToRecord hdr . toNamedRecord)
. V.toList $ v
{-# INLINE encodeByNameWith #-}
namedRecordToRecord :: Header -> NamedRecord -> Record
namedRecordToRecord hdr nr = V.map find hdr
where
find n = case HM.lookup n nr of
Nothing -> moduleError "namedRecordToRecord" $
"header contains name " ++ show (B8.unpack n) ++
" which is not present in the named record"
Just v -> v
moduleError :: String -> String -> a
moduleError func msg = error $ "Data.Csv.Encoding." ++ func ++ ": " ++ msg
{-# NOINLINE moduleError #-}
unlines :: [Builder] -> Builder
unlines [] = mempty
unlines (b:bs) = b <> fromString "\r\n" <> unlines bs
intersperse :: Builder -> [Builder] -> [Builder]
intersperse _ [] = []
intersperse sep (x:xs) = x : prependToAll sep xs
prependToAll :: Builder -> [Builder] -> [Builder]
prependToAll _ [] = []
prependToAll sep (x:xs) = sep <> x : prependToAll sep xs
decodeWithP :: AL.Parser a -> L.ByteString -> Either String a
decodeWithP p s =
case AL.parse p s of
AL.Done _ v -> Right v
AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++
show (BL8.unpack left)
{-# INLINE decodeWithP #-}
-- These alternative implementation of the 'csv' and 'csvWithHeader'
-- parsers from the 'Parser' module performs the
-- 'FromRecord'/'FromNamedRecord' conversions on-the-fly, thereby
-- avoiding the need to hold a big 'CSV' value in memory. The 'CSV'
-- type has a quite large memory overhead due to high constant
-- overheads of 'B.ByteString' and 'V.Vector'.
-- TODO: Check that the error messages don't duplicate prefixes, as in
-- "parse error: conversion error: ...".
-- | Parse a CSV file that does not include a header.
csv :: FromRecord a => DecodeOptions -> AL.Parser (V.Vector a)
csv !opts = do
vals <- records
_ <- optional endOfLine
endOfInput
return $! V.fromList vals
where
records = do
!r <- record opts
if blankLine r
then (endOfLine *> records) <|> pure []
else case runParser (parseRecord r) of
Left msg -> fail $ "conversion error: " ++ msg
Right val -> do
!vals <- (endOfLine *> records) <|> pure []
return (val : vals)
{-# INLINE csv #-}
-- | Parse a CSV file that includes a header.
csvWithHeader :: FromNamedRecord a => DecodeOptions
-> AL.Parser (Header, V.Vector a)
csvWithHeader !opts = do
!hdr <- header opts
vals <- records hdr
_ <- optional endOfLine
endOfInput
let !v = V.fromList vals
return (hdr, v)
where
records hdr = do
!r <- record opts
if blankLine r
then (endOfLine *> records hdr) <|> pure []
else case runParser (convert hdr r) of
Left msg -> fail $ "conversion error: " ++ msg
Right val -> do
!vals <- (endOfLine *> records hdr) <|> pure []
return (val : vals)
convert hdr = parseNamedRecord . Types.toNamedRecord hdr