This repository has been archived by the owner on Apr 22, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
/
merge_multiple_files.py
78 lines (53 loc) · 2.06 KB
/
merge_multiple_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
Brewery Example - merge multiple CSV files
Input: Multiple CSV files with different fields, but with common subset of
fields.
Output: Single CSV file with all fields from all files and with additional
column with origin file name
Run:
$ python merge_multiple_files.py
Afterwards display the CSV file:
$ cat merged.csv | brewery pipe pretty_printer
And see the field completeness (data quality dimension):
$ cat merged.csv | brewery pipe audit pretty_printer
"""
import brewery
from brewery import ds
import sys
# List of sources - you might want to keep this list in a json file
sources = [
{"file": "grants_2008.csv",
"fields": ["receiver", "amount", "date"]},
{"file": "grants_2009.csv",
"fields": ["id", "receiver", "amount", "contract_number", "date"]},
{"file": "grants_2010.csv",
"fields": ["receiver", "subject", "requested_amount", "amount", "date"]}
]
# Create list of all fields and add filename to store information
# about origin of data records
all_fields = brewery.FieldList(["file"])
# Go through source definitions and collect the fields
for source in sources:
for field in source["fields"]:
if field not in all_fields:
all_fields.append(field)
# Create and initialize a data target
out = ds.CSVDataTarget("merged.csv")
out.fields = brewery.FieldList(all_fields)
out.initialize()
# Append all sources
for source in sources:
path = source["file"]
# Initialize data source: skip reading of headers - we are preparing them ourselves
# use XLSDataSource for XLS files
# We ignore the fields in the header, because we have set-up fields
# previously. We need to skip the header row.
src = ds.CSVDataSource(path,read_header=False,skip_rows=1)
src.fields = ds.FieldList(source["fields"])
src.initialize()
for record in src.records():
# Add file reference into ouput - to know where the row comes from
record["file"] = path
out.append(record)
# Close the source stream
src.finalize()