Skip to content

Commit

Permalink
feat: some CSV manipulation scripts
Browse files Browse the repository at this point in the history
Signed-off-by: Evan Prodromou <evan@openearth.org>
  • Loading branch information
evanp committed Mar 28, 2024
1 parent 6ba03fc commit fc42d15
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 0 deletions.
22 changes: 22 additions & 0 deletions global-api/importer/misc/dedupe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from csv import DictReader, DictWriter
import sys
import csv

# Raise the csv module's per-field size cap as far as the platform allows so
# that very large cells can be read. The cap is stored in a C long, which is
# narrower than sys.maxsize on some platforms (e.g. 64-bit Windows); there,
# passing sys.maxsize raises OverflowError, so halve until it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

def main(input, output, key):
    """Copy a CSV file, keeping only the first row seen for each key value.

    Args:
        input: Path of the CSV file to read; its first row is the header.
        output: Path of the CSV file to write (overwritten if it exists).
        key: Name of the column whose value identifies duplicate rows.

    Raises:
        KeyError: If a row has no ``key`` column.

    An input without a header row (an empty file) yields an empty output.
    """
    seen = set()
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires: line breaks inside quoted fields survive
    # unchanged and no extra carriage returns appear on platforms that
    # translate newlines.
    with open(input, 'r', newline='') as src, \
            open(output, 'w', newline='') as dst:
        reader = DictReader(src)
        fieldnames = reader.fieldnames or []
        writer = DictWriter(dst, fieldnames=fieldnames)
        if fieldnames:
            writer.writeheader()
        for row in reader:
            value = row[key]
            if value in seen:
                continue
            seen.add(value)
            writer.writerow(row)

# Command-line entry point: dedupe.py INPUT OUTPUT KEY
if __name__ == '__main__':
    import sys

    args = sys.argv
    main(input=args[1], output=args[2], key=args[3])
26 changes: 26 additions & 0 deletions global-api/importer/misc/mergecsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from csv import DictReader, DictWriter
import sys
import csv

# Raise the csv module's per-field size cap as far as the platform allows so
# that very large cells can be read. The cap is stored in a C long, which is
# narrower than sys.maxsize on some platforms (e.g. 64-bit Windows); there,
# passing sys.maxsize raises OverflowError, so halve until it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

def main(key, output, inputs):
    """Merge several CSV files into one, keeping a single row per key value.

    Files are read in the order given. When a key value occurs more than
    once (within one file or across files), the row read last replaces the
    earlier one but stays at the position where that key first appeared.
    The output columns are those of the first input that has a header row.

    Args:
        key: Name of the column that identifies a row.
        output: Path of the CSV file to write (overwritten if it exists).
        inputs: Iterable of paths of the CSV files to merge.

    Raises:
        KeyError: If a row has no ``key`` column.
        ValueError: If a kept row has a column that is not in the output
            header (raised by ``DictWriter``).

    If no input has a header row, the output file is created empty.
    """
    merged = {}
    fieldnames = None
    for path in inputs:
        # newline='' hands line-ending handling to the csv module, as its
        # documentation requires, so line breaks inside quoted fields are
        # preserved exactly.
        with open(path, 'r', newline='') as src:
            reader = DictReader(src)
            if not fieldnames:
                fieldnames = reader.fieldnames
            for row in reader:
                merged[row[key]] = row

    with open(output, 'w', newline='') as dst:
        if fieldnames:
            writer = DictWriter(dst, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(merged.values())

# Command-line entry point: mergecsv.py KEY OUTPUT INPUT [INPUT ...]
if __name__ == '__main__':
    import sys

    args = sys.argv
    main(key=args[1], output=args[2], inputs=args[3:])
25 changes: 25 additions & 0 deletions global-api/importer/misc/ordercols.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from csv import DictReader, DictWriter
import sys
import csv

# Raise the csv module's per-field size cap as far as the platform allows so
# that very large cells can be read. The cap is stored in a C long, which is
# narrower than sys.maxsize on some platforms (e.g. 64-bit Windows); there,
# passing sys.maxsize raises OverflowError, so halve until it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

def main(first, others):
    """Rewrite CSV files so their columns follow the order used by ``first``.

    Each file in ``others`` is copied to ``<path>.reordered`` with the header
    of ``first``; the original files are left untouched. A column that a
    file lacks is written as an empty string (``DictWriter``'s default).

    Args:
        first: Path of the CSV file whose header defines the column order.
        others: Iterable of paths of the CSV files to rewrite.

    Raises:
        ValueError: If ``first`` has no header row, or if a row in one of
            ``others`` has a column that ``first`` does not (raised by
            ``DictWriter``).
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so line breaks inside quoted fields are
    # preserved exactly.
    with open(first, 'r', newline='') as ref:
        fieldnames = DictReader(ref).fieldnames

    # Fail before creating any output file rather than part-way through.
    if not fieldnames:
        raise ValueError('no header row found in %r' % (first,))

    for other in others:
        with open(other, 'r', newline='') as src, \
                open(other + '.reordered', 'w', newline='') as dst:
            writer = DictWriter(dst, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(DictReader(src))

# Command-line entry point: ordercols.py FIRST [OTHER ...]
if __name__ == '__main__':
    import sys

    args = sys.argv
    main(first=args[1], others=args[2:])
14 changes: 14 additions & 0 deletions global-api/importer/misc/semicolon_to_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from csv import DictReader, DictWriter

def main(inputfile, outputfile):
    """Convert a semicolon-delimited file into a comma-delimited CSV file.

    Args:
        inputfile: Path of the ``;``-delimited file to read; its first row
            is the header.
        outputfile: Path of the CSV file to write (overwritten if it exists).

    Raises:
        ValueError: If a row has more fields than the header (raised by
            ``DictWriter``).

    An input without a header row (an empty file) yields an empty output.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so line breaks inside quoted fields are
    # preserved exactly.
    with open(inputfile, 'r', newline='') as src, \
            open(outputfile, 'w', newline='') as dst:
        reader = DictReader(src, delimiter=';')
        fieldnames = reader.fieldnames or []
        writer = DictWriter(dst, delimiter=',', fieldnames=fieldnames)
        if fieldnames:
            writer.writeheader()
        writer.writerows(reader)

# Command-line entry point: semicolon_to_csv.py INPUTFILE OUTPUTFILE
if __name__ == '__main__':
    import sys

    args = sys.argv
    main(inputfile=args[1], outputfile=args[2])
24 changes: 24 additions & 0 deletions global-api/importer/misc/sortcsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from csv import DictReader, DictWriter
import sys
import csv

# Raise the csv module's per-field size cap as far as the platform allows so
# that very large cells can be read. The cap is stored in a C long, which is
# narrower than sys.maxsize on some platforms (e.g. 64-bit Windows); there,
# passing sys.maxsize raises OverflowError, so halve until it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

def main(key, input, output):
    """Write a copy of a CSV file with its rows sorted by one column.

    The key column is compared as text, so values sort lexicographically
    (for example ``'10'`` comes before ``'9'``). Rows that share a key value
    are all kept, in their original relative order; no row is dropped.

    The input is read completely and closed before the output is opened, so
    ``input`` and ``output`` may name the same file.

    Args:
        key: Name of the column to sort by.
        input: Path of the CSV file to read; its first row is the header.
        output: Path of the CSV file to write (overwritten if it exists).

    Raises:
        KeyError: If a row has no ``key`` column.

    An input without a header row (an empty file) yields an empty output.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so line breaks inside quoted fields are
    # preserved exactly.
    with open(input, 'r', newline='') as src:
        reader = DictReader(src)
        fieldnames = reader.fieldnames
        # sorted() is stable, so rows with equal keys keep their input order.
        rows = sorted(reader, key=lambda row: row[key])

    with open(output, 'w', newline='') as dst:
        if fieldnames:
            writer = DictWriter(dst, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

# Command-line entry point: sortcsv.py KEY INPUT OUTPUT
if __name__ == '__main__':
    import sys

    args = sys.argv
    main(key=args[1], input=args[2], output=args[3])

0 comments on commit fc42d15

Please sign in to comment.