-
Notifications
You must be signed in to change notification settings - Fork 1
/
load_data.py
87 lines (74 loc) · 3.08 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf-8 -*-
import csv
import logging
import ntpath
import os
import sys
from _csv import QUOTE_MINIMAL
from glob import glob
from pathlib import Path

from .utils import Utils
class LoadData():
    """Main Class for ingesting data and building summary statistics.

    - will process CSV data into dict/set structures
    - will filter data, cleaned output can be stored
    - will generate statistics
    """

    @staticmethod
    def loop_input_records(records, transferlimit, import_mapper, config):
        """Feed input records to import_mapper until exhausted or a limit is hit.

        Args:
            records: Iterable of input records. For local input each item is
                the record itself; otherwise each item is indexed, with the
                row number at index 0 and the record at index 2.
            transferlimit: Stop once this many records have been looped;
                a falsy value disables the limit.
            import_mapper: Object providing ``parseJsonRecord(record, type)``
                and ``parse_csv_record(record)``; it receives every
                non-empty record.
            config: Object providing ``is_local_input``, ``local_file_type``,
                ``input_lbsn_type`` and ``end_with_db_row_number``.

        Returns:
            Tuple ``(processed_records, finished)``: the number of records
            looped (skipped empty records are counted too) and whether one
            of the limits ended the loop early.

        Raises:
            SystemExit: If local input uses a file type other than json,
                txt or csv.
        """
        finished = False
        processed_records = 0
        db_row_number = 0
        for record in records:
            processed_records += 1
            if config.is_local_input:
                single_record = record
            else:
                db_row_number = record[0]
                single_record = record[2]
            if LoadData.skip_empty_or_other(single_record):
                continue
            # Non-local (database) input is always parsed as json.
            if config.local_file_type == 'json' or not config.is_local_input:
                import_mapper.parseJsonRecord(
                    single_record, config.input_lbsn_type)
            elif config.local_file_type in ('txt', 'csv'):
                import_mapper.parse_csv_record(single_record)
            else:
                # sys.exit instead of the site-provided exit() builtin, which
                # is not guaranteed to exist (e.g. when run with python -S).
                sys.exit(f'Format {config.local_file_type} not supportet.')
            if (transferlimit and processed_records >= transferlimit) or \
                    (not config.is_local_input
                     and config.end_with_db_row_number
                     and db_row_number >= config.end_with_db_row_number):
                finished = True
                break
        return processed_records, finished

    @staticmethod
    def fetch_csv_data_from_file(source_config, loc_filelist=None,
                                 start_file_id=0):
        """Read csv entries from file (either *.txt or *.csv).

        The actual CSV formatting is not setable in config yet. The file is
        read comma-delimited with QUOTE_NONE (per the original note: media
        saved from Flickr does not contain any quotes "").

        Args:
            source_config: Configuration used to locate the input files when
                ``loc_filelist`` is not given.
                NOTE(review): assumed to be the same config object that
                ``read_local_files`` accepts - confirm against callers.
            loc_filelist: Optional sequence of file paths to read from.
            start_file_id: Index into the file list of the file to read.

        Returns:
            List of rows (each a list of strings) with the header line
            removed, or None if the file holds no data rows.
        """
        # The original body referenced loc_filelist/start_file_id without
        # defining them; they are optional parameters now so one-argument
        # callers keep working.
        if loc_filelist is None:
            loc_filelist = LoadData.read_local_files(source_config)
        loc_file = loc_filelist[start_file_id]
        # The 'HF' helper used here originally is not defined or imported in
        # this module, so the message goes through the stdlib logger.
        logging.getLogger(__name__).debug(
            f'\nCurrent file: {ntpath.basename(loc_file)}')
        # newline='' lets the csv module handle line endings itself.
        with open(loc_file, 'r', encoding="utf-8", errors='replace',
                  newline='') as file:
            reader = csv.reader(
                file, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONE)
            next(reader, None)  # skip headerline
            records = list(reader)
        if not records:
            return None
        return records

    @staticmethod
    def read_local_files(config):
        """Return the list of input file paths selected by config.

        Args:
            config: Object providing ``input_folder`` (a path) and
                ``source["Main"]["file_extension"]`` (extension without dot).

        Returns:
            Sorted list of paths in ``input_folder`` matching the extension.

        Raises:
            SystemExit: If no matching file exists.
        """
        input_path = Path(config.input_folder)
        extension = config.source["Main"]["file_extension"]
        # glob order is arbitrary; sort so that positions in the returned
        # list are stable between runs.
        filelist = sorted(input_path.glob(f'*.{extension}'))
        if not filelist:
            sys.exit("No input files found.")
        return filelist

    @staticmethod
    def skip_empty_or_other(single_record):
        """Detect empty records.

        Returns:
            True if the record is empty (falsy) and should be skipped,
            False otherwise.
        """
        # The previous implementation returned the opposite, which made the
        # caller skip every non-empty record.
        return not single_record