This repository has been archived by the owner on Apr 22, 2024. It is now read-only.
/
stream_auditor.py
61 lines (49 loc) · 1.9 KB
/
stream_auditor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import base
from brewery import dq
class StreamAuditor(base.DataTarget):
    """Target stream that audits the values of the data appended to it.

    Every appended row or record is probed field by field and the results
    are accumulated in one :class:`brewery.dq.FieldStatistics` object per
    field. For more information about the probed value properties, please
    refer to :class:`brewery.dq.FieldStatistics`.

    :param distinct_threshold: passed unchanged as ``distinct_threshold`` to
        every :class:`brewery.dq.FieldStatistics` created by this target
        (default: 10).
    """

    def __init__(self, distinct_threshold=10):
        super(StreamAuditor, self).__init__()
        # Number of objects passed to append() since the last initialize().
        self.record_count = 0
        # Maps field -> FieldStatistics; filled lazily by _field_stat().
        self.stats = {}
        self.distinct_threshold = distinct_threshold
        self._field_names = None

    def initialize(self):
        """Reset the record counter to zero."""
        # NOTE(review): only the counter is reset here; statistics already
        # collected in ``self.stats`` are kept. Confirm this is intended if
        # a single auditor is ever initialized more than once.
        self.record_count = 0

    def append(self, obj):
        """Probe a row or a record and update the statistics.

        :param obj: a ``dict`` (or ``dict`` subclass) is treated as a record
            and probed by key; any other object is treated as a row and
            probed by position against ``self.fields``.
        :raises ValueError: when a row is appended and ``self.fields`` is
            not set.
        """
        self.record_count += 1
        # isinstance() instead of an exact type comparison so that dict
        # subclasses (OrderedDict, defaultdict, ...) are handled as records
        # rather than being indexed by position like a row.
        if isinstance(obj, dict):
            self._probe_record(obj)
        else:
            self._probe_row(obj)

    def _probe_record(self, record):
        """Probe every value of ``record`` with the statistics of its key."""
        for field, value in record.items():
            self._field_stat(field).probe(value)

    def _probe_row(self, row):
        """Probe ``row`` positionally, using ``self.fields.names()`` as keys.

        ``self.fields`` is not assigned in this class; it is expected to be
        provided from outside (presumably via the base class - TODO confirm).

        :raises ValueError: when ``self.fields`` is empty or not set.
        """
        if not self.fields:
            raise ValueError("Fields are not initialized")
        for index, field in enumerate(self.fields.names()):
            self._field_stat(field).probe(row[index])

    def finalize(self):
        """Finalize every collected statistic with the current record count."""
        for stat in self.stats.values():
            stat.finalize(self.record_count)

    def _field_stat(self, field):
        """Get single field statistics. Create it if it does not exist."""
        stat = self.stats.get(field)
        if stat is None:
            stat = dq.FieldStatistics(
                field, distinct_threshold=self.distinct_threshold)
            self.stats[field] = stat
        return stat

    @property
    def field_statistics(self):
        """Return field statistics as a dictionary: keys are field names,
        values are :class:`brewery.dq.FieldStatistics` objects."""
        return self.stats