forked from GoogleCloudPlatform/cloud-data-quality
-
Notifications
You must be signed in to change notification settings - Fork 0
/
row_by_row.yaml
186 lines (174 loc) · 4.45 KB
/
row_by_row.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
entities:
TEST_DATA:
source_database: BIGQUERY
table_name: <table_id>
dataset_name: <dataset_id>
project_name: <project_id>
columns:
UNIQUE_KEY:
name: unique_key
data_type: INTEGER
DESCRIPT:
name: descript
data_type: STRING
DATE:
name: date
data_type: DATE
TIME:
name: time
data_type: TIME
ADDRESS:
name: address
data_type: STRING
LON:
name: longitude
data_type: FLOAT
LAT:
name: latitude
data_type: FLOAT
LOCATION:
name: location
data_type: STRING
TIMESTAMP:
name: timestamp
data_type: TIMESTAMP
TAXI_TRIPS:
source_database: BIGQUERY
table_name: <table_id>
dataset_name: <dataset_id>
project_name: <project_id>
columns:
UNIQUE_KEY:
name: unique_key
data_type: STRING
TAXI_ID:
name: taxi_id
data_type: STRING
TRIP_START_TIMESTAMP:
name: trip_start_timestamp
data_type: TIMESTAMP
TRIP_END_TIMESTAMP:
name: trip_end_timestamp
data_type: TIMESTAMP
PAYMENT_TYPE:
name: payment_type
data_type: STRING
FARE:
name: fare
data_type: FLOAT
TIPS:
name: tips
data_type: FLOAT
TOLLS:
name: tolls
data_type: FLOAT
EXTRAS:
name: extras
data_type: FLOAT
TRIP_TOTAL:
name: trip_total
data_type: FLOAT
PICKUP_LOCATION:
name: pickup_location
data_type: STRING
DROPOFF_LOCATION:
name: dropoff_location
data_type: STRING
row_filters:
NONE:
filter_sql_expr: |-
True
rules:
EQUAL_COLUMNS:
rule_type: CUSTOM_SQL_STATEMENT
custom_sql_arguments:
- ref_table_id
- ref_column_id
params:
custom_sql_statement: |-
with t1 as (
select $column as col, count(*) as t1_n
from data
group by $column),
t2 as (
select $ref_column_id as col, count(*) as t2_n
from <dataset_id>.`$ref_table_id`
group by $ref_column_id
)
select t1.*, t2.t2_n
from t1
full outer join t2 using(col)
where not t1_n = t2_n
EQUAL_COLUMNS_UNIQUE_VALUES:
rule_type: CUSTOM_SQL_STATEMENT
custom_sql_arguments:
- ref_table_id
- ref_column_id
params:
custom_sql_statement: |-
with t1 as (
select
$column as t1_col,
row_number() over (order by $column) as n
from data),
t2 as (
select
$ref_column_id as t2_col,
row_number() over (order by $ref_column_id) as n
from <dataset_id>.`$ref_table_id`
)
select t1.t1_col, t2.t2_col
from t1
full outer join t2 using(n)
where not t1_col = t2_col
EQUAL_AGGREGATE:
rule_type: CUSTOM_SQL_STATEMENT
custom_sql_arguments:
key_column_id
value_column_id
ref_table_id
ref_key_column_id
ref_value_column_id
params:
custom_sql_statement: |-
with t1 as (
select $key_column_id as key, sum($value_column_id) as total
from data
group by $key_column_id),
t2 as (
select $ref_key_column_id as key, sum($ref_value_column_id) as ref_total
from <dataset_id>.`$ref_table_id`
group by $ref_key_column_id
)
select t1.*, t2.*
from t1
full outer join t2 using(key)
where not total = ref_total
rule_bindings:
EQUAL_COLUMNS:
entity_id: TEST_DATA
column_id: UNIQUE_KEY
row_filter_id: NONE
rule_ids:
- EQUAL_COLUMNS:
ref_table_id: <reference_table_id>
ref_column_id: <reference_column_id>
EQUAL_COLUMNS_UNIQUE_VALUES:
entity_id: TEST_DATA
column_id: UNIQUE_KEY
row_filter_id: NONE
rule_ids:
- EQUAL_COLUMNS_UNIQUE_VALUES:
ref_table_id: <reference_table_id>
ref_column_id: <reference_column_id>
EQUAL_AGGREGATE:
entity_id: TAXI_TRIPS
column_id: TAXI_ID
row_filter_id: NONE
rule_ids:
- EQUAL_AGGREGATE:
key_column_id: <key_column_id>
value_column_id: <value_column_id>
ref_table_id: <reference_table_id>
ref_key_column_id: <reference_key_column_id>
ref_value_column_id: <reference_value_column_id>