-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
filtr.py
104 lines (79 loc) · 2.71 KB
/
filtr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Authors: Thierry Moudiki
#
# License: BSD 3
import numpy as np
import pandas as pd
import polars as pl
from ..utils import parse_request
from ..utils import polars_to_pandas, pandas_to_polars
# filtr(df, 'tip > 5')
# req = "(time == 'Dinner') & (day == 'Sun') & (tip>1.5)"
# filtr(df, req, limit=3, random=False)
# filtr(df, req, limit=4, random=True)
#
# req = "(tip>1.5)"
# filtr(df, req, limit=7, random=False)
# filtr(df, req, limit=5, random=True)
#
# req = "(tip > 5) & (size > 3)"
# filtr(df, req, limit=5, random=False)
# filtr(df, req, limit=8, random=True)
#
# req = "(tip > 5) & (size > 3) & (sex == 'Male')"
# filtr(df, req, limit=7, random=False)
# filtr(df, req, limit=8, random=True)
def filtr(df, req, limit=None, random=False, seed=123):
    """ Filter rows, based on given criteria.

    Args:

        df: pandas or polars data frame
            data frame whose rows are filtered; a polars input is converted
            to pandas for the filtering step and the result is converted
            back to polars before being returned

        req: str
            criteria for filtering the rows

        limit: int
            number of records to be retrieved (`None` returns every
            matching row); when fewer rows match, all of them are returned

        random: bool
            `True` if we want a random set of records

        seed: int
            reproducibility seed for situations where `random == True`

    Returns:

        data frame of the same flavour (pandas/polars) as the input,
        containing the rows that satisfy `req`, optionally restricted to
        the first `limit` rows or to `limit` randomly drawn rows

    Raises:

        AssertionError: if `limit` is not integer-valued

        ValueError: if `limit` is negative while `random` is true, or if
            restricting the filtered rows fails

    Examples:

        https://github.com/thierrymoudiki/querier/tree/master/querier/demo

    """
    invalid_request_msg = (
        "invalid request: check column names + contents "
        "(and parentheses for multiple conditions)"
    )

    # Record the input flavour *before* rebinding `df`: once `df` holds the
    # pandas copy, an isinstance check against pl.DataFrame can never succeed.
    return_polars = isinstance(df, pl.DataFrame)
    if return_polars:
        df = polars_to_pandas(df)

    # SECURITY NOTE(review): the string produced by `parse_request` is passed
    # to `eval`; never feed `req` from an untrusted source.
    # The local name `df` must stay bound to the pandas frame here, since the
    # evaluated expression is resolved in this function's scope (presumably
    # it refers to `df` -- confirm against `parse_request`).
    # NOTE(review): failures while evaluating the request propagate
    # unchanged, exactly as before.
    str_conds = parse_request(req)
    df_res = df[eval(str_conds)]

    if limit is not None:
        # Explicit raise instead of `assert` so the check survives `python -O`
        # while keeping the exception type callers may already rely on.
        if int(limit) != limit:
            raise AssertionError("limit must be an integer")
        limit = int(limit)

        if random and limit < 0:
            raise ValueError("limit must be non-negative when random is True")

        try:
            if random:
                # A private generator keeps the draw reproducible for a given
                # seed without reseeding NumPy's global random state.
                rng = np.random.RandomState(seed)
                n_rows = df_res.shape[0]
                # Sampling without replacement == prefix of a permutation;
                # slicing caps the sample at the number of available rows.
                picked = rng.permutation(n_rows)[:limit]
                df_res = df_res.iloc[picked]
            else:
                df_res = df_res.head(limit)
        except Exception as err:
            raise ValueError(invalid_request_msg) from err

    if return_polars:
        return pandas_to_polars(df_res)
    return df_res