In [2]:
import pandas as pd

# Relative path to the employee dataset (a Parquet file).
path = 'datasets/employee_list.parquet'
# Load the file into the DataFrame `df` that the cells below filter.
df = pd.read_parquet(path)

 #### Use built-in methods
 - .eq (equal)
 - .ne (not equal)
 - .le (less than or equal)
 - .gt (greater than)
 - .ge (greater than or equal)
 - .lt (less than)

In [3]:
# Find employees older than 50 years
df[df.age.gt(50)].head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
6,646135,Jonathan Brown,Wilson and Sons,USA,1969-06-05,52,Developer,77451,False
13,869838,Benjamin Snyder,Spears-Brown,Japan,1968-04-10,53,System Architect,133943,True
21,207726,Bruce Crawford,Spears-Brown,Venezuela,1969-01-04,53,Consulting,78198,False
31,254128,Larry Robinson,Wilson and Sons,India,1970-04-19,51,Management,89521,True


#### Other built-in methods
- nlargest
- nsmallest

In [10]:
# The five rows with the latest date of birth, i.e. the youngest employees by `dob`.
df.nlargest(5, 'dob')

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
80,703509,Lisa Phillips,Spears-Brown,Japan,1990-09-05,31,Management,103309,False
10,361949,Laura Lane,Wilson and Sons,India,1990-07-13,31,Developer,89915,True
97,628844,Elizabeth Thomas,Spears-Brown,Cayman Islands,1990-06-30,31,System Architect,77442,True
8,587088,David Dunn,"Hernandez, Cunningham and Clark",Japan,1989-12-18,32,Management,96973,False
44,550539,Claudia Johnson,Wilson and Sons,Japan,1989-11-14,32,Finance,100316,True


In [13]:
# keep='all' retains every row tied with the 5th-smallest age,
# so the result can contain more than five rows.
df.nsmallest(5, 'age', keep='all')

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
10,361949,Laura Lane,Wilson and Sons,India,1990-07-13,31,Developer,89915,True
80,703509,Lisa Phillips,Spears-Brown,Japan,1990-09-05,31,Management,103309,False
97,628844,Elizabeth Thomas,Spears-Brown,Cayman Islands,1990-06-30,31,System Architect,77442,True
8,587088,David Dunn,"Hernandez, Cunningham and Clark",Japan,1989-12-18,32,Management,96973,False
41,116767,Diana Brown,Spears-Brown,Israel,1989-09-23,32,System Architect,126015,True
44,550539,Claudia Johnson,Wilson and Sons,Japan,1989-11-14,32,Finance,100316,True
53,358248,Nicholas May,"Hernandez, Cunningham and Clark",USA,1989-03-25,32,Management,131896,True
69,460868,Jeremy Cook,Wilson and Sons,Israel,1989-02-08,32,Management,133361,True
96,683924,Christopher Barnett,Wilson and Sons,Mauritania,1989-10-18,32,Management,123480,True


#### Select columns by data type

In [4]:
# Keep only the integer-typed columns.
df.select_dtypes(include='int').head()

Unnamed: 0,employee_number,age,salary
0,897028,38,124790
1,463979,41,103122
2,388446,34,119072
3,267447,52,115653
4,401300,42,119412


In [5]:
# Keep every column except the integer-typed ones.
df.select_dtypes(exclude='int').head()

Unnamed: 0,name,company,country,dob,department,has_parking_space
0,Kenneth Jensen,Wilson and Sons,India,1983-07-03,Management,False
1,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,Consulting,True
2,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,Consulting,False
3,Seth Smith,Spears-Brown,Germany,1969-03-04,System Architect,False
4,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,Finance,False


#### Simple Boolean condition

In [34]:
# Rows whose country is exactly 'USA'.
df.loc[df['country'] == 'USA'].head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
6,646135,Jonathan Brown,Wilson and Sons,USA,1969-06-05,52,Developer,77451,False
15,325975,Patrick Gonzales,"Hernandez, Cunningham and Clark",USA,1976-05-28,45,Management,78081,False
19,821821,Jared Summers,Spears-Brown,USA,1982-10-28,39,Consulting,112803,False
20,830747,Laura Jones,Wilson and Sons,USA,1976-02-15,45,Consulting,127183,True
39,227851,Tiffany Adkins,Wilson and Sons,USA,1974-10-10,47,Management,109596,True


In [39]:
# Rows whose country is anything other than 'USA'.
df.loc[df['country'] != 'USA'].head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


#### Negation
Placing a **`~`** in front of a Boolean filter inverts it: the rows where the condition is `False` are selected instead.

In [38]:
# `~` inverts the isin mask: keep rows whose country is neither 'USA' nor 'Germany'.
df.loc[~df['country'].isin(['USA', 'Germany'])].head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False
5,930543,Katherine Evans,Spears-Brown,Cayman Islands,1972-08-31,49,Finance,113570,True


#### Multiple Boolean conditions

In [17]:
# Columns to show in the result.
cols = ['age', 'department', 'salary']

# Group A: older than 50 AND earning more than 130,000.
criteria_a1 = df['age'] > 50
criteria_a2 = df['salary'] > 130_000
criteria_a = criteria_a1 & criteria_a2

# Group B: younger than 35 AND earning less than 80,000.
criteria_b1 = df['age'] < 35
criteria_b2 = df['salary'] < 80_000
criteria_b = criteria_b1 & criteria_b2

# A row is selected when it belongs to either group.
df.loc[criteria_a | criteria_b, cols]

Unnamed: 0,age,department,salary
13,53,System Architect,133943
34,53,System Architect,131134
97,31,System Architect,77442


### Index filtering

In [19]:
# Move 'department' into the index and sort by it, so rows can be selected by label.
df2 = df.set_index('department').sort_index()
# .loc with an index label returns every row carrying that label.
df2.loc['Developer'].head(n=5)

Unnamed: 0_level_0,employee_number,name,company,country,dob,age,salary,has_parking_space
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Developer,560416,Justin Anderson,Spears-Brown,India,1971-12-19,50,75715,True
Developer,250029,Jennifer Rodgers,"Hernandez, Cunningham and Clark",Germany,1980-08-21,41,108389,False
Developer,990054,Elizabeth Foster,Wilson and Sons,Mauritania,1983-01-28,39,84062,True
Developer,794549,Arthur Proctor,Spears-Brown,Suriname,1977-04-22,44,85364,False
Developer,140866,Joyce Lucas,Spears-Brown,Japan,1975-05-26,46,117384,False


### Translating SQL WHERE clauses

In [22]:
# SELECT name, age, department, salary
cols = ['name', 'age', 'department', 'salary']

# WHERE department = <most frequent department>
# (value_counts sorts by frequency, so the first index entry is the top one)
top_department = df['department'].value_counts().index[:1]
department_criteria = df['department'].isin(top_department)
# AND age > 50
age_criteria = df['age'] > 50
# AND salary BETWEEN 110000 AND 140000 (both bounds inclusive)
salary_criteria = df['salary'].between(110_000, 140_000)

criteria = department_criteria & age_criteria & salary_criteria

df.loc[criteria, cols]

Unnamed: 0,name,age,department,salary
3,Seth Smith,52,System Architect,115653
13,Benjamin Snyder,53,System Architect,133943
34,Julie Craig,53,System Architect,131134


### Improve the readability of Boolean indexing with the query method

Strings passed to the `.query` method look more like plain English than normal pandas code. Python variables can be referenced with the at symbol (`@`), as with `top_department` below. All DataFrame column names are available in the query namespace and are referenced by name without extra quotes. If a string literal is needed, such as `'USA'`, it must be wrapped in inner quotes.

Another nice feature of the query syntax is the ability to combine Boolean operators using and, or, and not.



In [25]:
cols = ['name', 'age', 'department', 'salary']
# Index holding only the most frequent department; the query string refers to it via `@`.
top_department = df.department.value_counts().index[:1]

# Adjacent string literals are concatenated into one query expression.
# Note the salary range here (81000-140000) differs from the 110_000-140_000
# range used in the Boolean-indexing cell above, so more rows match.
qs = (
    'department in @top_department '
    ' and age > 50'
    ' and 81000 <= salary <= 140000' 
)

emp_filtered = df.query(qs)
emp_filtered[cols].head()

Unnamed: 0,name,age,department,salary
3,Seth Smith,52,System Architect,115653
13,Benjamin Snyder,53,System Architect,133943
34,Julie Craig,53,System Architect,131134
43,Charles Williams,52,System Architect,85485
58,Susan Horn,53,System Architect,86788


### Preserving Series size with the .where method
The `where` method is an application of the if-then idiom. For each element of the calling Series or DataFrame, if `cond` is `True` the element is kept; otherwise it is replaced by the corresponding value from `other`. The result therefore always has the same size as the original.

In [32]:
# True where the age is already below the upper bound of 50.
criteria_high = df['age'] < 50
# True where the age is already above the lower bound of 35.
criteria_low = df['age'] > 35
# Values failing a condition are replaced by the corresponding bound;
# the Series keeps its original length.
df['age'].where(criteria_high, other=50).where(criteria_low, other=35).head(15)

0     38
1     41
2     35
3     50
4     42
5     49
6     50
7     39
8     35
9     44
10    35
11    44
12    40
13    50
14    45
Name: age, dtype: int64

We can do the same thing by using the `clip` method

In [15]:
# Clamp ages to the closed range [35, 50] in a single call.
df['age'].clip(35, 50).head(15)

0     38
1     41
2     35
3     50
4     42
5     49
6     50
7     39
8     35
9     44
10    35
11    44
12    40
13    50
14    45
Name: age, dtype: int64

### Mask
- The **`mask()`** method replaces the values of the rows where the condition evaluates to `True`.
- The **`mask()`** method is the opposite of the `where()` method, which replaces the values where the condition evaluates to `False`.
- syntax:
  - `dataframe.mask(cond, other, inplace, axis, level)` (older pandas versions also accepted `errors` and `try_cast`; both were removed in pandas 2.0)

In [54]:
cols = ['name', 'age', 'department', 'salary']
c1 = df['age'] > 33
c2 = df['department'] == 'System Architect'
c3 = df['salary'] > 100_000
# A row is masked out when ANY of the three conditions holds.
criteria = c1 | c2 | c3

# .mask replaces rows where `criteria` is True with NaN (no `other` is given);
# .dropna then removes those rows. Inserting NaN turns the integer
# columns into floats, which is why age and salary show as e.g. 32.0.
df[cols].mask(criteria).dropna()

Unnamed: 0,name,age,department,salary
8,David Dunn,32.0,Management,96973.0
10,Laura Lane,31.0,Developer,89915.0


### Dynamic filtering using query method

In [55]:
# Preview the first rows to recall the available columns before building queries.
df.head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


In [60]:
from collections import namedtuple
# One filter condition: `column` is the column name, `eq_sign` is a comparison
# operator or the dotted name of a method to call on the column, and `value`
# is the operand passed to that operator or method.
Constraint = namedtuple('Constraint', 'column, eq_sign, value')

def create_query_substring(c):
    """Render one constraint as a fragment of a ``DataFrame.query`` expression.

    Parameters
    ----------
    c : Constraint
        Any object exposing ``column``, ``eq_sign`` and ``value`` attributes.
        ``eq_sign`` is either a comparison operator
        (``==``, ``!=``, ``<``, ``<=``, ``>``, ``>=``) or the dotted name of a
        method to call on the column (for example ``'str.contains'``).

    Returns
    -------
    str
        ``'<column> <op> <value>'`` for comparison operators, otherwise
        ``'<column>.<method>(<value>)'``.
    """
    # All binary comparisons must be rendered infix; treating '>=', '<=' or
    # '!=' as a method name would yield an invalid fragment like 'age.>=(50)'.
    comparison_operators = ('==', '!=', '<', '<=', '>', '>=')
    if c.eq_sign in comparison_operators:
        return f'{c.column} {c.eq_sign} {c.value!r}'
    # Method call: string arguments need quoting, other values are inserted as-is.
    value = repr(c.value) if isinstance(c.value, str) else c.value
    return f'{c.column}.{c.eq_sign}({value})'

def create_query_string(constraints):
    """Build one query expression by joining every rendered constraint with ``&``."""
    fragments = map(create_query_substring, constraints)
    return ' & '.join(fragments)

# Each constraint is (column, operator-or-method, value).
constraints = [
    Constraint('company', '==', 'Spears-Brown'),
    Constraint('age', '>', 50),
    Constraint('name', 'str.contains', 'Seth'),
]

# Render all constraints into a single expression string.
query_str = create_query_string(constraints)

# NOTE(review): engine='python' is presumably needed so the `.str.contains(...)`
# method call can be evaluated — confirm against the pandas version in use.
df.query(query_str, engine='python')

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
