# This module's dataset + memory optimization

In [2]:
import pandas as pd

In [15]:
# Load the employee data; the two date-like columns are parsed as datetimes.
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
# Use the nullable "boolean" dtype instead of plain `bool`: astype(bool) would
# silently turn any missing value into True (NaN is truthy), whereas "boolean"
# keeps it as <NA>. Boolean indexing treats <NA> as False, so masks built from
# this column still work for filtering.
# NOTE(review): only matters if the CSV has blanks in this column - confirm.
df["Senior Management"] = df["Senior Management"].astype("boolean")
# Store the repeated Gender/Team labels as categoricals (memory optimization).
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-08-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-08-13 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-08-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-08-13 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2022-08-13 16:47:00,101004,1.389,True,Client Services


# Filter a DataFrame based on a condition

## With a funky syntax

In [21]:
# Keep only the rows whose Gender equals "Male" (mask built inline)
df[df["Gender"].eq("Male")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-08-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-08-13 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-08-13 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-08-13 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-08-13 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-08-13 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-08-13 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-08-13 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-08-13 16:45:00,60500,11.985,False,Business Development


In [23]:
# Keep only the rows whose Team equals "Finance" (mask built inline)
df[df["Team"].eq("Finance")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-08-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-08-13 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2022-08-13 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2022-08-13 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2022-08-13 22:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,2022-08-13 11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,2022-08-13 16:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,2022-08-13 05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,2022-08-13 08:35:00,112769,11.625,True,Finance


## There is a more elegant way to do this
And we can even combine conditions.

In [66]:
# Name the boolean mask first, then use it to select the matching rows
condition = df["Team"].eq("Finance")
df.loc[condition]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-08-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-08-13 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2022-08-13 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2022-08-13 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2022-08-13 22:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,2022-08-13 11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,2022-08-13 16:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,2022-08-13 05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,2022-08-13 08:35:00,112769,11.625,True,Finance


## Filtering on a boolean column

In [48]:
# The column is already boolean, so negating it selects the rows
# where Senior Management is not True
condition = ~df["Senior Management"]
df.loc[condition]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-08-13 11:17:00,130590,11.858,False,Finance
5,Dennis,Male,1987-04-18,2022-08-13 01:35:00,115163,10.125,False,Legal
13,Gary,Male,2008-01-27,2022-08-13 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2022-08-13 06:09:00,59414,1.256,False,Product
16,Jeremy,Male,2010-09-21,2022-08-13 05:56:00,90370,7.369,False,Human Resources
...,...,...,...,...,...,...,...,...
989,Justin,,1991-02-10,2022-08-13 16:58:00,38344,3.794,False,Legal
995,Henry,,2014-11-23,2022-08-13 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2022-08-13 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-08-13 12:39:00,96914,1.421,False,Product


In [51]:
# Show the first five rows of the DataFrame
df.head(n=5)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-08-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-08-13 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-08-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-08-13 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2022-08-13 16:47:00,101004,1.389,True,Client Services


In [68]:
# Select all rows with a salary of at least 110,000
# (the comparison is inclusive: exactly 110,000 also matches)
above_11e4 = df["Salary"].ge(110_000)
df.loc[above_11e4]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-08-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-08-13 13:00:00,138705,9.340,True,Finance
5,Dennis,Male,1987-04-18,2022-08-13 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2022-08-13 06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,2022-08-13 01:08:00,112807,17.492,True,Human Resources
...,...,...,...,...,...,...,...,...
987,Gloria,Female,2014-12-08,2022-08-13 05:08:00,136709,10.331,True,Finance
991,Rose,Female,2002-08-25,2022-08-13 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-08-13 08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,2022-08-13 06:09:00,132483,16.655,False,Distribution


In [63]:
# Rows whose Start Date is on or before 1985-01-01 (inclusive comparison);
# the date string is compared against the datetime column
prior_to_85 = df["Start Date"].le("1985-01-01")
df.loc[prior_to_85]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2022-08-13 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2022-08-13 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2022-08-13 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2022-08-13 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2022-08-13 20:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,2022-08-13 10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,2022-08-13 22:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,2022-08-13 20:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,2022-08-13 07:04:00,82871,17.999,False,Marketing


# Filter with More than One Condition (AND)

In [69]:
# Rows where Gender is "Female" AND Team is "Marketing":
# build one mask per condition, then combine them element-wise with &
marketing = df["Team"].eq("Marketing")
female = df["Gender"].eq("Female")
df.loc[female & marketing]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
43,Marilyn,Female,1980-12-07,2022-08-13 03:16:00,73524,5.207,True,Marketing
62,,Female,2007-06-12,2022-08-13 17:25:00,58112,19.414,True,Marketing
98,Tina,Female,2016-06-16,2022-08-13 19:47:00,100705,16.961,True,Marketing
140,Shirley,Female,1981-02-28,2022-08-13 13:23:00,113850,1.854,False,Marketing
158,Norma,Female,1999-02-28,2022-08-13 20:45:00,114412,8.756,True,Marketing
201,Kimberly,Female,1997-07-15,2022-08-13 05:57:00,36643,7.953,False,Marketing
220,,Female,1991-06-17,2022-08-13 12:49:00,71945,5.56,True,Marketing
305,Margaret,Female,1993-02-06,2022-08-13 13:05:00,125220,3.733,False,Marketing
319,Jacqueline,Female,1981-11-25,2022-08-13 15:01:00,145988,18.243,False,Marketing
331,Evelyn,Female,1983-09-03,2022-08-13 13:58:00,36759,17.269,True,Marketing


# Filter with More than One Condition (OR)


In [70]:
# Show the first five rows of the DataFrame
df.head(5)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-08-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-08-13 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-08-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-08-13 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2022-08-13 16:47:00,101004,1.389,True,Client Services


In [73]:
# Keep the rows where at least one condition is true (element-wise OR):
# Senior Management is True, or Start Date is on or before 1990-01-01
start_date = df["Start Date"].le("1990-01-01")
senior = df["Senior Management"]
df.loc[senior | start_date]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-08-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-08-13 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-08-13 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-08-13 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-08-13 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,2022-08-13 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2022-08-13 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2022-08-13 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-08-13 06:30:00,42392,19.675,False,Finance


In [87]:
# More than two conditions can be combined.
# Select the rows where (First Name == "Robert" AND Team == "Client Services")
# OR Start Date is on or after 1 June 2016 (inclusive comparison).

start_date = df["Start Date"].ge("2016-06-01")
team = df["Team"].eq("Client Services")
name = df["First Name"].eq("Robert")
df.loc[(name & team) | start_date]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2022-08-13 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2022-08-13 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2022-08-13 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2022-08-13 00:29:00,140002,19.49,True,Marketing
