# This module's dataset + memory optimization

In [None]:
import pandas as pd

In [None]:
# Load the employees dataset, parsing the two temporal columns as datetimes,
# then cast the remaining columns to their target dtypes in a single pass.
# NOTE(review): astype(bool) turns missing values (NaN) into True -- confirm
# that "Senior Management" has no gaps, or that this mapping is intended.
column_dtypes = {
    "Senior Management": bool,
    "Gender": "category",
    "Team": "category",
}
df = pd.read_csv(
    "employees.csv",
    parse_dates=["Start Date", "Last Login Time"],
).astype(column_dtypes)
df.head()

# Filter a DataFrame based on a condition

## With a funky syntax

In [None]:
# Inline boolean mask: keep only the rows whose Gender equals "Male".
df[df["Gender"] == "Male"]

In [None]:
# Inline boolean mask: keep only the rows whose Team equals "Finance".
df[df["Team"] == "Finance"]

## There is a more elegant way to do this
And we can even combine conditions.

In [None]:
# Store the boolean mask under a name first, then use it to select the
# Finance rows.
condition = df["Team"].eq("Finance")
df.loc[condition]

## Extracting a boolean

In [None]:
# Rows where the employee is NOT in senior management. The column is cast to
# bool when the data is loaded, so ~ negates it element-wise; this avoids
# comparing a boolean column against the literal True.
condition = ~df["Senior Management"]
df[condition]

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Select all rows with a salary of at least 110000 (>= includes 110000 itself)
above_11e4 = df["Salary"] >= 11e4
df[above_11e4]

In [None]:
# Select all rows whose Start Date is on or before 1 January 1985.
prior_to_85 = df["Start Date"] <= "1985-01-01"
df[prior_to_85]

# Filter with More than One Condition (AND)

In [None]:
# Combine two masks with & to keep only the rows where both hold:
# Gender is "Female" AND Team is "Marketing".
female = df["Gender"].eq("Female")
marketing = df["Team"].eq("Marketing")
both_conditions = female & marketing
df.loc[both_conditions]

# Filter with More than One Condition (OR)


In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# "Senior Management" is cast to bool at load time, so the column itself
# can be used directly as a mask.
senior = df["Senior Management"]
start_date = df["Start Date"] <= "1990-01-01"
# Keep the rows where at least one of the two conditions is true.
df[senior | start_date]

In [None]:
# More than two conditions can be combined.
# Keep rows where (First Name == "Robert" AND Team == "Client Services")
# OR the Start Date is on or after 1 June 2016 (>= is inclusive).

name = df["First Name"] == "Robert"
team = df["Team"] == "Client Services"
start_date = df["Start Date"] >= "2016-06-01"
df[(name & team) | start_date]

In [None]:
# Preview the first rows of the DataFrame.
df.head()

# Check for inclusion with `.isin()` method

In [None]:
# Membership test: the mask is True where Team is one of the listed values.
wanted_teams = ["Legal", "Sales", "Product"]
mask = df["Team"].isin(wanted_teams)
df.loc[mask]

# The `.isnull()` and `.notnull()` methods

In [None]:
# Boolean Series: True where the Team value is missing.
df["Team"].isnull()

In [None]:
# Boolean Series: True where the Gender value is present (not missing).
df["Gender"].notnull()

# The `.between()` method

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# True where Salary lies between 60000 and 70000. Both endpoints are included;
# inclusive="both" is already the default and is spelled out here for clarity.
df["Salary"].between(left=6e4, right=7e4, inclusive="both")

In [None]:
# True where Bonus % lies between 2.0 and 5.0 (endpoints included by default).
df["Bonus %"].between(2.0, 5.0)

In [None]:
# Rows whose Start Date falls between 1991-01-01 and 1991-12-31, both included.
df[df["Start Date"].between("1991-01-01", "1991-12-31")]

In [None]:
# Rows whose Last Login Time falls between 8:30 AM and 12:00 PM (noon).
# NOTE(review): the bounds are time-of-day strings compared against a column
# parsed as datetimes; presumably both sides resolve to the same calendar
# date -- confirm against the actual column contents.
df[df["Last Login Time"].between("8:30AM", "12:00PM")]

# Check for duplicated rows with `.duplicated()`

In [None]:
# Sort the rows by first name. The sorted frame is rebound to df rather than
# sorted with inplace=True: plain assignment makes the state change explicit
# and keeps the call chainable.
df = df.sort_values(by="First Name")
df.head()

In [None]:
# duplicated() flags every repeat of a first name after its first occurrence.
# The ~ operator negates a boolean mask in pandas, so the negated mask keeps
# one row per first name.
repeated_name = df["First Name"].duplicated()
mark = ~repeated_name
df.loc[mark]

# Remove duplicates with `drop_duplicates()`

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Drop every row whose (First Name, Team) pair occurs more than once;
# keep=False removes all copies instead of keeping one of them.
# The result is only displayed -- it is not assigned, so df is unchanged.
df.drop_duplicates(subset=["First Name", "Team"], keep=False)

# The `unique()` and `nunique()` methods

In [None]:
# Reload the employees dataset from disk, parsing the two temporal columns as
# datetimes, then cast the remaining columns to their target dtypes in one pass.
# NOTE(review): astype(bool) turns missing values (NaN) into True -- confirm
# that "Senior Management" has no gaps, or that this mapping is intended.
column_dtypes = {
    "Senior Management": bool,
    "Gender": "category",
    "Team": "category",
}
df = pd.read_csv(
    "employees.csv",
    parse_dates=["Start Date", "Last Login Time"],
).astype(column_dtypes)
df.head()

In [None]:
# Number of distinct Gender values; unique() reports a missing value as its
# own entry, so it is included in this count.
len(df["Gender"].unique())

In [None]:
# Number of distinct Team values; unique() reports a missing value as its
# own entry, so it is included in this count.
len(df["Team"].unique())

In [None]:
# Count distinct Gender values. dropna=False counts a missing value as one
# more distinct entry; by default nunique() ignores missing values.
df["Gender"].nunique(dropna=False)

In [None]:
# Count distinct Team values. dropna=False counts a missing value as one
# more distinct entry; by default nunique() ignores missing values.
df["Team"].nunique(dropna=False)

# Check for inclusion with `.isin()` method

In [None]:
# Membership test: the mask is True where Team is one of the listed values.
wanted_teams = ["Legal", "Sales", "Product"]
mask = df["Team"].isin(wanted_teams)
df.loc[mask]

# The `.isnull()` and `.notnull()` methods

In [None]:
# Boolean Series: True where the Team value is missing.
df["Team"].isnull()

In [None]:
# Boolean Series: True where the Gender value is present (not missing).
df["Gender"].notnull()

# The `.between()` method

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# True where Salary lies between 60000 and 70000. Both endpoints are included;
# inclusive="both" is already the default and is spelled out here for clarity.
df["Salary"].between(left=6e4, right=7e4, inclusive="both")

In [None]:
# True where Bonus % lies between 2.0 and 5.0 (endpoints included by default).
df["Bonus %"].between(2.0, 5.0)

In [None]:
# Rows whose Start Date falls between 1991-01-01 and 1991-12-31, both included.
df[df["Start Date"].between("1991-01-01", "1991-12-31")]

In [None]:
# Rows whose Last Login Time falls between 8:30 AM and 12:00 PM (noon).
# NOTE(review): the bounds are time-of-day strings compared against a column
# parsed as datetimes; presumably both sides resolve to the same calendar
# date -- confirm against the actual column contents.
df[df["Last Login Time"].between("8:30AM", "12:00PM")]

# Check for duplicated rows with `.duplicated()`

In [None]:
# Sort the rows by first name. The sorted frame is rebound to df rather than
# sorted with inplace=True: plain assignment makes the state change explicit
# and keeps the call chainable.
df = df.sort_values(by="First Name")
df.head()

In [None]:
# duplicated() flags every repeat of a first name after its first occurrence.
# The ~ operator negates a boolean mask in pandas, so the negated mask keeps
# one row per first name.
repeated_name = df["First Name"].duplicated()
mark = ~repeated_name
df.loc[mark]

# Remove duplicates with `drop_duplicates()`

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Drop every row whose (First Name, Team) pair occurs more than once;
# keep=False removes all copies instead of keeping one of them.
# The result is only displayed -- it is not assigned, so df is unchanged.
df.drop_duplicates(subset=["First Name", "Team"], keep=False)

# The `unique()` and `nunique()` methods

In [None]:
# Reload the employees dataset from disk, parsing the two temporal columns as
# datetimes, then cast the remaining columns to their target dtypes in one pass.
# NOTE(review): astype(bool) turns missing values (NaN) into True -- confirm
# that "Senior Management" has no gaps, or that this mapping is intended.
column_dtypes = {
    "Senior Management": bool,
    "Gender": "category",
    "Team": "category",
}
df = pd.read_csv(
    "employees.csv",
    parse_dates=["Start Date", "Last Login Time"],
).astype(column_dtypes)
df.head()

In [None]:
# Number of distinct Gender values; unique() reports a missing value as its
# own entry, so it is included in this count.
len(df["Gender"].unique())

In [None]:
# Number of distinct Team values; unique() reports a missing value as its
# own entry, so it is included in this count.
len(df["Team"].unique())

In [None]:
# Count distinct Gender values. dropna=False counts a missing value as one
# more distinct entry; by default nunique() ignores missing values.
df["Gender"].nunique(dropna=False)

In [None]:
# Count distinct Team values. dropna=False counts a missing value as one
# more distinct entry; by default nunique() ignores missing values.
df["Team"].nunique(dropna=False)