In [None]:
import pandas as pd

In [None]:
# Load the employee records; every later cell in this notebook filters `emp`.
emp = pd.read_csv("data/employees.csv")

# Parse the two date/time text columns into datetime values.
# Shortcut: pd.read_csv("data/employees.csv", parse_dates=["Start Date", "Last Login Time"])
emp["Start Date"] = pd.to_datetime(emp["Start Date"])
emp["Last Login Time"] = pd.to_datetime(emp["Last Login Time"])

# Cast the real "Senior Management" column to bool in place. The assignment used
# to target a misspelled name, which left this column unconverted.
# NOTE(review): astype("bool") turns missing values into True -- confirm whether
# the CSV has blanks in this column before relying on the flag.
emp["Senior Management"] = emp["Senior Management"].astype("bool")

# Later cells filter on the misspelled "Senior Manangement" column, so keep it
# as an alias of the converted column until those cells are renamed.
emp["Senior Manangement"] = emp["Senior Management"]

# Gender is stored as a categorical column.
emp["Gender"] = emp["Gender"].astype("category")
emp.head(3)

In [None]:
# Print a summary of emp: column names, non-null counts, dtypes and memory usage.
emp.info()

## Filtering a DataFrame based on a condition

In [None]:
# Rows whose Gender equals "Male".
emp.loc[emp["Gender"].eq("Male")]

In [None]:
# Rows whose Team equals "Finance".
emp.loc[emp["Team"].eq("Finance")]

In [None]:
# A boolean column can be used directly as the row mask.
# NOTE(review): the column name is spelled "Manangement", matching the load cell.
emp.loc[emp["Senior Manangement"]]

In [None]:
# True for every row whose Team is anything other than "Marketing".
mask = emp["Team"].ne("Marketing")
emp.loc[mask]

In [None]:
# True for every row whose Salary is strictly above 110000.
mask = emp["Salary"].gt(110000)
emp.loc[mask]

In [None]:
# Employees who started on or before 1 January 1985; the date string is
# compared against the datetime column.
start_date_mask = emp["Start Date"].le("1985-01-01")
emp.loc[start_date_mask]

## Filtering with more than one condition

In [None]:
# Build one mask per condition, then combine them with & (logical AND).
gender_mask = emp["Gender"].eq("Male")
team_mask = emp["Team"].eq("Marketing")

both_conditions = gender_mask & team_mask
emp.loc[both_conditions]

In [None]:
# Rows that satisfy either condition: flagged as senior management OR
# started before 1 January 1990. | is the element-wise logical OR.
senMg_mask = emp["Senior Manangement"]
date_mask = emp["Start Date"].lt("1990-01-01")

emp.loc[senMg_mask | date_mask]

In [None]:
# Either (first name is Robert AND team is Client Services),
# or the start date is after 1 June 2016.
name_mask = emp["First Name"].eq("Robert")
team_mask = emp["Team"].eq("Client Services")
date_mask = emp["Start Date"].gt("2016-06-01")

emp.loc[date_mask | (name_mask & team_mask)]

## .isin() method

In [None]:
# isin() is True wherever the value matches any entry of the given list,
# replacing a chain of == comparisons joined with |.
team_mask = emp["Team"].isin(
    ["Legal", "Sales", "Product"]
)
emp.loc[team_mask]

## .isnull() and .notnull() methods

In [None]:
# Rows where Team is missing.
emp.loc[emp["Team"].isnull()]

In [None]:
# Rows where Gender is present (not missing).
emp.loc[emp["Gender"].notnull()]

## .between() method

In [None]:
# between() tests a range; both bounds are included by default.
emp.loc[emp["Salary"].between(left=60000, right=70000)]

In [None]:
# Rows whose "Bonus %" lies between 2.0 and 5.0, bounds included.
emp.loc[emp["Bonus %"].between(left=2.0, right=5.0)]

In [None]:
# Rows whose Start Date falls between 1991-01-01 and 1992-01-01, bounds included.
emp.loc[emp["Start Date"].between(left="1991-01-01", right="1992-01-01")]

## .duplicated() method

In [None]:
# Sort by first name so that repeated names sit next to each other.
# Rebinding `emp` to the sorted copy (rather than inplace=True) avoids mutating
# the frame object in place; later cells still see the sorted rows via `emp`.
emp = emp.sort_values("First Name")
emp.head()

In [None]:
# duplicated(keep=...) flags repeated values. keep="first" (the default) leaves
# the first occurrence unflagged; keep=False flags every value that occurs more
# than once. Negating the mask keeps only first names that appear exactly once.

emp.loc[~emp["First Name"].duplicated(keep=False)]

## .drop_duplicates() method

In [None]:
# drop_duplicates() works on the whole DataFrame and returns a new one.
# Without arguments it removes rows that are identical across all columns.

# subset limits the comparison to the named column; keep=False drops every
# row whose First Name occurs more than once.
emp.drop_duplicates(subset="First Name", keep=False)

## .unique() and .nunique() methods

In [None]:
# Distinct values found in the Gender column (a missing value, if any, is listed too).
emp["Gender"].unique()

In [None]:
# unique() lists a missing value as its own entry, so this count includes NaN if Team has any.
team_values = emp["Team"].unique()
print("No. of unique elements in Team : ", len(team_values))

In [None]:
# nunique() returns the number of distinct values. With the default dropna=True
# missing values are left out; dropna=False counts NaN as one more value.
# A cell only displays its last expression, so both counts are returned together
# as a tuple: (excluding NaN, including NaN).
emp["Team"].nunique(), emp["Team"].nunique(dropna=False)