In [1]:
import pandas as pd
import datetime as dt
pd.__version__

'1.4.2'

## This Module's Dataset + Memory Optimization
- The `pd.to_datetime` method
- The `parse_dates` parameter of `read_csv`

In [15]:
df = pd.read_csv("employees.csv")

df.dtypes # Two ways to get information about data types
df.info()

df["Start Date"]      = pd.to_datetime(df["Start Date"])
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")

# Shortcut with parse_dates
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"]            = df["Gender"].astype("category")      
df.head(2)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         100

## Filter A `DataFrame`  Based On A Condition
- Create a **subset**

In [3]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")
df.head(2)

df["Gender"]
df["Gender"] == "Male"
df[df["Gender"] == "Male"]

on_finance_team = df["Team"] == "Finance"
df[on_finance_team].head(3)

# Booleans
df[df["Senior Management"] == True].head(3)

# Numbers
df[df["Salary"] > 110000]
df[df["Bonus %"] < 1.5].head(3)

# Datetime Objects
df[df["Start Date"] < "1985-01-01"].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2022-06-16 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2022-06-16 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2022-06-16 10:27:00,132940,19.082,False,Client Services


## Filter with More than One Condition (AND)

In [4]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")
df.head(2)

is_male = df["Gender"] == "Male"
is_in_marketing = df["Team"] == "Marketing"
df[is_male & is_in_marketing].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-06-16 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2022-06-16 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2022-06-16 07:45:00,37598,7.757,True,Marketing


## Filter with More than One Condition (OR)

In [5]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")

# Example 1
senior_management = df["Senior Management"] == True
started_in_80s = df["Start Date"] < "1990-01-01"
df[senior_management | started_in_80s].head(5)

# Example 2 (And + Or)
is_robert = df["First Name"] == "Robert"
is_in_client_services = df["Team"] == "Client Services"
start_date_after_may = df["Start Date"] > "2016-06-01"
df[(is_robert & is_in_client_services) | start_date_after_may]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2022-06-16 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2022-06-16 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2022-06-16 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2022-06-16 00:29:00,140002,19.49,True,Marketing


## The `.isin()` Method (Create Lesson)
- Accepts an object like list, tuple, or `Series`
- Returns True if value is found in the argument.

In [6]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")

teams = ["Legal", "Sales", "Product"]

df[df["Team"].isin(teams)].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2022-06-16 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-06-16 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2022-06-16 15:19:00,102508,12.637,True,Legal


## The `.isnull()` and `notnull()` Methods

In [7]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")

df.info()

df[df["Team"].isnull()].head(3)
df[df["Gender"].notnull()].tail(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.1+ KB


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
997,Russell,Male,2013-05-20,2022-06-16 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-06-16 16:45:00,60500,11.985,False,Business Development
999,Albert,Male,2012-05-15,2022-06-16 18:24:00,129949,10.169,True,Sales


## The `.between()` Method

In [8]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")
df.head(3)

df[df["Salary"].between(60000, 70000)] # Both values inclusive
df[df["Bonus %"].between(2.0, 5.0)]

df[df["Start Date"].between("1991-01-01", "1992-01-01")]
df[df["Last Login Time"].between("08:30AM", "12:00PM")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-06-16 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2022-06-16 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2022-06-16 09:01:00,63241,15.132,True,
18,Diana,Female,1981-10-23,2022-06-16 10:27:00,132940,19.082,False,Client Services
33,Jean,Female,1993-12-18,2022-06-16 09:07:00,119082,16.180,False,Business Development
...,...,...,...,...,...,...,...,...
963,Ann,Female,1994-09-23,2022-06-16 11:15:00,89443,17.940,True,Sales
977,Sarah,Female,1995-12-04,2022-06-16 09:16:00,124566,5.949,False,Product
982,Rose,Female,1982-04-06,2022-06-16 10:43:00,91411,8.639,True,Human Resources
988,Alice,Female,2004-10-05,2022-06-16 09:34:00,47638,11.209,False,Human Resources


## The `.duplicated()` Method
- Also returns a Boolean `Series`

In [9]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")

# Keeps first value, removes duplicate values second time around
df[df["First Name"].duplicated()].sort_values("First Name")
df[df["First Name"].duplicated(keep = "first")].sort_values("First Name")

df[df["First Name"].duplicated(keep = "last")].sort_values("First Name")
# Mark all duplicates as True -- original and duplicate values will be kept
df[df["First Name"].duplicated(keep = False)].sort_values("First Name")

# Remove duplicates with ~
condition = ~df["First Name"].duplicated(keep = False)
df[condition]

Output = None

## The `drop_duplicates()` Method

In [10]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")
df.head(2)

len(df.drop_duplicates()) # Didn't work - no duplicate rows

print(len(df.drop_duplicates("Team")))
df.drop_duplicates("Team")

11


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-06-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-06-16 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-06-16 11:17:00,130590,11.858,False,Finance
4,Larry,Male,1998-01-24,2022-06-16 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-06-16 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-06-16 16:20:00,65476,10.012,True,Product
8,Angela,Female,2005-11-22,2022-06-16 06:29:00,95570,18.523,True,Engineering
9,Frances,Female,2002-08-08,2022-06-16 06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,2022-06-16 01:08:00,112807,17.492,True,Human Resources
13,Gary,Male,2008-01-27,2022-06-16 23:40:00,109831,5.831,False,Sales


## The `nunique` and `unique` Methods

In [11]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")
df.head(2)

df["Gender"].unique() # Array of unique values
df["Team"].unique() # Null values will be included
len(df["Team"].unique()) # Get length, compare to next example

df["Team"].unique()
df["Team"].nunique() # Count excludes null / NaN Values
df["Team"].nunique(dropna = True) # Same thing
df["Team"].nunique(dropna = False) # Now the same length as the unique array

11

## The `.replace()` Method

In [12]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype(bool)
df["Gender"] = df["Gender"].astype("category")
df.head(2)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-06-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-06-16 06:53:00,61933,4.17,True,
