## **Filtering DataFrame**

In [16]:
import numpy as np
import pandas as pd

In [17]:
emp = pd.read_csv("data/employees.csv", parse_dates=["Start Date"])
emp

  emp = pd.read_csv("data/employees.csv", parse_dates=["Start Date"])


Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [18]:
emp.head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT


In [19]:
emp.info()          # to get the more info on data and to inspect the data

# mgmt is True or False. so it should be boolean instead Object. 
# Salary column has 2 missing values. 1001-999, have to fill the missing values with 0. and convert the type to int
# Gender should be a different data type called category. 
# Team should also be category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        933 non-null    object        
 5   Team        957 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 47.1+ KB


#### **Using astype() method() change the datatype of a column**

In [20]:
# mgmt column

emp["Mgmt"] = emp["Mgmt"].astype(bool)

In [21]:
emp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    object        
dtypes: bool(1), datetime64[ns](1), float64(1), object(3)
memory usage: 40.2+ KB


In [22]:
# Salary column

emp["Salary"]
# there are two missing values one is at the beginning and one is at the end

0            NaN
1        61933.0
2       130590.0
3       138705.0
4       101004.0
          ...   
996      42392.0
997      96914.0
998      60500.0
999     129949.0
1000         NaN
Name: Salary, Length: 1001, dtype: float64

In [23]:
emp["Salary"] = emp["Salary"].fillna(0).astype(int)
emp.info()

# NaN values in Salary column marked them as 0 and changed the data type to int

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int64         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(3)
memory usage: 40.2+ KB


In [24]:
# Gender column - changing the data type to category

emp["Gender"] = emp["Gender"].astype("category")

emp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    category      
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int64         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](1), int64(1), object(2)
memory usage: 33.5+ KB


In [25]:
# Team column - should be a category

emp["Team"] = emp["Team"].astype("category")

emp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    category      
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int64         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](1), int64(1), object(1)
memory usage: 27.0+ KB


In [26]:
# check the shape

emp.shape           # 1001 rows and 6 columns  

(1001, 6)

### **Filtering the DataFrame using a Single Condition**

In [27]:
emp["First Name"]

0       Douglas
1        Thomas
2         Maria
3         Jerry
4         Larry
         ...   
996     Phillip
997     Russell
998       Larry
999      Albert
1000        NaN
Name: First Name, Length: 1001, dtype: object

In [31]:
# We can take the subset of the dataframe to analyze the content
# if condition checking firstname == "Maria", you will get a boolean value

marias = emp["First Name"] == "Maria"

In [32]:
# to see only rows where the condition met. for all true booleans

emp[emp["First Name"] == "Maria"]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,NaT,130590,False,Finance
198,Maria,Female,1990-12-27,36067,True,Product
815,Maria,,1986-01-18,106562,False,HR
844,Maria,,1985-06-19,148857,False,Legal
936,Maria,Female,2003-03-14,96250,False,Business Dev
984,Maria,Female,2011-10-15,43455,False,Engineering


In [33]:
# using the assigned variable name instead of whole filter gives the same result as above, 

emp[marias]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,NaT,130590,False,Finance
198,Maria,Female,1990-12-27,36067,True,Product
815,Maria,,1986-01-18,106562,False,HR
844,Maria,,1985-06-19,148857,False,Legal
936,Maria,Female,2003-03-14,96250,False,Business Dev
984,Maria,Female,2011-10-15,43455,False,Engineering


In [34]:
finance = emp["Team"] == "Finance"

emp[finance]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
7,,Female,2015-07-20,45906,True,Finance
14,Kimberly,Female,1999-01-14,41426,True,Finance
46,Bruce,Male,2009-11-28,114796,False,Finance
...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,137144,False,Finance
954,Joe,Male,1980-01-19,119667,True,Finance
987,Gloria,Female,2014-12-08,136709,True,Finance
992,Anthony,Male,2011-10-16,112769,True,Finance


In [None]:
management = emp["Mgmt"]            # Mgmt is a boolean column, no need to explisitly mention == True

emp[management]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
6,Ruby,Female,1987-08-17,65476,True,Product
...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,112769,True,Finance
993,Tina,Female,1997-05-15,56450,True,Engineering
994,George,Male,2013-06-21,98874,True,Marketing
999,Albert,Male,2012-05-15,129949,True,Sales


In [None]:
#inverse operation ~ gives the opposite values

emp[~emp["Mgmt"]]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,NaT,130590,False,Finance
5,Dennis,Male,1987-04-18,115163,False,Legal
13,Gary,Male,2008-01-27,109831,False,Sales
15,Lillian,Female,2016-06-05,59414,False,Product
16,Jeremy,Male,2010-09-21,90370,False,HR
...,...,...,...,...,...,...
989,Justin,,1991-02-10,38344,False,Legal
995,Henry,,2014-11-23,132483,False,Distribution
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product


In [39]:
greater_than_100k = emp["Salary"]>100000
emp[greater_than_100k]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
9,Frances,Female,2002-08-08,139852,True,Business Dev
...,...,...,...,...,...,...
990,Robin,Female,1987-07-24,100765,True,IT
991,Rose,Female,2002-08-25,134505,True,Marketing
992,Anthony,Male,2011-10-16,112769,True,Finance
995,Henry,,2014-11-23,132483,False,Distribution


In [None]:
# Using multiple conditions at a time
# & for and; | for or

is_female = emp["Gender"] == "Female"
is_biz_dev = emp["Team"] == "Business Dev"
is_manager = emp["Mgmt"]

emp[is_female & is_biz_dev].head()


Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
9,Frances,Female,2002-08-08,139852,True,Business Dev
33,Jean,Female,1993-12-18,119082,False,Business Dev
36,Rachel,Female,2009-02-16,142032,False,Business Dev
38,Stephanie,Female,1986-09-13,36844,True,Business Dev
61,Denise,Female,2001-11-06,106862,False,Business Dev


In [44]:
emp[is_female & is_biz_dev & is_manager].head()


Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
9,Frances,Female,2002-08-08,139852,True,Business Dev
38,Stephanie,Female,1986-09-13,36844,True,Business Dev
66,Nancy,Female,2012-12-15,125250,True,Business Dev
92,Linda,Female,2000-05-25,119009,True,Business Dev
111,Bonnie,Female,1999-12-17,42153,True,Business Dev


In [48]:
emp[is_biz_dev & is_female | is_manager].head(10)

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
6,Ruby,Female,1987-08-17,65476,True,Product
7,,Female,2015-07-20,45906,True,Finance
8,Angela,Female,2005-11-22,95570,True,Engineering
9,Frances,Female,2002-08-08,139852,True,Business Dev
10,Louise,Female,1980-08-12,63241,True,
11,Julie,Female,1997-10-26,102508,True,Legal


In [50]:
# isin() method

star_teams = ["Legal", "Sales", "Marketing"]

all_star_teams = emp["Team"].isin(star_teams)

emp[all_star_teams]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
5,Dennis,Male,1987-04-18,115163,False,Legal
11,Julie,Female,1997-10-26,102508,True,Legal
13,Gary,Male,2008-01-27,109831,False,Sales
20,Lois,,1995-04-22,64714,True,Legal
...,...,...,...,...,...,...
986,Donna,Female,1982-11-26,82871,False,Marketing
989,Justin,,1991-02-10,38344,False,Legal
991,Rose,Female,2002-08-25,134505,True,Marketing
994,George,Male,2013-06-21,98874,True,Marketing


In [None]:
# range using between

salary_80k_90k = emp["Salary"].between(80000, 90000)

emp[salary_80k_90k]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
19,Donna,Female,2010-07-22,81014,False,Product
31,Joyce,,2005-02-20,88657,False,Product
35,Theresa,Female,2006-10-10,85182,False,Sales
45,Roger,Male,1980-04-17,88010,True,Sales
54,Sara,Female,2007-08-15,83677,False,Engineering
...,...,...,...,...,...,...
930,Nancy,Female,2001-09-10,85213,True,Marketing
956,Beverly,Female,1986-10-17,80838,False,Engineering
963,Ann,Female,1994-09-23,89443,True,Sales
985,Stephen,,1983-07-10,85668,False,Legal


In [53]:
# left start date, right end date. comparing the dates
# when we use left and right parameters, both are included. 

emp_in_1990 = emp["Start Date"].between(
    left = "1990-01-01",
    right= "1990-12-31"
)

emp[emp_in_1990]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
52,Todd,Male,1990-02-18,49339,True,HR
64,Kathleen,,1990-04-11,77834,False,Business Dev
139,,Female,1990-10-03,132373,True,
141,Adam,Male,1990-12-24,110194,True,Product
163,Terry,Male,1990-09-03,52226,False,IT
198,Maria,Female,1990-12-27,36067,True,Product
242,Robert,Male,1990-10-27,38041,True,Engineering
251,Sharon,,1990-03-01,83658,False,Business Dev
310,Harold,Male,1990-02-20,66775,True,Legal
323,Linda,Female,1990-12-16,115658,True,Sales


In [None]:
start_with_r = emp["First Name"].between("R", "S") # it doesnt include the first names which starts with S. S is exclusive here.

emp[start_with_r]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
6,Ruby,Female,1987-08-17,65476,True,Product
36,Rachel,Female,2009-02-16,142032,False,Business Dev
45,Roger,Male,1980-04-17,88010,True,Sales
67,Rachel,Female,1999-08-16,51178,True,Finance
78,Robin,Female,1983-06-04,114797,True,Sales
...,...,...,...,...,...,...
973,Russell,Male,2013-05-10,137359,False,Business Dev
982,Rose,Female,1982-04-06,91411,True,HR
990,Robin,Female,1987-07-24,100765,True,IT
991,Rose,Female,2002-08-25,134505,True,Marketing


In [56]:
emp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    category      
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int64         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](1), int64(1), object(1)
memory usage: 27.0+ KB


In [57]:
# isnull() identifies null values in a column

check_null_names = emp["First Name"].isnull()

emp[check_null_names]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
7,,Female,2015-07-20,45906,True,Finance
23,,Male,2012-06-14,125792,True,
25,,Male,2012-10-08,37076,True,IT
32,,Male,1998-08-21,122340,True,
39,,Male,2016-01-29,122173,True,IT
...,...,...,...,...,...,...
925,,Female,2000-08-23,95866,True,Sales
946,,Female,1985-09-15,133472,True,Distribution
947,,Male,2012-07-30,107351,True,Marketing
951,,Female,2010-09-14,143638,True,


In [59]:
emp[~check_null_names]              # non null values with inverse operator

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483,False,Distribution
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


In [61]:
# notnull() used to find non-missing values(not null values)

team_exists = emp["Team"].notnull()

emp[team_exists]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483,False,Distribution
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


In [62]:
# dropna() you can drop all the rows with missing values NaN or NaT

emp.dropna()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
8,Angela,Female,2005-11-22,95570,True,Engineering
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874,True,Marketing
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


In [63]:
emp.dropna(how="any")           # if at least one missing value in a row drop such rows

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
8,Angela,Female,2005-11-22,95570,True,Engineering
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874,True,Marketing
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


In [65]:
emp.dropna(how="all")           # only all the values in a row are missing

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
999,Albert,Male,2012-05-15,129949,True,Sales


In [None]:
emp.dropna(subset="Gender")         # drops the rows in the particular subset which is gender here

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874,True,Marketing
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


In [68]:
emp.dropna(subset=["First Name", "Gender"])         # checking multiple subsets for dropna

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874,True,Marketing
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


In [72]:
# If there are more than 4 non-missing values then don't drop those rows. 
#thresh specifies number of non-missing values to keep a row from deleting. 

emp.dropna(thresh=4)

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483,False,Distribution
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev


### **Handling duplicate rows**

In [73]:
emp

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
999,Albert,Male,2012-05-15,129949,True,Sales


In [79]:
# duplicated() method return False for non-duplicate values and True for duplicate values

emp["Team"].head(15)

0        Marketing
1              NaN
2          Finance
3          Finance
4               IT
5            Legal
6          Product
7          Finance
8      Engineering
9     Business Dev
10             NaN
11           Legal
12              HR
13           Sales
14         Finance
Name: Team, dtype: category
Categories (10, object): ['Business Dev', 'Distribution', 'Engineering', 'Finance', ..., 'Legal', 'Marketing', 'Product', 'Sales']

In [80]:
emp["Team"].head(15).duplicated()

0     False
1     False
2     False
3      True
4     False
5     False
6     False
7      True
8     False
9     False
10     True
11     True
12    False
13    False
14     True
Name: Team, dtype: bool

In [83]:
emp["Team"].head(15).duplicated(keep="first")           # keep="first" will evaluate the duplicates from the first item to the last item


0     False
1     False
2     False
3      True
4     False
5     False
6     False
7      True
8     False
9     False
10     True
11     True
12    False
13    False
14     True
Name: Team, dtype: bool

In [84]:
emp["Team"].head(15).duplicated(keep="last")            # keep="last" will evaluate the duplicates from the last item to the first item


0     False
1      True
2      True
3      True
4     False
5      True
6     False
7      True
8     False
9     False
10    False
11    False
12    False
13    False
14    False
Name: Team, dtype: bool

In [86]:
emp[emp["Team"].duplicated()]           # show starting rowa for Team column with duplicates

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
3,Jerry,,2005-03-04,138705,True,Finance
7,,Female,2015-07-20,45906,True,Finance
10,Louise,Female,1980-08-12,63241,True,
11,Julie,Female,1997-10-26,102508,True,Legal
14,Kimberly,Female,1999-01-14,41426,True,Finance
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
999,Albert,Male,2012-05-15,129949,True,Sales


In [87]:
emp[~emp["Team"].duplicated()]           # show starting rowa for Team column without duplicates

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
8,Angela,Female,2005-11-22,95570,True,Engineering
9,Frances,Female,2002-08-08,139852,True,Business Dev
12,Brandon,Male,1980-12-01,112807,True,HR
13,Gary,Male,2008-01-27,109831,False,Sales


In [88]:
emp[~emp["Team"].duplicated(keep="last")]           # show last non-duplicates

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
988,Alice,Female,2004-10-05,47638,False,HR
989,Justin,,1991-02-10,38344,False,Legal
990,Robin,Female,1987-07-24,100765,True,IT
993,Tina,Female,1997-05-15,56450,True,Engineering
994,George,Male,2013-06-21,98874,True,Marketing
995,Henry,,2014-11-23,132483,False,Distribution
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
999,Albert,Male,2012-05-15,129949,True,Sales


In [90]:
#### drop_duplicates() method to drop duplicate rows

emp.drop_duplicates()               # drop duplicate rows

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
999,Albert,Male,2012-05-15,129949,True,Sales


In [None]:
emp.drop_duplicates(subset="Team")              # dropping the duplicates in the Team column

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
8,Angela,Female,2005-11-22,95570,True,Engineering
9,Frances,Female,2002-08-08,139852,True,Business Dev
12,Brandon,Male,1980-12-01,112807,True,HR
13,Gary,Male,2008-01-27,109831,False,Sales
