# Pandas Tips: `drop_duplicates()`

In [7]:
import pandas as pd

Check your pandas version to ensure similar behavior. 

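_Version 2.1.1 was released in September 2023; the outputs saved in this notebook were produced with the version displayed below, so results on newer releases may be formatted slightly differently._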

In [8]:
# Show the installed pandas version so it can be compared with your own environment.
pd.__version__

'1.5.3'

### Load data

Load data from GitHub.

_Check out my [read_csv video](https://youtu.be/sTXr73fqybc) to learn more about this step._

In [16]:
# Location of the raw pet_data.csv file on GitHub.
PET_DATA_URL = 'https://raw.githubusercontent.com/kimfetti/Videos/master/Pandas_Tips/data/pet_data.csv'

# Download the CSV and parse it into the DataFrame used throughout the notebook.
df = pd.read_csv(PET_DATA_URL)

In [17]:
# (rows, columns) of the freshly loaded frame
df.shape

(500, 5)

In [18]:
# Preview the first five rows
df.head()

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,


### Drop blank rows

_Check out my dropna video to learn more about this step._

In [19]:
# how='all' drops only the rows in which every column is NA;
# inplace=True applies the change to df itself instead of returning a new frame.
df.dropna(how='all', inplace=True)

In [20]:
# Row count after removing the all-NA rows (compare with the shape right after loading)
df.shape

(496, 5)

## Basics

### Find duplicate rows

In [22]:
# duplicated() flags each row that repeats an earlier one (keep='first' by default),
# so this selects only the second and later copies.
df.loc[df.duplicated()]

Unnamed: 0,name,pet_type,food_type,amount,brand
242,Cooper,Guinea Pig,Pellets,,
275,Rocky,Fish,Pellets,,
323,Stella,Snake,,,
485,Luna,Cat,,,


In [23]:
# keep=False flags every member of a duplicate group, so the first occurrences
# are listed here alongside their repeats.
df.loc[df.duplicated(keep=False)]

Unnamed: 0,name,pet_type,food_type,amount,brand
28,Cooper,Guinea Pig,Pellets,,
95,Luna,Cat,,,
144,Rocky,Fish,Pellets,,
152,Stella,Snake,,,
242,Cooper,Guinea Pig,Pellets,,
275,Rocky,Fish,Pellets,,
323,Stella,Snake,,,
485,Luna,Cat,,,


### Drop duplicate rows

In [29]:
# Returns a new frame with repeated rows removed (the first occurrence is kept);
# df itself is not modified.
df.drop_duplicates()

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,
...,...,...,...,...,...
495,Shadow,Dog,Wet Food,229g,TropicalFeast
496,Nala,Snake,Eggs,114g,
497,Finn,Turtle,Pellets,,
498,Finn,Hamster,Seeds,55g,


In [30]:
# One row fewer than df for each row that duplicated() flagged
df.drop_duplicates().shape

(492, 5)

In [31]:
# Store the de-duplicated result under a new name; the original df keeps its duplicates.
no_dupes = df.drop_duplicates()
no_dupes

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,
...,...,...,...,...,...
495,Shadow,Dog,Wet Food,229g,TropicalFeast
496,Nala,Snake,Eggs,114g,
497,Finn,Turtle,Pellets,,
498,Finn,Hamster,Seeds,55g,


In [32]:
# Sanity check: no repeated rows remain, so this selection comes back empty.
no_dupes.loc[no_dupes.duplicated()]

Unnamed: 0,name,pet_type,food_type,amount,brand


## $\star$ Level Up $\star$

### Drop duplicates based on specific column(s)

In [33]:
# Reminder of the columns available for subset-based de-duplication
df.head()

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,


In [34]:
# How often each pet name appears, most frequent first
df['name'].value_counts().to_frame()

Unnamed: 0,name
Charlie,18
Luna,18
Lucy,16
Oliver,15
Lily,15
Lola,14
Cooper,14
Oscar,14
Chloe,14
Winston,13


In [35]:
# subset='name' compares rows on the 'name' column only: for each name the first row
# is kept and every later row with the same name is dropped.
df.drop_duplicates(subset='name')

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,
5,Zoe,Ferret,Dry Food,114g,
6,Casper,Rabbit,Vegetables,62g,
7,Gizmo,Lizard,Insects,179g,
8,Lily,Hamster,Seeds,149g,
9,Tiger,Fish,Pellets,173g,ChowTime


In [15]:
# One row remains per distinct name
df.drop_duplicates(subset='name').shape

(47, 5)

In [37]:
# NOTE: DataFrame.value_counts() counts whole rows and, by default (dropna=True),
# leaves out rows that contain NA, so rows with any missing value do not appear below.
# For per-name counts, call value_counts() on the 'name' column instead.
df.drop_duplicates(subset='name').value_counts().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,0
name,pet_type,food_type,amount,brand,Unnamed: 5_level_1
Charlie,Dog,Dry Kibble,84g,GreenHarvest,1
Harley,Cat,Wet Food,90g,Pawsome,1
Loki,Dog,Wet Food,173g,Pawsome,1
Nala,Parrot,Pellets,31g,GrainyGraze,1
Penny,Guinea Pig,Pellets,119g,TropicalFeast,1
Tiger,Fish,Pellets,173g,ChowTime,1


In [38]:
# After de-duplicating on 'name', every name occurs exactly once
df.drop_duplicates(subset='name')['name'].value_counts().to_frame()

Unnamed: 0,name
Simba,1
Loki,1
Rocky,1
Penny,1
Chloe,1
Buddy,1
Nala,1
Kitty,1
Ruby,1
Stella,1


In [39]:
# Another look at the columns before de-duplicating on two of them
df.head()

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,


In [40]:
# With a list, rows count as duplicates only when they match on both 'name' and 'pet_type'.
df.drop_duplicates(subset=['name', 'pet_type'])

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,
...,...,...,...,...,...
486,Luna,Hamster,Seeds,53g,
489,Rocky,Lizard,Insects,135g,ChowTime
491,Mocha,Turtle,Pellets,249g,
493,Shadow,Dog,Wet Food,176g,


In [41]:
# Rows left after de-duplicating on the (name, pet_type) pair
df.drop_duplicates(subset=['name', 'pet_type']).shape

(323, 5)

### Specify which duplicate to keep

In [20]:
# Default keep='first': the first row seen for each name is retained.
# Note: Zoe is a ferret in the row with index 5.
df.drop_duplicates(subset='name')

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,
5,Zoe,Ferret,Dry Food,114g,
6,Casper,Rabbit,Vegetables,62g,
7,Gizmo,Lizard,Insects,179g,
8,Lily,Hamster,Seeds,149g,
9,Tiger,Fish,Pellets,173g,ChowTime


In [43]:
# keep='last' retains the last occurrence of each name instead of the first.
# Note: Zoe is now the hamster from the row with index 429.
df.drop_duplicates(subset='name', keep='last')

Unnamed: 0,name,pet_type,food_type,amount,brand
313,Tucker,Cat,Dry Kibble,,
338,Milo,Rabbit,,,
344,Zeus,Dog,Wet Food,147g,
360,Lulu,Turtle,Pellets,143g,
382,Leo,Fish,Pellets,66g,
405,Rosie,Guinea Pig,Vegetables,96g,
415,Daisy,Fish,Flakes,,SnackLand
418,Molly,Parrot,Seeds,243g,
422,Duke,Parrot,,,
429,Zoe,Hamster,Seeds,190g,


In [44]:
# keep=False drops every row whose name occurs more than once; here every name
# repeats, so no rows are left.
df.drop_duplicates(subset='name', keep=False)

Unnamed: 0,name,pet_type,food_type,amount,brand


### Make permanent changes

In [45]:
# Shape of df before any permanent de-duplication
df.shape

(496, 5)

In [46]:
# drop_duplicates() returns a new frame, so this call alone does not change df
df.drop_duplicates().shape

(492, 5)

In [47]:
# inplace=True removes the duplicate rows from df itself; the call returns None,
# which is why this cell displays no output.
df.drop_duplicates(inplace=True)

In [48]:
# df itself now has the de-duplicated shape
df.shape

(492, 5)