In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px

## Detection of Duplicates

A duplicate is a row that appears two or more times in our dataframe.
If we have many duplicates in our data, they can harm or distort the results of any statistical analysis or machine learning model.

Duplicate data means there are one or more redundant rows that contain the same information as another row and can therefore be removed.

So, identifying and removing duplicates is a very important topic in our data analysis workflow.

### Sample data

In [2]:
# Small single-column frame in which 'c' appears twice and 'g' three times,
# so there are duplicates to detect.
alphabet = pd.DataFrame(
    {'Alphabet': ['a', 'b', 'c', 'c', 'd', 'e', 'f', 'g', 'g', 'g']}
)
alphabet

Unnamed: 0,Alphabet
0,a
1,b
2,c
3,c
4,d
5,e
6,f
7,g
8,g
9,g


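Use **.duplicated()** to identify duplicate rows of data. By default (`keep='first'`) the first occurrence of each row is not flagged, only the later repeats; pass `keep=False` to flag every occurrence.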

In [3]:
# keep='first' is the default: the first occurrence of a value stays False and
# only the later repeats are flagged True.
alphabet.duplicated(keep='first')

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7    False
8     True
9     True
dtype: bool

In [4]:
# keep=False flags every member of a duplicate group, including the first occurrence.
alphabet.duplicated(keep=False)

0    False
1    False
2     True
3     True
4    False
5    False
6    False
7     True
8     True
9     True
dtype: bool

In [5]:
# Select every row that belongs to a duplicate group, using the boolean mask as a row filter.
alphabet.loc[alphabet.duplicated(keep=False)]

Unnamed: 0,Alphabet
2,c
3,c
7,g
8,g
9,g


In [6]:
# Relative location of the Titanic CSV that is read into `df` in the next step.
path = '../Data/titanic2.csv'

In [7]:
# Read the CSV at `path` into a DataFrame and preview its first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [8]:
# Summary of df: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [9]:
# Number of rows flagged as full-row duplicates (each True counts as 1 in the sum).
df.duplicated().sum()

107

In [10]:
# Show the rows that repeat an earlier row in every column.
df.loc[df.duplicated()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
47,1,3,female,,0,0,7.7500,Q,
76,0,3,male,,0,0,7.8958,S,
77,0,3,male,,0,0,8.0500,S,
87,0,3,male,,0,0,8.0500,S,
95,0,3,male,,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
870,0,3,male,26.0,0,0,7.8958,S,
877,0,3,male,19.0,0,0,7.8958,S,
878,0,3,male,,0,0,7.8958,S,
884,0,3,male,25.0,0,0,7.0500,S,


### subset - to narrow down to specific columns

In [11]:
# Rows are compared on these four columns only; all other columns are ignored
# when deciding whether a row repeats an earlier one.
df.loc[
    df.duplicated(subset=['survived', 'pclass', 'age', 'fare'])
]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
42,0,3,male,,0,0,7.8958,C,
47,1,3,female,,0,0,7.7500,Q,
76,0,3,male,,0,0,7.8958,S,
77,0,3,male,,0,0,8.0500,S,
87,0,3,male,,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
878,0,3,male,,0,0,7.8958,S,
881,0,3,male,33.0,0,0,7.8958,S,
884,0,3,male,25.0,0,0,7.0500,S,
886,0,2,male,27.0,0,0,13.0000,S,


## Handling / Removing Duplicates

In [12]:
# Preview the frame again before removing any rows.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [13]:
# Count of full-row duplicates before any removal.
df.duplicated().sum()

107

In [14]:
# List the duplicated rows; their index labels are what a manual drop would target.
df.loc[df.duplicated()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
47,1,3,female,,0,0,7.7500,Q,
76,0,3,male,,0,0,7.8958,S,
77,0,3,male,,0,0,8.0500,S,
87,0,3,male,,0,0,8.0500,S,
95,0,3,male,,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
870,0,3,male,26.0,0,0,7.8958,S,
877,0,3,male,19.0,0,0,7.8958,S,
878,0,3,male,,0,0,7.8958,S,
884,0,3,male,25.0,0,0,7.0500,S,


In [15]:
# Remove three duplicate rows by hand, addressing them by index label.
# errors='ignore' makes the cell safe to re-run: once the labels are gone,
# a plain drop would raise KeyError on the second execution.
df.drop(index=[877, 878, 884], errors='ignore', inplace=True)

In [16]:
# Duplicated rows that are still present after the manual drop.
df.loc[df.duplicated()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
47,1,3,female,,0,0,7.7500,Q,
76,0,3,male,,0,0,7.8958,S,
77,0,3,male,,0,0,8.0500,S,
87,0,3,male,,0,0,8.0500,S,
95,0,3,male,,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
846,0,3,male,,8,2,69.5500,S,
859,0,3,male,,0,0,7.2292,C,
863,0,3,female,,8,2,69.5500,S,
870,0,3,male,26.0,0,0,7.8958,S,


In [17]:
# (rows, columns) after the manual drop and before drop_duplicates.
df.shape

(888, 9)

In [18]:
# Remove every remaining duplicate row in place, keeping the first occurrence
# of each (keep='first' is the pandas default, spelled out here).
df.drop_duplicates(keep='first', inplace=True)

In [19]:
# (rows, columns) once drop_duplicates has run.
df.shape

(784, 9)

### Resetting the index after removing duplicates

In [24]:
# Renumber the rows from 0 so the gaps left by removed rows disappear;
# drop=True discards the old labels instead of keeping them as a new column.
df.reset_index(drop=True, inplace=True)

In [26]:
# Display the de-duplicated frame with its renumbered index.
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
779,0,3,female,39.0,0,5,29.1250,Q,
780,1,1,female,19.0,0,0,30.0000,S,B
781,0,3,female,,1,2,23.4500,S,
782,1,1,male,26.0,0,0,30.0000,C,C
