# Membersihkan duplikasi data dengan pandas

In [2]:
import pandas as pd

In [3]:
def load_data(path='./Data/data_train_duplicate.csv', last_row=300,
              columns=('Survived', 'Pclass', 'Sex', 'Cabin', 'Embarked')):
    """Load a small, NaN-free subset of a CSV file for the duplicate-handling demo.

    Parameters
    ----------
    path : str or path-like, default './Data/data_train_duplicate.csv'
        CSV file to read.
    last_row : int, default 300
        Last index label to keep. The slice is label-based (``.loc``), so the
        row carrying this label is included in the result.
    columns : sequence of str
        Names of the columns to keep, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        The selected rows and columns with every row that contains a missing
        value removed. The original index labels are preserved, so the index
        may have gaps.
    """
    df_all = pd.read_csv(path)
    # .loc slices by label and includes the end label, unlike positional slicing
    subset = df_all.loc[:last_row, list(columns)]
    return subset.dropna()

# Load the subset that the rest of the notebook works on
df = load_data()
# Bare last expression so the notebook renders the frame
df

Unnamed: 0,Survived,Pclass,Sex,Cabin,Embarked
0,0,1,male,C30,S
1,1,1,female,D33,C
9,1,3,male,E121,S
10,1,1,female,B22,S
14,0,1,male,B51 B53 B55,S
...,...,...,...,...,...
271,1,1,male,C93,S
278,0,1,male,C111,C
286,1,1,male,C148,C
299,1,1,female,D21,S


In [4]:
# Preview the first rows of the subset (head() returns 5 rows by default)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Cabin,Embarked
0,0,1,male,C30,S
1,1,1,female,D33,C
9,1,3,male,E121,S
10,1,1,female,B22,S
14,0,1,male,B51 B53 B55,S


## 1. Pencarian baris duplikasi

In [5]:
# Single column: flag each Cabin value that already appeared in an earlier row
df['Cabin'].duplicated()

0      False
1      False
9      False
10     False
14     False
       ...  
271    False
278    False
286    False
299    False
300    False
Name: Cabin, Length: 80, dtype: bool

In [6]:
# Whole DataFrame: a row is flagged only when all of its columns match an earlier row
df.duplicated()

0      False
1      False
9      False
10     False
14     False
       ...  
271    False
278    False
286    False
299    False
300    False
Length: 80, dtype: bool

In [7]:
# Compare only the listed columns when deciding whether a row is a duplicate
df.duplicated(subset=['Survived', 'Pclass', 'Sex'])

0      False
1      False
9      False
10      True
14      True
       ...  
271     True
278     True
286     True
299     True
300     True
Length: 80, dtype: bool

## 2. Penghitungan duplikasi dan non-duplikasi

In [8]:
# Count the rows whose Cabin value repeats an earlier row (True counts as 1)
df['Cabin'].duplicated().sum()

11

In [9]:
# Count the rows that are exact copies of an earlier row
df.duplicated().sum()

3

In [10]:
# Count the rows whose (Survived, Pclass, Sex) combination repeats an earlier row
df.duplicated(subset=['Survived', 'Pclass', 'Sex']).sum()

70

In [11]:
# Count the non-duplicate rows: ~ inverts the boolean mask before summing
(~df.duplicated()).sum()

77

## 3. Mengekstraksi baris duplikat dengan menggunakan loc

In [12]:
# Use the boolean mask from duplicated() with .loc to view the rows it flagged
df.loc[df.duplicated(), :]

Unnamed: 0,Survived,Pclass,Sex,Cabin,Embarked
138,1,2,female,F33,S
169,1,1,female,B77,S
237,1,1,female,B96 B98,S


## 4. Menentukan data duplikat mana yang akan ditandai menggunakan keep

In [13]:
# keep='first': the first occurrence is treated as the original, later copies are flagged
df.loc[df.duplicated(keep='first'), :]

Unnamed: 0,Survived,Pclass,Sex,Cabin,Embarked
138,1,2,female,F33,S
169,1,1,female,B77,S
237,1,1,female,B96 B98,S


In [14]:
# keep='last': the last occurrence is treated as the original, earlier copies are flagged
df.loc[df.duplicated(keep='last'), :]

Unnamed: 0,Survived,Pclass,Sex,Cabin,Embarked
36,1,1,female,B77,S
77,1,1,female,B96 B98,S
134,1,2,female,F33,S


In [15]:
# Third option, keep=False: flag every member of each duplicate group
df.loc[df.duplicated(keep=False), :]

Unnamed: 0,Survived,Pclass,Sex,Cabin,Embarked
36,1,1,female,B77,S
77,1,1,female,B96 B98,S
134,1,2,female,F33,S
138,1,2,female,F33,S
169,1,1,female,B77,S
237,1,1,female,B96 B98,S


## 5. Menghapus baris duplikasi

In [16]:
# Note the row count: originally 80, now 77. The result is not assigned, so df itself is unchanged
df.drop_duplicates()

Unnamed: 0,Survived,Pclass,Sex,Cabin,Embarked
0,0,1,male,C30,S
1,1,1,female,D33,C
9,1,3,male,E121,S
10,1,1,female,B22,S
14,0,1,male,B51 B53 B55,S
...,...,...,...,...,...
271,1,1,male,C93,S
278,0,1,male,C111,C
286,1,1,male,C148,C
299,1,1,female,D21,S


In [17]:
# Keep only the first row for each distinct (Survived, Pclass, Sex) combination
df.drop_duplicates(subset=['Survived', 'Pclass', 'Sex'])

Unnamed: 0,Survived,Pclass,Sex,Cabin,Embarked
0,0,1,male,C30,S
1,1,1,female,D33,C
9,1,3,male,E121,S
25,1,2,female,D,S
38,1,1,male,A6,S
48,0,3,male,F G73,S
63,0,2,male,D,C
113,1,3,female,E121,S
136,1,2,male,F4,S
172,0,1,female,C49,C
