In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Menangani missing values

## Penghapusan seluruh baris

In [3]:
df = pd.DataFrame({'visitor_name':['Ivan','Adhitama',np.nan],
                   'event_name':['Rock Concert 2022','Jazz Festival 2021','Pop Party 2022'],
                   'ticket_sold_year':[2022,np.nan,2022],
                   'event_year':[2022,2021,2022],
                   'ticket_type':['Gold','Silver','Bronze'],
                   'ticket_price':[100000.0,50000.0,30000.0]})
df['ticket_sold_year'] = df['ticket_sold_year'].astype('Int64')
df['event_year'] = df['event_year'].astype('Int64')
df['ticket_price'] = df['ticket_price'].astype(float)
df

Unnamed: 0,visitor_name,event_name,ticket_sold_year,event_year,ticket_type,ticket_price
0,Ivan,Rock Concert 2022,2022.0,2022,Gold,100000.0
1,Adhitama,Jazz Festival 2021,,2021,Silver,50000.0
2,,Pop Party 2022,2022.0,2022,Bronze,30000.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   visitor_name      2 non-null      object 
 1   event_name        3 non-null      object 
 2   ticket_sold_year  2 non-null      Int64  
 3   event_year        3 non-null      Int64  
 4   ticket_type       3 non-null      object 
 5   ticket_price      3 non-null      float64
dtypes: Int64(2), float64(1), object(3)
memory usage: 282.0+ bytes


In [5]:
# hapus baris yang nan
df.dropna()

Unnamed: 0,visitor_name,event_name,ticket_sold_year,event_year,ticket_type,ticket_price
0,Ivan,Rock Concert 2022,2022,2022,Gold,100000.0


In [6]:
# menghapus column
df.drop(columns=["visitor_name", "ticket_sold_year"])

Unnamed: 0,event_name,event_year,ticket_type,ticket_price
0,Rock Concert 2022,2022,Gold,100000.0
1,Jazz Festival 2021,2021,Silver,50000.0
2,Pop Party 2022,2022,Bronze,30000.0


## Pengisian dengan 0

In [7]:
df = pd.DataFrame({'visitor_name':['Ivan','Adhitama','Christanto','Rio','Tokyo'],
                   'age':[25,np.nan,18,21,25],
                   'event_name':['Rock Concert 2022','Jazz Festival 2021','Pop Party 2022','Rock Concert 2022','Pop Party 2022'],
                   'ticket_sold_year':[2022,2021,2022,2022,2022],
                   'event_year':[2022,2021,2022,2022,2022],
                   'ticket_type':['Gold','Silver','Bronze','Silver','Silver'],
                   'ticket_price':[100000.0,50000.0,30000.0,75000.0,60000.0],
                   'reward_point':[400,350,800,np.nan,250]})
df['age'] = df['age'].astype('Int64')
df['ticket_sold_year'] = df['ticket_sold_year'].astype('Int64')
df['event_year'] = df['event_year'].astype('Int64')
df['ticket_price'] = df['ticket_price'].astype(float)
df['reward_point'] = df['reward_point'].astype('Int64')
df

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25.0,Rock Concert 2022,2022,2022,Gold,100000.0,400.0
1,Adhitama,,Jazz Festival 2021,2021,2021,Silver,50000.0,350.0
2,Christanto,18.0,Pop Party 2022,2022,2022,Bronze,30000.0,800.0
3,Rio,21.0,Rock Concert 2022,2022,2022,Silver,75000.0,
4,Tokyo,25.0,Pop Party 2022,2022,2022,Silver,60000.0,250.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   visitor_name      5 non-null      object 
 1   age               4 non-null      Int64  
 2   event_name        5 non-null      object 
 3   ticket_sold_year  5 non-null      Int64  
 4   event_year        5 non-null      Int64  
 5   ticket_type       5 non-null      object 
 6   ticket_price      5 non-null      float64
 7   reward_point      4 non-null      Int64  
dtypes: Int64(4), float64(1), object(3)
memory usage: 472.0+ bytes


In [9]:
df["reward_point"] = df["reward_point"].fillna(0)
df.head()

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25.0,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18.0,Pop Party 2022,2022,2022,Bronze,30000.0,800
3,Rio,21.0,Rock Concert 2022,2022,2022,Silver,75000.0,0
4,Tokyo,25.0,Pop Party 2022,2022,2022,Silver,60000.0,250


## Pengisian dengan rata-rata

In [10]:
df_mean_imp = df.copy()
df_mean_imp["age"] = df_mean_imp["age"].fillna(df_mean_imp["age"].mean().round())
df_mean_imp.head()

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,22,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
3,Rio,21,Rock Concert 2022,2022,2022,Silver,75000.0,0
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


## Pengisian dengan median

In [11]:
df_median_imp = df.copy()
df_median_imp["age"] = df_median_imp["age"].fillna(df_median_imp["age"].median())
df_median_imp.head()

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,23,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
3,Rio,21,Rock Concert 2022,2022,2022,Silver,75000.0,0
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


## Pengisian dengan mode

In [12]:
df_mode_imp = df.copy()
df_mode_imp["age"] = df_mode_imp["age"].fillna(df_mode_imp["age"].mode()[0])
df_mode_imp.head()

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,25,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
3,Rio,21,Rock Concert 2022,2022,2022,Silver,75000.0,0
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


## pengisian dengan nilai sebelumnya

In [13]:
df_ffill_imp = df.copy()
df_ffill_imp["age"] = df_ffill_imp["age"].fillna(method="ffill")
df_ffill_imp.head()

  df_ffill_imp["age"] = df_ffill_imp["age"].fillna(method="ffill")


Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,25,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
3,Rio,21,Rock Concert 2022,2022,2022,Silver,75000.0,0
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


## pengisian dengan nilai sesudahnya

In [14]:
df_bfill_imp = df.copy()
df_bfill_imp["age"] = df_bfill_imp["age"].fillna(method="bfill")
df_bfill_imp.head()

  df_bfill_imp["age"] = df_bfill_imp["age"].fillna(method="bfill")


Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,18,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
3,Rio,21,Rock Concert 2022,2022,2022,Silver,75000.0,0
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


# Menangani duplikasi

## Duplikasi baris - duplikasi sebagian kolom

In [15]:
df = pd.DataFrame({'visitor_name':['Ivan','Adhitama','Christanto','Ivan','Tokyo'],
                   'age':[25,23,18,25,25],
                   'event_name':['Rock Concert 2022','Jazz Festival 2021','Pop Party 2022','Rock Concert 2022','Pop Party 2022'],
                   'ticket_sold_year':[2022,2021,2022,2022,2022],
                   'event_year':[2022,2021,2022,2022,2022],
                   'ticket_type':['Gold','Silver','Bronze','Silver','Silver'],
                   'ticket_price':[100000.0,50000.0,30000.0,75000.0,60000.0],
                   'reward_point':[400,350,800,0,250]})
df['age'] = df['age'].astype('Int64')
df['ticket_sold_year'] = df['ticket_sold_year'].astype('Int64')
df['event_year'] = df['event_year'].astype('Int64')
df['ticket_price'] = df['ticket_price'].astype(float)
df['reward_point'] = df['reward_point'].astype('Int64')
df

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,23,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
3,Ivan,25,Rock Concert 2022,2022,2022,Silver,75000.0,0
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   visitor_name      5 non-null      object 
 1   age               5 non-null      Int64  
 2   event_name        5 non-null      object 
 3   ticket_sold_year  5 non-null      Int64  
 4   event_year        5 non-null      Int64  
 5   ticket_type       5 non-null      object 
 6   ticket_price      5 non-null      float64
 7   reward_point      5 non-null      Int64  
dtypes: Int64(4), float64(1), object(3)
memory usage: 472.0+ bytes


In [19]:
# penghapusan semua baris duplikasi kecuali yang pertama
df1 = df.copy()
df1.drop_duplicates(subset="visitor_name", keep="first", inplace=True)
df1

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,23,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


In [20]:
# penghapusan semua baris duplikasi kecuali yang terakhir
df2 = df.copy()
df2.drop_duplicates(subset="visitor_name", keep="last", inplace=True)
df2

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
1,Adhitama,23,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
3,Ivan,25,Rock Concert 2022,2022,2022,Silver,75000.0,0
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


## Duplikasi baris - duplikasi seluruh kolom

In [21]:
df = pd.DataFrame({'visitor_name':['Ivan','Adhitama','Christanto','Ivan','Tokyo'],
                   'age':[25,23,18,25,25],
                   'event_name':['Rock Concert 2022','Jazz Festival 2021','Pop Party 2022','Rock Concert 2022','Pop Party 2022'],
                   'ticket_sold_year':[2022,2021,2022,2022,2022],
                   'event_year':[2022,2021,2022,2022,2022],
                   'ticket_type':['Gold','Silver','Bronze','Gold','Silver'],
                   'ticket_price':[100000.0,50000.0,30000.0,100000.0,60000.0],
                   'reward_point':[400,350,800,400,250]})
df['age'] = df['age'].astype('Int64')
df['ticket_sold_year'] = df['ticket_sold_year'].astype('Int64')
df['event_year'] = df['event_year'].astype('Int64')
df['ticket_price'] = df['ticket_price'].astype(float)
df['reward_point'] = df['reward_point'].astype('Int64')
df

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,23,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
3,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


In [22]:
# penghapusan semua baris duplikasi kecuali salah satunya
df.drop_duplicates(inplace=True)
df

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,23,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2022,2022,2022,Bronze,30000.0,800
4,Tokyo,25,Pop Party 2022,2022,2022,Silver,60000.0,250


## Duplikasi kolom - duplikasi values

In [23]:
df = pd.DataFrame({'visitor_name':['Ivan','Adhitama','Christanto','Rio','Tokyo'],
                   'age':[25,23,18,21,25],
                   'event_name':['Rock Concert 2022','Jazz Festival 2021','Pop Party 2020','Rock Concert 2022','Pop Party 2020'],
                   'ticket_sold_year':[2022,2021,2020,2022,2020],
                   'event_year':[2022,2021,2020,2022,2020],
                   'ticket_type':['Gold','Silver','Bronze','Gold','Silver'],
                   'ticket_price':[100000.0,50000.0,30000.0,100000.0,60000.0],
                   'reward_point':[400,350,800,100,100]})
df['age'] = df['age'].astype('Int64')
df['ticket_sold_year'] = df['ticket_sold_year'].astype('Int64')
df['event_year'] = df['event_year'].astype('Int64')
df['ticket_price'] = df['ticket_price'].astype(float)
df['reward_point'] = df['reward_point'].astype('Int64')
df

Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,event_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,2022,Gold,100000.0,400
1,Adhitama,23,Jazz Festival 2021,2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2020,2020,2020,Bronze,30000.0,800
3,Rio,21,Rock Concert 2022,2022,2022,Gold,100000.0,100
4,Tokyo,25,Pop Party 2020,2020,2020,Silver,60000.0,100


In [24]:
# pip install fast-ml
from fast_ml.feature_selection import get_duplicate_features

df1 = df.copy()

# mendapatkan kolom-kolom dengan duplikasi values
duplicate_features = get_duplicate_features(df1)
print('Duplikasi kolom:\n')
print(duplicate_features)

# semua kolom dengan duplikasi values sebagai list
duplicate_features_list = duplicate_features.loc[duplicate_features['Desc']=='Duplicate Values', 'feature2'].to_list()
print('\nList kolom duplikasi values:\n')
print(duplicate_features_list)

# hapus semua kolom dengan duplikasi values
print('\nBanyaknya kolom sebelum penghapusan kolom duplikasi values: ' + str(df1.shape[1]))
df1.drop(columns=duplicate_features_list, inplace=True)
print('Banyaknya kolom setelah penghapusan kolom duplikasi values: ' + str(df1.shape[1]))
df1

Duplikasi kolom:

               Desc          feature1          feature2
0  Duplicate Values  ticket_sold_year        event_year
1   Duplicate Index        event_name  ticket_sold_year
2   Duplicate Index        event_name        event_year

List kolom duplikasi values:

['event_year']

Banyaknya kolom sebelum penghapusan kolom duplikasi values: 8
Banyaknya kolom setelah penghapusan kolom duplikasi values: 7


Unnamed: 0,visitor_name,age,event_name,ticket_sold_year,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,2022,Gold,100000.0,400
1,Adhitama,23,Jazz Festival 2021,2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2020,2020,Bronze,30000.0,800
3,Rio,21,Rock Concert 2022,2022,Gold,100000.0,100
4,Tokyo,25,Pop Party 2020,2020,Silver,60000.0,100


## Duplikasi kolom - duplikasi index

In [25]:
# pip install fast-ml
from fast_ml.feature_selection import get_duplicate_features

df2 = df.copy()

# mendapatkan kolom-kolom dengan duplikasi values
duplicate_features = get_duplicate_features(df2)
print('Duplikasi kolom:\n')
print(duplicate_features)

# semua kolom dengan duplikasi values sebagai list
duplicate_index_features_list = duplicate_features.loc[duplicate_features['Desc']=='Duplicate Index', 'feature2'].to_list()
print('\nList kolom duplikasi index:\n')
print(duplicate_features_list)

# hapus semua kolom dengan duplikasi values
print('\nBanyaknya kolom sebelum penghapusan kolom duplikasi values: ' + str(df2.shape[1]))
df2.drop(columns=duplicate_index_features_list, inplace=True)
print('Banyaknya kolom setelah penghapusan kolom duplikasi values: ' + str(df2.shape[1]))
df2

Duplikasi kolom:

               Desc          feature1          feature2
0  Duplicate Values  ticket_sold_year        event_year
1   Duplicate Index        event_name  ticket_sold_year
2   Duplicate Index        event_name        event_year

List kolom duplikasi index:

['event_year']

Banyaknya kolom sebelum penghapusan kolom duplikasi values: 8
Banyaknya kolom setelah penghapusan kolom duplikasi values: 6


Unnamed: 0,visitor_name,age,event_name,ticket_type,ticket_price,reward_point
0,Ivan,25,Rock Concert 2022,Gold,100000.0,400
1,Adhitama,23,Jazz Festival 2021,Silver,50000.0,350
2,Christanto,18,Pop Party 2020,Bronze,30000.0,800
3,Rio,21,Rock Concert 2022,Gold,100000.0,100
4,Tokyo,25,Pop Party 2020,Silver,60000.0,100


# Menangani format/tipe data yang salah

## Kolom numerik dengan tipe data string

In [26]:
df = pd.DataFrame({'quantity':['100','250']})
df.info()
df['quantity'] = df['quantity'].astype(int)
print('\n')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   quantity  2 non-null      object
dtypes: object(1)
memory usage: 148.0+ bytes


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   quantity  2 non-null      int64
dtypes: int64(1)
memory usage: 148.0 bytes


## Kolom tanggal dengan tipe data string

In [27]:
df = pd.DataFrame({'date':['20220810','20220815']})
df.info()
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
print('\n')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    2 non-null      object
dtypes: object(1)
memory usage: 148.0+ bytes


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 148.0 bytes


## Kapitalisasi teks yang tidak konsisten

In [28]:
df = pd.DataFrame({'product_name':['ICE TEA','Ice Tea','ice tea']})
print(df)
df['product_name'] = df['product_name'].str.upper()
print('\n')
print(df)

  product_name
0      ICE TEA
1      Ice Tea
2      ice tea


  product_name
0      ICE TEA
1      ICE TEA
2      ICE TEA


In [29]:
df = pd.DataFrame({'product_name':['ICE TEA','Ice Tea','ice tea']})
print(df)
df['product_name'] = df['product_name'].str.lower()
print('\n')
print(df)

  product_name
0      ICE TEA
1      Ice Tea
2      ice tea


  product_name
0      ice tea
1      ice tea
2      ice tea


## Extra whitespaces pada data teks

In [30]:
df = pd.DataFrame({'product_name':['   ICE TEA   ','ICE COFFEE   ','    RAINBOW CAKE']})
print(df)
df['product_name'] = df['product_name'].str.strip()
print('\n')
print(df)

       product_name
0        ICE TEA   
1     ICE COFFEE   
2      RAINBOW CAKE


   product_name
0       ICE TEA
1    ICE COFFEE
2  RAINBOW CAKE


## Detail teks yang tidak perlu

In [31]:
df = pd.DataFrame({'product_name':['ICE TEA (SOLD PER 1 GLASS)','RAINBOW CAKE (SOLD PER 1 SLICE)']})
print(df)
df['product_name'] = df['product_name'].str.replace(r' \(SOLD.*', '', regex=True)
print('\n')
print(df)

                      product_name
0       ICE TEA (SOLD PER 1 GLASS)
1  RAINBOW CAKE (SOLD PER 1 SLICE)


   product_name
0       ICE TEA
1  RAINBOW CAKE


## Penulisan teks yang tidak konsisten

In [32]:
df = pd.DataFrame({'address':['jln. harapan','jl. baru']})
print(df)
df['address'] = df['address'].str.replace(r'jl.* ', 'jalan ', regex=True)
print('\n')
print(df)

        address
0  jln. harapan
1      jl. baru


         address
0  jalan harapan
1     jalan baru
