# Self Practice 3 - Data Cleansing

___

## Import Library

In [1]:
import pandas as pd
from sklearn.datasets import load_iris

## Data Understanding

In [2]:
# Muat dataset iris
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Identifikasi jumlah data
print("Jumlah data:", df.shape[0])

Jumlah data: 150


In [3]:
# Identifikasi tipe data
print("\nTipe data:")
print(df.dtypes)


Tipe data:
sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int32
dtype: object


In [4]:
# Identifikasi nilai yang hilang
print("\nJumlah nilai yang hilang:")
print(df.isnull().sum())


Jumlah nilai yang hilang:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64


In [5]:
# Identifikasi outlier (menggunakan IQR)
for column in df.columns[:-1]:  # Loop melalui fitur numerik
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    print(f"\nOutlier pada kolom {column}:")
    print(outliers)


Outlier pada kolom sepal length (cm):
Empty DataFrame
Columns: [sepal length (cm), sepal width (cm), petal length (cm), petal width (cm), target]
Index: []

Outlier pada kolom sepal width (cm):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
15                5.7               4.4                1.5               0.4   
32                5.2               4.1                1.5               0.1   
33                5.5               4.2                1.4               0.2   
60                5.0               2.0                3.5               1.0   

    target  
15       0  
32       0  
33       0  
60       1  

Outlier pada kolom petal length (cm):
Empty DataFrame
Columns: [sepal length (cm), sepal width (cm), petal length (cm), petal width (cm), target]
Index: []

Outlier pada kolom petal width (cm):
Empty DataFrame
Columns: [sepal length (cm), sepal width (cm), petal length (cm), petal width (cm), target]
Index: []


In [6]:
# Deskripsi statistik data
print("\nDeskripsi statistik:")
print(df.describe())


Deskripsi statistik:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  
50%            1.300000    1.000000  
75%            1.800000    2.000000  
max            2.500000    2.000000  


## Data Cleansing

In [7]:
# Muat dataset iris
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Identifikasi jumlah data
print("Jumlah data:", df.shape[0])

Jumlah data: 150


In [8]:
# Identifikasi tipe data
print("\nTipe data:")
print(df.dtypes)


Tipe data:
sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int32
dtype: object


In [9]:
# Identifikasi nilai yang hilang
print("\nJumlah nilai yang hilang:")
print(df.isnull().sum())


Jumlah nilai yang hilang:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64


In [10]:
# Identifikasi outlier (menggunakan IQR)
for column in df.columns[:-1]:  # Loop melalui fitur numerik
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    print(f"\nOutlier pada kolom {column}:")
    print(outliers)


Outlier pada kolom sepal length (cm):
Empty DataFrame
Columns: [sepal length (cm), sepal width (cm), petal length (cm), petal width (cm), target]
Index: []

Outlier pada kolom sepal width (cm):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
15                5.7               4.4                1.5               0.4   
32                5.2               4.1                1.5               0.1   
33                5.5               4.2                1.4               0.2   
60                5.0               2.0                3.5               1.0   

    target  
15       0  
32       0  
33       0  
60       1  

Outlier pada kolom petal length (cm):
Empty DataFrame
Columns: [sepal length (cm), sepal width (cm), petal length (cm), petal width (cm), target]
Index: []

Outlier pada kolom petal width (cm):
Empty DataFrame
Columns: [sepal length (cm), sepal width (cm), petal length (cm), petal width (cm), target]
Index: []


In [11]:
# Deskripsi statistik data
print("\nDeskripsi statistik:")
print(df.describe())


Deskripsi statistik:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  
50%            1.300000    1.000000  
75%            1.800000    2.000000  
max            2.500000    2.000000  


In [12]:
# Pembersihan data
# 1. Pengisian nilai yang hilang (jika ada)
# Dalam dataset iris, tidak ada nilai yang hilang, jadi bagian ini bisa dilewati.
# Namun, jika ada, Anda bisa menggunakan:
df['sepal length (cm)'].fillna(df['sepal length (cm)'].mean(), inplace=True) 
df['sepal width (cm)'].fillna(df['sepal width (cm)'].median(), inplace=True)
df['petal length (cm)'].fillna(df['petal length (cm)'].mode()[0], inplace=True)

# 2. Menghapus baris dengan data yang salah (jika ada)
# Dalam dataset iris, tidak ada data yang salah secara jelas.
# Namun, jika ada, Anda bisa menggunakan:
df.drop(df[df['sepal length (cm)'] < 0].index, inplace=True)

# 3. Mengoreksi nilai outlier
for column in df.columns[:-1]:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower_bound, upper_bound)

# Setelah pembersihan data
print("\nDeskripsi statistik setelah pembersihan:")
print(df.describe())


Deskripsi statistik setelah pembersihan:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000         150.00000         150.000000   
mean            5.843333           3.05400           3.758000   
std             0.828066           0.42539           1.765298   
min             4.300000           2.05000           1.000000   
25%             5.100000           2.80000           1.600000   
50%             5.800000           3.00000           4.350000   
75%             6.400000           3.30000           5.100000   
max             7.900000           4.05000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  
50%            1.300000    1.000000  
75%            1.800000    2.000000  
max            2.500000    2.000000  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sepal length (cm)'].fillna(df['sepal length (cm)'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sepal width (cm)'].fillna(df['sepal width (cm)'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work be