<a href="https://colab.research.google.com/github/Slamm-alt/DataCleansing-Online-Shopping/blob/main/2318076DataCleansing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive into the runtime at /content/drive so the CSV stored
# under MyDrive can be read. google.colab is the Colab-provided package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 2. Baca Dataset Produk yang Kotor

In [None]:
# Load the raw ("dirty") product dataset from the mounted Drive.
# Adjust file_path to wherever the CSV lives in your own Colab/Drive setup.
file_path = "/content/drive/MyDrive/Colab Notebooks/data_products_id_small.csv"
df_dirty = pd.read_csv(file_path)

In [None]:
# `df` is a second name for the same object as `df_dirty` (no copy is made),
# so in-place changes made through either name are visible through both.
df = df_dirty

# Preview the first rows of the raw data
print("=== Dataset Asli ===")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nInfo Dataset:")
df.info()

# (rows, columns) of the raw data
print("\nJumlah Data & Kolom:", df.shape)

=== Dataset Asli ===
    product_id                             image  \
0  12523520280  e6205d4fcb8a485235778ad210ac4d90   
1  17941082335  8563c9fc31b880ed105f91b8d43e6d88   
2  23517487660  id-11134207-7qul7-leyb85rn449s74   
3   7978948479  d6b77df3df58d6a13b3d78eeb303cbb5   
4   4653307963  f888e78a3845120009602a0b23e65ce0   

                                                name             shop_name  \
0  PS3 Super Slim 1TB Full Game + 2 Stik getar + ...     spirit elektronik   
1  GAMEBOY PVP DUAL ANALOG FULLSET BISA MAIN DI T...  Gaming Console Store   
2                           Pulsa Min A10 K1T Remote           GAMOLBERKAH   
3                 DUS PSP-2004/ DUS PSP STREET E1004              GAMES168   
4  playstation 2 full game|| hardis external full...           Milkha_shop   

      shopid main_category sub_category  
0  379815570    Elektronik  Konsol Game  
1  268909000    Elektronik  Konsol Game  
2  102394348    Elektronik  Konsol Game  
3   28318233    Elektronik  K

# 3. Cek Missing Value pada Data Produk

In [None]:
# Count the missing entries in each column and print the tally
missing_per_column = df.isna().sum()
print("\n=== Cek Missing Values ===")
print(missing_per_column)


=== Cek Missing Values ===
product_id       0
image            0
name             0
shop_name        1
shopid           0
main_category    0
sub_category     0
dtype: int64


# 4. Perbaikan Nilai yang Hilang

In [None]:
# Fill the missing shop names with the placeholder category 'Unknown'.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df_dirty['shop_name']: that form acts on an
# intermediate Series, raises a FutureWarning in pandas 2.x and no longer
# updates the DataFrame once Copy-on-Write is enabled (the pandas 3.0 default).
df_dirty['shop_name'] = df_dirty['shop_name'].fillna("Unknown")

# 5. Cek Ulang Perbaikan Missing Value

In [None]:
# Recount the missing entries per column to confirm the fill took effect
remaining_missing = df_dirty.isna().sum()
print("\n=== Missing Values Setelah Perbaikan ===")
print(remaining_missing)


=== Missing Values Setelah Perbaikan ===
product_id       0
image            0
name             0
shop_name        0
shopid           0
main_category    0
sub_category     0
dtype: int64


# 6. Cek & Hapus Duplikasi

In [None]:
# Report how many fully identical rows exist
print("\n=== Cek Data Duplikat ===")
is_duplicate = df_dirty.duplicated()
dup_count = is_duplicate.sum()
print(f"Jumlah baris duplikat: {dup_count}")

# Row counts on either side of the de-duplication; the next cell prints them
before = len(df_dirty)
df_clean = df_dirty.drop_duplicates()
after = len(df_clean)


=== Cek Data Duplikat ===
Jumlah baris duplikat: 0


In [None]:
# Summarise the duplicate removal: rows before, rows after, rows dropped
print(
    f"Jumlah baris sebelum cleaning: {before}",
    f"Jumlah baris setelah cleaning: {after}",
    f"Jumlah baris terhapus (duplikat): {before - after}",
    sep="\n",
)

Jumlah baris sebelum cleaning: 162205
Jumlah baris setelah cleaning: 162205
Jumlah baris terhapus (duplikat): 0


# 7. DATA ENRICHMENT

- Mengisi nilai 0 pada kolom numerik dengan rata-rata (mean) dari nilai positif di kolom tersebut

In [None]:
# Identifier columns are stored as numbers but are labels, not measurements:
# casting them to float or replacing a value with a column mean would corrupt
# them, so they are left out of the zero-imputation below.
ID_COLUMNS = ["product_id", "shopid"]

# NOTE(review): this cell edits `df`, which is still the same object as
# `df_dirty`, not the de-duplicated `df_clean` — confirm this is intended.
measure_cols = [
    col
    for col in df.select_dtypes(include=["number"]).columns
    if col not in ID_COLUMNS
]

# Make every numeric measure column float so a fractional mean can be stored
for col in measure_cols:
    df[col] = df[col].astype(float)

changed_rows = []

for col in measure_cols:
    zero_mask = df[col] == 0
    if not zero_mask.any():  # only columns that actually contain a 0
        continue

    # Mean of the strictly positive values; NaN when there are none
    mean_val = df.loc[df[col] > 0, col].mean()
    if pd.isna(mean_val):
        # Nothing to average: keep the zeros rather than turning them into NaN
        continue

    # Record the affected rows before they are overwritten. A dedicated name is
    # used so the row count stored in `before` by the duplicate check survives.
    zero_log = df.loc[zero_mask, [col]].rename(columns={col: "Sebelum"})
    zero_log["Kolom"] = col
    zero_log["Sesudah"] = mean_val

    # Replace the zeros with the column mean
    df.loc[zero_mask, col] = mean_val

    changed_rows.append(zero_log)

# Report what changed (before → after), or that nothing needed changing
if changed_rows:
    result = pd.concat(changed_rows, axis=0).reset_index(drop=True)
    print("=== Data yang berubah (sebelum → sesudah) ===")
    print(result.head(10))  # first 10 rows only
else:
    print("Tidak ada nilai 0 yang ditemukan.")

Tidak ada nilai 0 yang ditemukan.


# 8. Cek Kolom Tidak Relevan

In [None]:
# 1. Cek jumlah nilai unik per kolom
print("=== Jumlah nilai unik per kolom ===")
print(df.nunique())

=== Jumlah nilai unik per kolom ===
product_id       162205
image            160746
name             159442
shop_name         49866
shopid            49865
main_category        24
sub_category        325
dtype: int64


In [None]:
# Identifikasi kolom kandidat tidak relevan
irrelevant_cols = []

# Kolom dengan 1 nilai unik
single_value_cols = df.nunique()[df.nunique() == 1].index.tolist()
irrelevant_cols.extend(single_value_cols)

In [None]:
# Hapus kolom tidak relevan
df = df.drop(columns=set(irrelevant_cols), errors='ignore')

print("\n=== Setelah perbaikan ===")
print("Kolom dataset sekarang:", df.columns.tolist())


=== Setelah perbaikan ===
Kolom dataset sekarang: ['product_id', 'image', 'name', 'shop_name', 'shopid', 'main_category', 'sub_category']


# 9. Menampilkan Beberapa Data yang Sudah Dibersihkan

*  Menampilkan dataset akhir setelah proses cleansing:

* Tidak ada missing value (nilai kosong pada kolom `shop_name` sudah diisi dengan kategori default "Unknown").

* Tidak ada duplikasi data.

* Tidak ada kolom tidak relevan yang hanya memiliki 1 nilai unik.

In [None]:
# Show the first ten rows of the cleaned frame as a rich table
clean_preview = df.head(10)
print("\nContoh data bersih:")
display(clean_preview)


Contoh data bersih:


Unnamed: 0,product_id,image,name,shop_name,shopid,main_category,sub_category
0,12523520000.0,e6205d4fcb8a485235778ad210ac4d90,PS3 Super Slim 1TB Full Game + 2 Stik getar + ...,spirit elektronik,379815570.0,Elektronik,Konsol Game
1,17941080000.0,8563c9fc31b880ed105f91b8d43e6d88,GAMEBOY PVP DUAL ANALOG FULLSET BISA MAIN DI T...,Gaming Console Store,268909000.0,Elektronik,Konsol Game
2,23517490000.0,id-11134207-7qul7-leyb85rn449s74,Pulsa Min A10 K1T Remote,GAMOLBERKAH,102394348.0,Elektronik,Konsol Game
3,7978948000.0,d6b77df3df58d6a13b3d78eeb303cbb5,DUS PSP-2004/ DUS PSP STREET E1004,GAMES168,28318233.0,Elektronik,Konsol Game
4,4653308000.0,f888e78a3845120009602a0b23e65ce0,playstation 2 full game|| hardis external full...,Milkha_shop,293187724.0,Elektronik,Konsol Game
5,14120980000.0,58259a47b7f64a0456ab3012afe80c85,kartu karakter animal crossing get Nintendo sw...,Klontong Gaming,134963878.0,Elektronik,Konsol Game
6,868037500.0,87e619fd39c66ca943c3c6dfddc96f97,Garskin STICKER Premium PSP 3000 2000 1000 Fre...,Garskin Parrotskin,53400287.0,Elektronik,Konsol Game
7,4557886000.0,6d7a7b2971e19c32e85ab31cc8d1863e,PVP 3 SLIM GAME NITENDO // MAINAN ANAK // KADO...,Terang ku,9026886.0,Elektronik,Konsol Game
8,3432470000.0,id-11134201-23030-uwr66k628mov49,PS3 Slim Sony Void CFW 500GB | 1TB | Garansi 1...,Matahari Game,192917696.0,Elektronik,Konsol Game
9,3504742000.0,id-11134207-7qul6-lg63etj8ss5327,Nintendo DS Lite FREE Games Grade B,kazokugames,10548912.0,Elektronik,Konsol Game
