In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read the data

In [2]:
df_kanzler = pd.read_csv("data/tokped_kanzler.csv")
df_fiesta = pd.read_csv("data/tokped_fiesta.csv")

## Handling NaN values

In [3]:
# check null values
print(df_kanzler.isnull().sum())
print(df_fiesta.isnull().sum())

nama_produk         0
harga_produk        0
penjual            12
lokasi_toko        12
jumlah_terjual     65
rating_produk     111
dtype: int64
nama_produk        80
harga_produk       80
penjual            80
lokasi_toko        80
jumlah_terjual    539
rating_produk     694
dtype: int64


In [4]:
# remove null values
df_kanzler_clean = df_kanzler.dropna()
df_fiesta_clean = df_fiesta.dropna()

In [5]:
# check before and after removing null values
print(df_kanzler.shape)
print(df_kanzler_clean.shape)

(2518, 6)
(2395, 6)


In [6]:
print(df_fiesta.shape)
print(df_fiesta_clean.shape)

(2539, 6)
(1845, 6)


## Data Cleaning

#### Kanzler

In [7]:
# Syntax to change symbols that are not used in values
df_kanzler_clean['harga_produk'].replace('Rp','', regex=True, inplace=True)
df_kanzler_clean['harga_produk'].replace('\.','', regex=True, inplace=True)
df_kanzler_clean['jumlah_terjual'].replace('\+ terjual','', regex=True, inplace=True)
df_kanzler_clean['jumlah_terjual'].replace('terjual','', regex=True, inplace=True)
df_kanzler_clean['jumlah_terjual'].replace('rb',"000", regex=True, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_kanzler_clean['harga_produk'].replace('Rp','', regex=True, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kanzler_clean['harga_produk'].replace('Rp','', regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: 

In [8]:
df_kanzler_clean.harga_produk = df_kanzler_clean.harga_produk.astype('int64')
df_kanzler_clean.jumlah_terjual = df_kanzler_clean.jumlah_terjual.astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kanzler_clean.harga_produk = df_kanzler_clean.harga_produk.astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kanzler_clean.jumlah_terjual = df_kanzler_clean.jumlah_terjual.astype('int64')


In [9]:
df_kanzler_clean = df_kanzler_clean.drop_duplicates()
df_kanzler_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1877 entries, 0 to 2512
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   nama_produk     1877 non-null   object 
 1   harga_produk    1877 non-null   int64  
 2   penjual         1877 non-null   object 
 3   lokasi_toko     1877 non-null   object 
 4   jumlah_terjual  1877 non-null   int64  
 5   rating_produk   1877 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 102.6+ KB


#### Fiesta

In [10]:
# Syntax to change symbols that are not used in values
df_fiesta_clean['harga_produk'].replace('Rp','', regex=True, inplace=True)
df_fiesta_clean['harga_produk'].replace('\.','', regex=True, inplace=True)
df_fiesta_clean['jumlah_terjual'].replace('\+ terjual','', regex=True, inplace=True)
df_fiesta_clean['jumlah_terjual'].replace('terjual','', regex=True, inplace=True)
df_fiesta_clean['jumlah_terjual'].replace('rb',"000", regex=True, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fiesta_clean['harga_produk'].replace('Rp','', regex=True, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fiesta_clean['harga_produk'].replace('Rp','', regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: va

In [11]:
df_fiesta_clean.harga_produk = df_fiesta_clean.harga_produk.astype('int64')
df_fiesta_clean.jumlah_terjual = df_fiesta_clean.jumlah_terjual.astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fiesta_clean.harga_produk = df_fiesta_clean.harga_produk.astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fiesta_clean.jumlah_terjual = df_fiesta_clean.jumlah_terjual.astype('int64')


In [12]:
df_fiesta_clean = df_fiesta_clean.drop_duplicates()
df_fiesta_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1364 entries, 0 to 2533
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   nama_produk     1364 non-null   object 
 1   harga_produk    1364 non-null   int64  
 2   penjual         1364 non-null   object 
 3   lokasi_toko     1364 non-null   object 
 4   jumlah_terjual  1364 non-null   int64  
 5   rating_produk   1364 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 74.6+ KB
