## Import libraries


In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

## Load datasets


In [2]:
# Load a reproducible random sample of the training rows plus the full test set.
# Only `date` and `num_sold` are read from train.csv. The header (row 0) is
# always kept; every other row is skipped when its random draw exceeds
# SAMPLE_FRACTION, i.e. roughly 10% of the rows are kept. The generator is
# seeded so a fresh kernel run selects the same rows again.
# To load the full training file instead: pd.read_csv("../datasets/train.csv")
SAMPLE_SEED = 42
SAMPLE_FRACTION = 0.1
sample_rng = np.random.default_rng(SAMPLE_SEED)

train_df = pd.read_csv(
    '../datasets/train.csv',
    usecols=['date', 'num_sold'],
    skiprows=lambda row: row > 0 and sample_rng.random() > SAMPLE_FRACTION,
)
test_df = pd.read_csv("../datasets/test.csv")

In [3]:
# Preview the first rows of the sampled training frame (date, num_sold).
train_df.head()

Unnamed: 0,date,num_sold
0,2010-01-01,1255.0
1,2010-01-01,2212.0
2,2010-01-05,34.0
3,2010-01-07,223.0
4,2010-01-08,11.0


In [4]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,id,date,country,store,product
0,230130,2017-01-01,Canada,Discount Stickers,Holographic Goose
1,230131,2017-01-01,Canada,Discount Stickers,Kaggle
2,230132,2017-01-01,Canada,Discount Stickers,Kaggle Tiers
3,230133,2017-01-01,Canada,Discount Stickers,Kerneler
4,230134,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode


## Cleaning data


In [5]:
# Show column dtypes and non-null counts to spot type problems and missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2295 entries, 0 to 2294
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      2295 non-null   object 
 1   num_sold  2295 non-null   float64
dtypes: float64(1), object(1)
memory usage: 36.0+ KB


**TODO:**

- `date` must be converted to datetime.
- Handle missing values in `num_sold`.


In [6]:
# Convert the `date` column from strings (object dtype) to datetime.
train_df['date'] = train_df['date'].pipe(pd.to_datetime)

In [7]:
# KNN imputation of missing `num_sold` values.
#
# KNNImputer measures the distance between two rows over the features that are
# observed in both. When it is fitted on `num_sold` alone, a row whose value is
# missing has no observed feature left, no distance is defined, and
# scikit-learn falls back to the plain column mean (so n_neighbors has no
# effect). The date, expressed as days since the earliest date, is therefore
# supplied as a second feature so that neighbours are the rows closest in time.
sale_dates = pd.to_datetime(train_df['date'])  # no-op if already datetime
knn_features = pd.DataFrame({
    'days_since_start': (sale_dates - sale_dates.min()).dt.days,
    'num_sold': train_df['num_sold'],
})

imputer = KNNImputer(n_neighbors=5)  # n_neighbors can be tuned
# fit_transform returns an (n_rows, n_features) array; column 1 is num_sold.
train_df['num_sold_knn'] = np.asarray(imputer.fit_transform(knn_features))[:, 1]

# Compare the original and imputed values side by side
print(train_df[['num_sold', 'num_sold_knn']].head(10))

# Check the remaining missing values after imputation
print("\nJumlah Missing Value setelah imputasi:\n", train_df.isnull().sum())

In [8]:
# Order the rows chronologically.
train_df = train_df.sort_values(by='date')

# Replace the raw `num_sold` column with the imputed `num_sold_knn` column.
# The swap only happens while `num_sold_knn` still exists, so re-running this
# cell is harmless: without the guard, a second run would drop the already
# imputed `num_sold` and the rename would silently do nothing.
if 'num_sold_knn' in train_df.columns:
    train_df = (
        train_df
        .drop(columns=['num_sold'], errors='ignore')  # tolerate an already-removed raw column
        .rename(columns={'num_sold_knn': 'num_sold'})
    )

# Check missing values after the column swap
print("\nJumlah Missing Value setelah perubahan:\n", train_df.isnull().sum())

In [9]:
# Preview the frame after sorting by date and swapping in the imputed num_sold.
train_df.head()

Unnamed: 0,date,num_sold
0,2010-01-01,1255.0
1,2010-01-01,2212.0
2,2010-01-05,34.0
3,2010-01-07,223.0
4,2010-01-08,11.0
