# 1. Functions, libraries and packages

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import kaggle

In [18]:
def dataframe_info(df: pd.DataFrame):
    """
    Build a per-column overview of ``df``.

    Args:
        df (pd.DataFrame): Frame to describe.

    Returns:
        pd.DataFrame: One row per column of ``df``, with the fields:
            - Column_name: the column label.
            - Total records: number of entries in the column (missing ones included).
            - Missing Values: count of missing (NaN) entries.
            - Data type: the column's dtype.
            - Unique values: count of distinct values as reported by ``nunique()``.
    """
    sizes, missing, dtypes, uniques = [], [], [], []

    # Single pass over the columns, collecting one figure per metric.
    for label in df.columns:
        series = df[label]
        sizes.append(series.size)
        missing.append(series.isna().sum())
        dtypes.append(series.dtype)
        uniques.append(series.nunique())

    return pd.DataFrame({
        'Column_name': df.columns,
        'Total records': sizes,
        'Missing Values': missing,
        'Data type': dtypes,
        'Unique values': uniques
    })

# 2. Data import

In [19]:
# Authenticate against the Kaggle API, then fetch the demand-forecasting
# dataset and unzip it into ./data.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'aswathrao/demand-forecasting',
    path='./data',
    unzip=True,
)

Dataset URL: https://www.kaggle.com/datasets/aswathrao/demand-forecasting


In [20]:
# Load the training CSV from the local data folder
# (presumably the file unzipped by the Kaggle download above).
df = pd.read_csv(r'data/train_0irEZ2H.csv')

In [21]:
# Preview the first rows to check the columns loaded as expected.
df.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
0,1,17/01/11,8091,216418,99.0375,111.8625,0,0,20
1,2,17/01/11,8091,216419,99.0375,99.0375,0,0,28
2,3,17/01/11,8091,216425,133.95,133.95,0,0,19
3,4,17/01/11,8091,216233,133.95,133.95,0,0,44
4,5,17/01/11,8091,217390,141.075,141.075,0,0,52


# 3. EDA

In [22]:
# Per-column overview: record count, missing values, dtype and unique values.
dataframe_info(df)

Unnamed: 0,Column_name,Total records,Missing Values,Data type,Unique values
0,record_ID,150150,0,int64,150150
1,week,150150,0,object,130
2,store_id,150150,0,int64,76
3,sku_id,150150,0,int64,28
4,total_price,150150,1,float64,646
5,base_price,150150,0,float64,572
6,is_featured_sku,150150,0,int64,2
7,is_display_sku,150150,0,int64,2
8,units_sold,150150,0,int64,708


In [23]:
# Show every row that has at least one missing value.
df[df.isna().any(axis=1)]

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
136949,193915,23/04/13,9436,245338,,469.5375,0,0,1


In [40]:
# Median total_price per SKU (NaN prices are skipped by median()).
median_map = df['total_price'].groupby(df['sku_id']).median()
median_map

sku_id
216233    127.5375
216418     88.3500
216419     88.3500
216425    128.2500
217217    232.2750
217390    141.7875
217777    232.9875
219009    198.7875
219029    312.7875
219844    234.4125
222087    200.2125
222765    227.2875
223153    212.3250
223245    207.3375
245338    469.5375
245387    469.5375
300021    101.8875
300291    110.4375
320485    205.9125
327492    111.1500
378934    205.9125
398721    469.5375
545621    355.5375
546789    205.9125
547934    177.4125
600934    462.4125
673209    355.5375
679023    205.9125
Name: total_price, dtype: float64

In [45]:
# Impute missing prices with the median total_price of the same SKU.
# Vectorised: look up each row's SKU median via map() and use it only where
# total_price is NaN, instead of running a Python-level lambda on every row.
df['total_price'] = df['total_price'].fillna(df['sku_id'].map(median_map))

In [47]:
# Break the 'dd/mm/yy' week string into integer day / month / year columns
# (the year stays two-digit, e.g. 11) and drop the original text column.
df = df.drop(columns=['week']).join(
    df['week']
    .str.split('/', expand=True)
    .astype(int)
    .set_axis(['day', 'month', 'year'], axis=1)
)

In [48]:
# Check that day / month / year were added and the 'week' column is gone.
df.head(5)

Unnamed: 0,record_ID,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,day,month,year
0,1,8091,216418,99.0375,111.8625,0,0,20,17,1,11
1,2,8091,216419,99.0375,99.0375,0,0,28,17,1,11
2,3,8091,216425,133.95,133.95,0,0,19,17,1,11
3,4,8091,216233,133.95,133.95,0,0,44,17,1,11
4,5,8091,217390,141.075,141.075,0,0,52,17,1,11
