In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
# Load the wide item-level sales table and the cleaned calendar.
# NOTE(review): the two files are read from different roots ("../data" vs "./data") - confirm both paths are intended.
sales_csv = "../data/item_sales.csv"
calendar_csv = "./data/calendar_clean.csv"

df_sales = pd.read_csv(sales_csv)
df_calendar = pd.read_csv(calendar_csv)

## Start cleaning __df_sales__

In [3]:
# Summarise the index range, column count, dtypes and memory footprint of the raw sales table.
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1920 entries, id to d_1913
dtypes: int64(1913), object(7)
memory usage: 446.6+ MB


In [4]:
# Print the (rows, columns) shape, then preview the first row of the wide table.
print(df_sales.shape)
df_sales.head(1)

(30490, 1920)


Unnamed: 0,id,item,category,department,store,store_code,region,d_1,d_2,d_3,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,1,3,0,1,1,1,3,0,1,1


### Checking duplicated rows and nulls

In [5]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df_sales = df_sales.drop_duplicates()

In [6]:
# Count missing values per column (isna is the same check as isnull).
df_sales.isna().sum()

id            0
item          0
category      0
department    0
store         0
             ..
d_1909        0
d_1910        0
d_1911        0
d_1912        0
d_1913        0
Length: 1920, dtype: int64

### Creating a range of dates to convert the d_1, d_2, d_3, ... columns into dates

In [7]:
# First and last calendar dates, used to label the daily sales columns.
# The CSV is read without date parsing, so these are compared as strings;
# that matches chronological order only for ISO-style (YYYY-MM-DD) values.
calendar_dates = df_calendar['date']
start_date, end_date = calendar_dates.min(), calendar_dates.max()

In [8]:
# The first 7 columns identify the row (id, item, category, ...);
# every remaining column holds one day of sales (d_1, d_2, ...).
columns_new = df_sales.columns[:7].to_list()
n_day_columns = df_sales.shape[1] - len(columns_new)

# Generate exactly one date per day column, anchored at the calendar's first date.
# Building the range from start to end instead would make the column assignment
# fail with a length mismatch whenever the calendar covers more days than the sales table.
date_range = pd.date_range(start=start_date, periods=n_day_columns, freq='D')

# Fail loudly if the sales history runs past the last calendar date.
if n_day_columns and date_range[-1] > pd.Timestamp(end_date):
    raise ValueError(
        f"Sales table has {n_day_columns} day columns, which extends beyond "
        f"the calendar end date {end_date}"
    )

date_range = date_range.to_list()
columns_for_rename = columns_new + date_range

df_sales.columns = columns_for_rename

In [9]:
# Preview one row to check the column labels after the rename.
df_sales.head(1)

Unnamed: 0,id,item,category,department,store,store_code,region,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,...,2016-04-15 00:00:00,2016-04-16 00:00:00,2016-04-17 00:00:00,2016-04-18 00:00:00,2016-04-19 00:00:00,2016-04-20 00:00:00,2016-04-21 00:00:00,2016-04-22 00:00:00,2016-04-23 00:00:00,2016-04-24 00:00:00
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,1,3,0,1,1,1,3,0,1,1


In [10]:
# Work on an independent copy so columns added for the clustering input do not alter df_sales.
df_sales1 = df_sales.copy() # this copy is used only for the clustering aggregation

In [11]:
# Total units sold per row = sum of all daily sales columns.
# Any existing 'sales' total is excluded first: it is numeric too, so without
# this a re-run of the cell would add the previous total into the new one.
df_sales1['sales'] = (
    df_sales1.drop(columns='sales', errors='ignore')
    .select_dtypes(include='number')
    .sum(axis=1)
)
df_sales1.head(1)

Unnamed: 0,id,item,category,department,store,store_code,region,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,...,2016-04-16 00:00:00,2016-04-17 00:00:00,2016-04-18 00:00:00,2016-04-19 00:00:00,2016-04-20 00:00:00,2016-04-21 00:00:00,2016-04-22 00:00:00,2016-04-23 00:00:00,2016-04-24 00:00:00,sales
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,3,0,1,1,1,3,0,1,1,600


## Aggregating total sales per item for clustering

In [12]:
# Collapse the per-row totals into one total per item (the item becomes the index).
df_sales_cluster = df_sales1.groupby('item')[['sales']].sum()

In [13]:
# Print the shape of the per-item totals, then preview five random rows.
# sample() is called without a seed, so the rows shown differ between runs.
print(df_sales_cluster.shape)
df_sales_cluster.sample(5)

(3049, 1)


Unnamed: 0_level_0,sales
item,Unnamed: 1_level_1
SUPERMARKET_2_205,7679
ACCESORIES_2_125,2219
HOME_&_GARDEN_2_306,1560
ACCESORIES_1_226,10994
SUPERMARKET_3_173,2956


In [14]:
# Optional export of the per-item totals (left disabled); uncomment both lines to write the CSV.
# file_name = "df_sales_cluster.csv"
# df_sales_cluster.to_csv(file_name)

### Keeping the first 7 columns of **df_sales**, extracting the date columns and melting them into a single `date` column

In [15]:
# Split the wide table into its 7 identifier columns and its per-day columns.
first_columns = df_sales.iloc[:, :7]
date_columns = df_sales.columns[7:].to_list()

# Reshape wide -> long: one row per (identifier columns, date) with its sales value.
df_sales_melted = pd.melt(
    df_sales,
    id_vars=first_columns.columns.tolist(),
    value_vars=date_columns,
    var_name='date',
    value_name='sales',
)

In [16]:
# Print the shape of the long-format table, then preview its first five rows.
print(df_sales_melted.shape)
df_sales_melted.head(5)

(58327370, 9)


Unnamed: 0,id,item,category,department,store,store_code,region,date,sales
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0


In [17]:
# Drop the descriptive columns that are not needed in the long-format table.
# Rebinding (instead of inplace=True) together with errors="ignore" lets this cell
# be re-run safely: an in-place drop raises KeyError once the columns are already gone.
df_sales_melted = df_sales_melted.drop(
    columns=["item", "category", "department", "store_code", "region"],
    errors="ignore",
)

In [20]:
# Print the shape and preview one row to check which columns remain after the drop.
print(df_sales_melted.shape)
df_sales_melted.head(1)


(58327370, 4)


Unnamed: 0,id,store,date,sales
0,ACCESORIES_1_001_NYC_1,Greenwich_Village,2011-01-29 00:00:00,0


In [21]:
# Convert the melted date labels to a datetime column.
# NOTE(review): after the melt these values look like Timestamp objects rather than strings,
# so the explicit `format` may be redundant - confirm before changing.
df_sales_melted["date"] = pd.to_datetime(df_sales_melted["date"], format='%Y-%m-%d')

In [21]:
# Optional export of the cleaned long-format sales table (left disabled); uncomment both lines to write the CSV.
# NOTE(review): to_csv writes the index as an extra column by default; pass index=False if that is not wanted.
# file_name = "sales_clean.csv"
# df_sales_melted.to_csv(file_name)