# df_sales EDA

### loading libraries

In [47]:
import pandas as pd
import numpy as np
import datetime

### loading dataframes

In [48]:
# Read the raw wide-format sales table and the already-cleaned calendar.
df_sales, df_calendar = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/initial_data/item_sales.csv",
        "../data/clean_data/cleaned_calendar.csv",
    )
)

In [49]:
# Display the wide sales frame as loaded: identifier columns followed by d_1 ... d_1913.
df_sales

Unnamed: 0,id,item,category,department,store,store_code,region,d_1,d_2,d_3,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,SUPERMARKET_3_823_PHI_3,SUPERMARKET_3_823,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,0,0,2,...,2,0,0,0,0,0,1,0,0,1
30486,SUPERMARKET_3_824_PHI_3,SUPERMARKET_3_824,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,SUPERMARKET_3_825_PHI_3,SUPERMARKET_3_825,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,0,6,0,...,2,1,0,2,0,1,0,0,1,0
30488,SUPERMARKET_3_826_PHI_3,SUPERMARKET_3_826,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,0,0,0,...,0,0,1,0,0,1,0,3,1,3


In [50]:
# Peek at the first calendar row to see its columns (date, weekday, event).
df_calendar.head(1)

Unnamed: 0,date,weekday,event
0,2011-01-29,Saturday,0


### Check duplicates and nulls

In [51]:
# Missing values per column of the sales table.
df_sales.isna().sum()

id            0
item          0
category      0
department    0
store         0
             ..
d_1909        0
d_1910        0
d_1911        0
d_1912        0
d_1913        0
Length: 1920, dtype: int64

In [52]:
# Number of fully duplicated rows in the sales table.
df_sales.duplicated().sum()

np.int64(0)

In [53]:
# Number of fully duplicated rows in the calendar table.
df_calendar.duplicated().sum()

np.int64(0)

In [54]:
# Missing values per column of the calendar table.
df_calendar.isna().sum()

date       0
weekday    0
event      0
dtype: int64

### Convert the calendar `date` column to datetime

In [55]:
# Parse the ISO-formatted date strings into real datetimes.
df_calendar = df_calendar.assign(
    date=pd.to_datetime(df_calendar["date"], format="%Y-%m-%d")
)

### Find the first and last calendar dates, used to map the d_1, d_2, d_3, … columns to dates

In [56]:
# First and last day covered by the calendar.
start_date, end_date = df_calendar["date"].min(), df_calendar["date"].max()

### Rename the d_1, d_2, d_3, … columns to their calendar dates

In [57]:
# One timestamp per calendar day, in order: the i-th date labels the i-th day column.
date_range = list(pd.date_range(start=start_date, end=end_date, freq="D"))
# The leading seven columns are identifiers and keep their names.
columns_new = list(df_sales.columns[:7])
columns_for_rename = [*columns_new, *date_range]

# Positional relabelling: pandas raises if the number of labels differs from the
# number of columns.
df_sales.columns = columns_for_rename

In [58]:
# Confirm that the day columns are now labelled with timestamps.
df_sales.head(1)

Unnamed: 0,id,item,category,department,store,store_code,region,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,...,2016-04-15 00:00:00,2016-04-16 00:00:00,2016-04-17 00:00:00,2016-04-18 00:00:00,2016-04-19 00:00:00,2016-04-20 00:00:00,2016-04-21 00:00:00,2016-04-22 00:00:00,2016-04-23 00:00:00,2016-04-24 00:00:00
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,1,3,0,1,1,1,3,0,1,1


### Keep the first 7 columns, extract the date columns, and melt the date columns into a single column

In [59]:

# Identifier columns are kept as-is; every remaining (date-labelled) column is
# unpivoted into a ("date", "sales") pair.
first_columns = df_sales.iloc[:, :7]
date_columns = list(df_sales.columns[7:])

df_sales_melted = pd.melt(
    df_sales,
    id_vars=list(first_columns.columns),
    value_vars=date_columns,
    var_name="date",
    value_name="sales",
)

In [60]:
# Shape of the wide frame (rows, columns) for comparison with the melted frame.
df_sales.shape


(30490, 1920)

In [61]:
# Inspect the long-format result: identifier columns plus `date` and `sales`.
df_sales_melted

Unnamed: 0,id,item,category,department,store,store_code,region,date,sales
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01-29 00:00:00,0
...,...,...,...,...,...,...,...,...,...
58327365,SUPERMARKET_3_823_PHI_3,SUPERMARKET_3_823,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,2016-04-24 00:00:00,1
58327366,SUPERMARKET_3_824_PHI_3,SUPERMARKET_3_824,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,2016-04-24 00:00:00,0
58327367,SUPERMARKET_3_825_PHI_3,SUPERMARKET_3_825,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,2016-04-24 00:00:00,0
58327368,SUPERMARKET_3_826_PHI_3,SUPERMARKET_3_826,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,2016-04-24 00:00:00,3


### Create a CSV file of the daily time series for later use

In [62]:
# Work on an independent copy so the melted frame stays untouched for the later resampling.
df_prediction = df_sales_melted.copy()

In [63]:
# Keep only the series key (`id`), the date and the target; drop the descriptive columns.
df_prediction = df_prediction.drop(
    columns=["item", "category", "department", "store", "store_code", "region"]
)

In [64]:
# Move the current row index into a regular column named "index".
# NOTE(review): this column is a plain data column, so it is also written to the
# exported CSV even with index=False — confirm downstream code expects it.
df_prediction = df_prediction.reset_index()


In [65]:
# Inspect the frame that will be exported: index, id, date, sales.
df_prediction

Unnamed: 0,index,id,date,sales
0,0,ACCESORIES_1_001_NYC_1,2011-01-29 00:00:00,0
1,1,ACCESORIES_1_002_NYC_1,2011-01-29 00:00:00,0
2,2,ACCESORIES_1_003_NYC_1,2011-01-29 00:00:00,0
3,3,ACCESORIES_1_004_NYC_1,2011-01-29 00:00:00,0
4,4,ACCESORIES_1_005_NYC_1,2011-01-29 00:00:00,0
...,...,...,...,...
58327365,58327365,SUPERMARKET_3_823_PHI_3,2016-04-24 00:00:00,1
58327366,58327366,SUPERMARKET_3_824_PHI_3,2016-04-24 00:00:00,0
58327367,58327367,SUPERMARKET_3_825_PHI_3,2016-04-24 00:00:00,0
58327368,58327368,SUPERMARKET_3_826_PHI_3,2016-04-24 00:00:00,3


In [66]:
# Ensure `date` has a real datetime64 dtype before exporting.
# No explicit format is passed: the previous "%y-%m-%d" used the two-digit-year
# code, which does not match four-digit dates such as 2011-01-29 and would raise
# on string input. The column holds Timestamp objects (or ISO strings), which
# pandas converts without a format.
df_prediction["date"] = pd.to_datetime(df_prediction["date"])

In [81]:
# Write the daily per-id series to the working directory; the row index is not written.
file_name = "sales_prediction.csv"
df_prediction.to_csv(path_or_buf=file_name, index=False)

### resample by months

In [68]:
# Resampling needs a datetime index, so move `date` from a column to the index.
df_sales_melted = df_sales_melted.set_index("date")

  return Index(sequences[0], name=names)


In [69]:
# Monthly sales totals per (id, store); `df_sales_melted` must be indexed by date here.
# "ME" (month end) replaces the "M" alias, which is deprecated since pandas 2.2 and
# emits a FutureWarning; the resulting month-end labels are unchanged.
# Note: `df_sales` is rebound from the wide frame to this monthly long frame.
df_sales = (
    df_sales_melted
    .groupby(["id", "store"])["sales"]
    .resample("ME")
    .sum()
    .reset_index()
)


  df_sales = df_sales_melted.groupby(["id", "store"])["sales"].resample("M").sum().reset_index()


In [70]:
# Inspect the monthly totals: one row per id, store and month-end date.
df_sales

Unnamed: 0,id,store,date,sales
0,ACCESORIES_1_001_BOS_1,South_End,2011-01-31,0
1,ACCESORIES_1_001_BOS_1,South_End,2011-02-28,0
2,ACCESORIES_1_001_BOS_1,South_End,2011-03-31,0
3,ACCESORIES_1_001_BOS_1,South_End,2011-04-30,0
4,ACCESORIES_1_001_BOS_1,South_End,2011-05-31,0
...,...,...,...,...
1951355,SUPERMARKET_3_827_PHI_3,Queen_Village,2015-12-31,41
1951356,SUPERMARKET_3_827_PHI_3,Queen_Village,2016-01-31,49
1951357,SUPERMARKET_3_827_PHI_3,Queen_Village,2016-02-29,46
1951358,SUPERMARKET_3_827_PHI_3,Queen_Village,2016-03-31,68


In [82]:
# Write the monthly totals to the working directory; the row index is not written.
file_name = "cleaned_sales.csv"
df_sales.to_csv(path_or_buf=file_name, index=False)

In [83]:
# Show the monthly frame again; it is the input to the clustering section below.
df_sales

Unnamed: 0,id,store,date,sales
0,ACCESORIES_1_001_BOS_1,South_End,2011-01-31,0
1,ACCESORIES_1_001_BOS_1,South_End,2011-02-28,0
2,ACCESORIES_1_001_BOS_1,South_End,2011-03-31,0
3,ACCESORIES_1_001_BOS_1,South_End,2011-04-30,0
4,ACCESORIES_1_001_BOS_1,South_End,2011-05-31,0
...,...,...,...,...
1951355,SUPERMARKET_3_827_PHI_3,Queen_Village,2015-12-31,41
1951356,SUPERMARKET_3_827_PHI_3,Queen_Village,2016-01-31,49
1951357,SUPERMARKET_3_827_PHI_3,Queen_Village,2016-02-29,46
1951358,SUPERMARKET_3_827_PHI_3,Queen_Village,2016-03-31,68


## Clustering 

In [84]:
# Total sales over the whole period for each item/store id.
df_sales_cluster = df_sales.groupby("id", as_index=False)["sales"].sum()

In [85]:
# Derive the item code by removing the trailing "<CITY>_<n>" store code from the id,
# e.g. "ACCESORIES_1_001_BOS_1" -> "ACCESORIES_1_001".
# Splitting on the last two underscores is vectorised and, unlike slicing with
# [:-6], does not assume the store suffix is exactly six characters long.
df_sales_cluster["item"] = df_sales_cluster["id"].str.rsplit("_", n=2).str[0]

In [86]:
# Check that the derived `item` values match the prefix of each id.
df_sales_cluster

Unnamed: 0,id,sales,item
0,ACCESORIES_1_001_BOS_1,260,ACCESORIES_1_001
1,ACCESORIES_1_001_BOS_2,402,ACCESORIES_1_001
2,ACCESORIES_1_001_BOS_3,385,ACCESORIES_1_001
3,ACCESORIES_1_001_NYC_1,600,ACCESORIES_1_001
4,ACCESORIES_1_001_NYC_2,542,ACCESORIES_1_001
...,...,...,...
30485,SUPERMARKET_3_827_NYC_3,2572,SUPERMARKET_3_827
30486,SUPERMARKET_3_827_NYC_4,93,SUPERMARKET_3_827
30487,SUPERMARKET_3_827_PHI_1,2732,SUPERMARKET_3_827
30488,SUPERMARKET_3_827_PHI_2,867,SUPERMARKET_3_827


In [87]:
# Desired column order for the clustering frame.
columns = ["id","item", "sales"]

In [88]:
# Reorder the columns to the order given in `columns`.
df_sales_cluster = df_sales_cluster.loc[:, columns]

In [89]:
# Collapse the per-store rows into one total per item.
df_sales_cluster = df_sales_cluster.groupby("item", as_index=False)["sales"].sum()

In [90]:
# Peek at the first item total to confirm the (item, sales) layout.
df_sales_cluster.head(1)

Unnamed: 0,item,sales
0,ACCESORIES_1_001,4093


In [91]:
# Write the per-item totals to the working directory; the row index is not written.
file_name = "df_sales_cluster.csv"
df_sales_cluster.to_csv(path_or_buf=file_name, index=False)