In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
# Load the raw inputs: the wide sales table (one d_N column per day) and the
# daily calendar that carries the d_N labels, dates and events.
df_sales = pd.read_csv(filepath_or_buffer="./data/item_sales.csv")
df_calendar = pd.read_csv(filepath_or_buffer="./data/daily_calendar_with_events.csv")

In [3]:
# Peek at the first calendar row to see the available columns
df_calendar.head(n=1)

Unnamed: 0,date,weekday,weekday_int,d,event
0,2011-01-29,Saturday,1,d_1,


### Checking duplicated rows and nulls

In [4]:
# Count missing values per column of the sales table
df_sales.isna().sum()

id            0
item          0
category      0
department    0
store         0
             ..
d_1909        0
d_1910        0
d_1911        0
d_1912        0
d_1913        0
Length: 1920, dtype: int64

In [5]:
# Count missing values per column of the calendar
df_calendar.isna().sum()

date              0
weekday           0
weekday_int       0
d                 0
event          1887
dtype: int64

In [6]:
# Remove fully duplicated rows; reassign rather than mutate in place
df_sales = df_sales.drop_duplicates()

In [7]:
# Remove fully duplicated rows; reassign rather than mutate in place
df_calendar = df_calendar.drop_duplicates()

### Changing date format with datetime

In [8]:
# Parse the ISO-formatted date strings into proper datetime values
df_calendar = df_calendar.assign(
    date=pd.to_datetime(df_calendar['date'], format='%Y-%m-%d')
)

### Mapping the day columns (d_1, d_2, d_3, ...) to calendar dates

In [9]:
# First and last day covered by the calendar
calendar_dates = df_calendar['date']
start_date, end_date = calendar_dates.min(), calendar_dates.max()

In [10]:
# Days without an event are stored as NaN; label them explicitly
df_calendar = df_calendar.fillna({'event': 'no event'})

In [11]:
# Translate the day labels ("d_1", "d_2", ...) into calendar dates through the
# calendar's own `d` -> `date` mapping. Renaming by label, instead of assigning
# a positional date range, keeps every column tied to the right date even if
# the calendar covers more days than the sales table or is ordered differently.
day_to_date = dict(zip(df_calendar['d'], df_calendar['date']))

# The first 7 columns are identifiers. Labels that are already dates (cell
# re-run) are skipped so the validation below stays idempotent.
day_columns = [col for col in df_sales.columns[7:] if not isinstance(col, pd.Timestamp)]
unmapped = [col for col in day_columns if col not in day_to_date]
if unmapped:
    # Fail loudly rather than leave some days unlabeled or mislabeled
    raise KeyError(f"Sales columns without a calendar date: {unmapped[:5]}")

df_sales = df_sales.rename(columns=day_to_date)

In [12]:
# Check that the day columns now carry dates as labels
df_sales.head(n=1)

Unnamed: 0,id,item,category,department,store,store_code,region,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,...,2016-04-15 00:00:00,2016-04-16 00:00:00,2016-04-17 00:00:00,2016-04-18 00:00:00,2016-04-19 00:00:00,2016-04-20 00:00:00,2016-04-21 00:00:00,2016-04-22 00:00:00,2016-04-23 00:00:00,2016-04-24 00:00:00
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,0,0,...,1,3,0,1,1,1,3,0,1,1


### Keeping the first 7 columns, extracting the date columns and melting them into a single column

In [13]:

# The first 7 columns identify the item/store; every other column is one day
first_columns = df_sales.iloc[:, :7]
date_columns = list(df_sales.columns[7:])

# Wide -> long: one row per (item/store, date) with the units sold in `sales`
df_sales_melted = pd.melt(
    df_sales,
    id_vars=list(first_columns.columns),
    value_vars=date_columns,
    var_name='date',
    value_name='sales',
)

### Convert dates into months

In [14]:
# Keep only the "YYYY-MM" prefix of each date label so rows can be grouped by month
date_labels = df_sales_melted["date"].astype(str)
df_sales_melted["date"] = date_labels.str[0:7]

In [15]:
# From here on `df_sales` refers to an independent copy of the long-format table
df_sales = df_sales_melted.copy(deep=True)

In [16]:
# Display the long-format sales table (one row per item/store and month label)
df_sales

Unnamed: 0,id,item,category,department,store,store_code,region,date,sales
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01,0
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01,0
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01,0
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01,0
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,2011-01,0
...,...,...,...,...,...,...,...,...,...
58327365,SUPERMARKET_3_823_PHI_3,SUPERMARKET_3_823,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,2016-04,1
58327366,SUPERMARKET_3_824_PHI_3,SUPERMARKET_3_824,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,2016-04,0
58327367,SUPERMARKET_3_825_PHI_3,SUPERMARKET_3_825,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,2016-04,0
58327368,SUPERMARKET_3_826_PHI_3,SUPERMARKET_3_826,SUPERMARKET,SUPERMARKET_3,Queen_Village,PHI_3,Philadelphia,2016-04,3


### Grouping by id and date, summing sales

In [17]:
# Total units sold per item/store id and month; the result is a Series
# indexed by (id, date)
gb_sales = (
    df_sales
    .groupby(by=["id", "date"])["sales"]
    .sum()
)

### Convert to a DataFrame

In [18]:
# Wrap the grouped Series in a one-column DataFrame, keeping its name as the column label
gb_sales = gb_sales.to_frame(name=gb_sales.name)

In [19]:
# Move the (id, date) index levels back into regular columns
gb_sales = gb_sales.reset_index(drop=False)

In [20]:
# Display the monthly totals per id
gb_sales

Unnamed: 0,id,date,sales
0,ACCESORIES_1_001_BOS_1,2011-01,0
1,ACCESORIES_1_001_BOS_1,2011-02,0
2,ACCESORIES_1_001_BOS_1,2011-03,0
3,ACCESORIES_1_001_BOS_1,2011-04,0
4,ACCESORIES_1_001_BOS_1,2011-05,0
...,...,...,...
1951355,SUPERMARKET_3_827_PHI_3,2015-12,41
1951356,SUPERMARKET_3_827_PHI_3,2016-01,49
1951357,SUPERMARKET_3_827_PHI_3,2016-02,46
1951358,SUPERMARKET_3_827_PHI_3,2016-03,68


In [21]:
# Persist the monthly totals per id. The frame's index is just the row counter
# produced by reset_index(), so it is left out of the file; writing it would
# add an unnamed first column that shows up as "Unnamed: 0" when the CSV is
# read back. NOTE(review): any reader that loads this file with index_col=0
# must drop that argument.
file_name = "reduced_sales.csv"
gb_sales.to_csv(file_name, index=False)