# create df_prediction

### load libraries

In [2]:
import pandas as pd
import numpy as np
import datetime

### load csv files

In [6]:
df_sales = pd.read_csv("./EDA/data/sales_prediction.csv")
df_prices = pd.read_csv("./EDA/data/prices_prediction.csv")
df_calendar_with_events = pd.read_csv("./data/daily_calendar_with_events.csv")

In [10]:

del(df_sales["Unnamed: 0"])
del(df_prices["Unnamed: 0"])

### fullfill df_calendar nulls by 0 and replace event with 1

In [14]:
df_calendar_with_events["event"].fillna(0, inplace=True)

In [15]:
df_calendar_with_events["event"] = df_calendar_with_events["event"].apply(lambda x: 1 if x != 0 else 0)

In [16]:
df_calendar_with_events["date"]= pd.to_datetime(df_calendar_with_events['date'], format='%Y-%m-%d') # change type to datetime

In [37]:
df_sales

Unnamed: 0,id,date,sales
0,ACCESORIES_1_001_NYC_1,2011-01-29,0
1,ACCESORIES_1_002_NYC_1,2011-01-29,0
2,ACCESORIES_1_003_NYC_1,2011-01-29,0
3,ACCESORIES_1_004_NYC_1,2011-01-29,0
4,ACCESORIES_1_005_NYC_1,2011-01-29,0
...,...,...,...
58327365,SUPERMARKET_3_823_PHI_3,2016-04-24,1
58327366,SUPERMARKET_3_824_PHI_3,2016-04-24,0
58327367,SUPERMARKET_3_825_PHI_3,2016-04-24,0
58327368,SUPERMARKET_3_826_PHI_3,2016-04-24,3


### create new columns so to merge by it

In [None]:
df_prices["id"] = df_prices["item"] + "_" + df_prices["store_code"]

### drop unnecessary files

In [21]:
df_prices.drop(["item", "store_code"], axis=1, inplace=True)

In [50]:
del(df_sales["Unnamed: 0"])
del(df_prices["Unnamed: 0"])


### sort columns so to make df_prieces easier to read

In [26]:
columns=["id","date","sell_price"]

In [27]:
df_prices = df_prices[columns]

In [35]:
df_prices

Unnamed: 0,id,date,sell_price
0,ACCESORIES_1_001_NYC_1,2013-07-16,12.7414
1,ACCESORIES_1_001_NYC_1,2013-07-17,12.7414
2,ACCESORIES_1_001_NYC_1,2013-07-18,12.7414
3,ACCESORIES_1_001_NYC_1,2013-07-19,12.7414
4,ACCESORIES_1_001_NYC_1,2013-07-20,12.7414
...,...,...,...
45986371,SUPERMARKET_3_827_PHI_3,2016-04-20,1.2000
45986372,SUPERMARKET_3_827_PHI_3,2016-04-21,1.2000
45986373,SUPERMARKET_3_827_PHI_3,2016-04-22,1.2000
45986374,SUPERMARKET_3_827_PHI_3,2016-04-23,1.2000


### ensure all df are in datetime format

In [29]:
df_sales["date"] = pd.to_datetime(df_sales["date"])
df_calendar_with_events["date"] = pd.to_datetime(df_calendar_with_events["date"])
df_prices["date"] = pd.to_datetime(df_prices["date"])

### merge all df_prices df_sales df_calendar

In [38]:
df_merge = df_sales.merge(df_prices, on=['id', 'date'], how='left') 

In [None]:
df_merge.dropna(subset=["sell_price"], inplace=True)

In [42]:
df_merge = df_merge.merge(df_calendar_with_events, on=['date'], how='left')

In [44]:
df_merge.drop(["weekday_int","d"], axis=1, inplace=True)

In [45]:
df_merge

Unnamed: 0,id,date,sales,sell_price,weekday,event
0,ACCESORIES_1_008_NYC_1,2011-01-29,12,0.6118,Saturday,0
1,ACCESORIES_1_009_NYC_1,2011-01-29,2,2.0748,Saturday,0
2,ACCESORIES_1_010_NYC_1,2011-01-29,0,4.2161,Saturday,0
3,ACCESORIES_1_012_NYC_1,2011-01-29,0,7.9534,Saturday,0
4,ACCESORIES_1_015_NYC_1,2011-01-29,4,0.9310,Saturday,0
...,...,...,...,...,...,...
45986371,SUPERMARKET_3_823_PHI_3,2016-04-24,1,3.5760,Sunday,0
45986372,SUPERMARKET_3_824_PHI_3,2016-04-24,0,2.9760,Sunday,0
45986373,SUPERMARKET_3_825_PHI_3,2016-04-24,0,4.7760,Sunday,0
45986374,SUPERMARKET_3_826_PHI_3,2016-04-24,3,1.5360,Sunday,0


### Top 100 of items that generate best glonal income

In [46]:
df_merge.set_index("date",inplace=True)

In [47]:
df_merge["income"] = df_merge["sales"] * df_merge["sell_price"]

In [48]:
df_merge["item"]= df_merge["id"].apply(lambda x: x[:-6])

In [49]:
top100= df_merge.groupby( ["item"])["income"].sum().sort_values(ascending=False).head(100).reset_index()

In [50]:
top100

Unnamed: 0,item,income
0,SUPERMARKET_3_586,1.754179e+06
1,SUPERMARKET_3_120,1.690645e+06
2,SUPERMARKET_3_090,1.623844e+06
3,SUPERMARKET_3_202,1.501405e+06
4,SUPERMARKET_3_587,1.171340e+06
...,...,...
95,SUPERMARKET_3_389,3.248685e+05
96,SUPERMARKET_3_541,3.231147e+05
97,SUPERMARKET_3_681,3.180257e+05
98,HOME_&_GARDEN_1_497,3.177413e+05


### save them in to a list

In [51]:
Lista_top100=list(top100["item"])

### Filter df with top100 list.

In [52]:
df_top100 = df_merge[df_merge['item'].isin(Lista_top100)]

In [54]:
df_top100.drop(["id"],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top100.drop(["id"],axis=1,inplace=True)


In [56]:
columns = ["weekday","item","sales","sell_price","event","income"]

In [57]:
df_top100 = df_top100[columns]

In [58]:
file_name = "Top100_prediction.csv"
df_top100.to_csv(file_name)

In [59]:
file_name = "df_market_by_days.csv"
df_merge.to_csv(file_name)