# df_prices EDA

### loading libraries

In [1]:
import numpy as np
import pandas as pd
import datetime
from datetime import datetime, timedelta

### loading dataframes

In [2]:
df_prices = pd.read_csv('../data/item_prices.csv')
df_calendar = pd.read_csv('../data/daily_calendar_with_events.csv')

### exploring dataframes

In [3]:
df_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6965706 entries, 0 to 6965705
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   item        object 
 1   category    object 
 2   store_code  object 
 3   yearweek    float64
 4   sell_price  float64
dtypes: float64(2), object(3)
memory usage: 265.7+ MB


In [4]:
df_prices.sample(10)

Unnamed: 0,item,category,store_code,yearweek,sell_price
3479516,ACCESORIES_1_037,ACCESORIES,BOS_2,201326.0,16.9841
3301611,SUPERMARKET_3_098,SUPERMARKET,BOS_1,201603.0,5.376
155194,HOME_&_GARDEN_1_102,HOME_&_GARDEN,NYC_1,201611.0,22.4625
5022830,HOME_&_GARDEN_1_013,HOME_&_GARDEN,PHI_1,201228.0,6.2125
6498403,HOME_&_GARDEN_1_498,HOME_&_GARDEN,PHI_3,201330.0,4.1
3566385,ACCESORIES_1_414,ACCESORIES,BOS_2,201418.0,9.6026
5665095,ACCESORIES_2_004,ACCESORIES,PHI_2,201416.0,3.2851
1653476,HOME_&_GARDEN_2_198,HOME_&_GARDEN,NYC_3,201129.0,9.3375
5001187,ACCESORIES_2_066,ACCESORIES,PHI_1,201331.0,6.6101
1644929,HOME_&_GARDEN_2_159,HOME_&_GARDEN,NYC_3,201420.0,4.9125


In [5]:
print(df_prices.shape)


(6965706, 5)


In [6]:
df_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6965706 entries, 0 to 6965705
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   item        object 
 1   category    object 
 2   store_code  object 
 3   yearweek    float64
 4   sell_price  float64
dtypes: float64(2), object(3)
memory usage: 265.7+ MB


## drop duplicated

In [7]:

df_prices.drop_duplicates(keep=False , inplace=True)

### Look for nulls and inputing them

In [8]:
df_prices.isnull().sum()

item            0
category        0
store_code      0
yearweek      425
sell_price      0
dtype: int64

In [9]:
df_prices["yearweek"].sort_values(ascending=True).unique()

array([201105., 201106., 201107., 201108., 201109., 201110., 201111.,
       201112., 201113., 201114., 201115., 201116., 201117., 201118.,
       201119., 201120., 201121., 201122., 201123., 201124., 201125.,
       201126., 201127., 201128., 201129., 201130., 201131., 201132.,
       201133., 201134., 201135., 201136., 201137., 201138., 201139.,
       201140., 201141., 201142., 201143., 201144., 201145., 201146.,
       201147., 201148., 201149., 201150., 201151., 201152., 201200.,
       201201., 201202., 201203., 201204., 201205., 201206., 201207.,
       201208., 201209., 201210., 201211., 201212., 201213., 201214.,
       201215., 201216., 201217., 201218., 201219., 201220., 201221.,
       201222., 201223., 201224., 201225., 201226., 201227., 201228.,
       201229., 201230., 201231., 201232., 201233., 201234., 201235.,
       201236., 201237., 201238., 201239., 201240., 201241., 201242.,
       201243., 201244., 201245., 201246., 201247., 201248., 201249.,
       201250., 2012

### delete nulls due to they are the weeks we have to preddict afterwards

In [10]:
df_prices.dropna(inplace=True)

### filter with groupby, for items so to use it afterwards for clustering

In [11]:
df_price_cluster = df_prices.groupby(['item']).agg(
    price_min = ('sell_price', 'min'),
    price_max = ('sell_price', 'max'),
    price_mean = ('sell_price', 'mean')
).reset_index()
print(df_price_cluster.shape)
df_price_cluster.head(5)

(3049, 4)


Unnamed: 0,item,price_min,price_max,price_mean
0,ACCESORIES_1_001,6.65,12.7414,11.032755
1,ACCESORIES_1_002,2.66,5.2801,5.275708
2,ACCESORIES_1_003,1.2502,7.9401,3.944125
3,ACCESORIES_1_004,5.0141,6.1712,5.983299
4,ACCESORIES_1_005,1.995,4.0964,3.832119


In [12]:
#file_name = "df_price_cluster.csv"
#df_price_cluster .to_csv(file_name)

### create a new list of date

In [13]:
start_date = df_calendar['date'].min()
end_date = df_calendar['date'].max()

### create a function so to convert dates into yearweeks so to merge afterwards 


In [14]:
def create_yearweek_df(start_date_str, end_date_str, initial_week):
    
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)

    
    date_range = pd.date_range(start_date, end_date)
    df = pd.DataFrame(date_range, columns=['date'])

    
    df['dias_desde_inicio'] = (df['date'] - start_date).dt.days

    
    df['week_number'] = ((df['dias_desde_inicio'] // 7) + initial_week).astype(int)

    
    df['year'] = df['date'].dt.year
    df['yearweek'] = df['year'].astype(str) + df['week_number'].apply(lambda x: f"{x:02d}")

    
    df.drop(['dias_desde_inicio', 'week_number', 'year'], axis=1, inplace=True)

    return df

   
df_2011 = create_yearweek_df('2011-01-29', '2011-12-30', 5)
df_2012_1 = create_yearweek_df('2011-12-31', '2012-01-01', 0)
df_2012 = create_yearweek_df('2012-01-02', '2012-12-30', 1)
df_2012_53 = pd.DataFrame([['2012-12-31','201253']],columns=['date','yearweek'])
df_2013 = create_yearweek_df('2013-01-01', '2013-12-31', 0)
df_2014 = create_yearweek_df('2014-01-01', '2014-12-31', 0)
df_2015 = create_yearweek_df('2015-01-01', '2015-12-31', 0)
df_2016_1 = create_yearweek_df('2016-01-01', '2016-01-02', 0)
df_2016 = create_yearweek_df('2016-01-03', '2016-04-24', 1)

df_2012_1.loc[0,'yearweek']='201152'

all_years_df = pd.concat([df_2011, df_2012_1, df_2012, df_2012_53, df_2013, df_2014, df_2015,df_2016_1, df_2016]).reset_index(drop=True)

In [15]:
all_years_df['date']= pd.to_datetime(all_years_df['date'], format='%Y-%m-%d')

In [16]:
all_years_df['yearweek'] = all_years_df['yearweek'].astype(str)

In [17]:
df_prices['yearweek'] = df_prices['yearweek'].astype(str).apply(lambda x: x[:-2])

In [18]:
df_merge = df_prices.merge(all_years_df, on=['yearweek'], how='left')

In [19]:
df_merge = df_merge.drop(["yearweek","category"], axis=1)

In [20]:
df_merge.shape

(45986376, 4)

### create a file for prediction so to use it afterwards

In [21]:
file_name = "prices_prediction.csv"
df_merge.to_csv(file_name)

### group by months 

In [40]:
df_merge.set_index('date', inplace=True) 

In [41]:
df_prices = df_merge.groupby(['item', 'store_code'])['sell_price'].resample('M').mean().reset_index()

  df_prices = df_merge.groupby(['item', 'store_code'])['sell_price'].resample('M').mean().reset_index()


In [43]:
df_prices

Unnamed: 0,item,store_code,date,sell_price
0,ACCESORIES_1_001,BOS_1,2013-07-31,11.753875
1,ACCESORIES_1_001,BOS_1,2013-08-31,10.985800
2,ACCESORIES_1_001,BOS_1,2013-09-30,10.985800
3,ACCESORIES_1_001,BOS_1,2013-10-31,10.985800
4,ACCESORIES_1_001,BOS_1,2013-11-30,10.985800
...,...,...,...,...
1536771,SUPERMARKET_3_827,PHI_3,2015-12-31,1.200000
1536772,SUPERMARKET_3_827,PHI_3,2016-01-31,1.200000
1536773,SUPERMARKET_3_827,PHI_3,2016-02-29,1.200000
1536774,SUPERMARKET_3_827,PHI_3,2016-03-31,1.200000


### csv file

In [45]:
#file_name = "cleaned_prices.csv"
#df_prices.to_csv(file_name)