In [1]:
import numpy as np 
import pandas as pd

from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd


In [2]:
t = pd.read_csv('../../data/sell_prices.csv')

In [3]:
#type pour chaque dataframe

CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }

In [4]:
#parametres de base pour le controle des jours à traiter

h = 28 
max_lags = 57 # nombre de jours max qu'on va regarder dans le passé
tr_last = 1913 # dernier jour à prédire dans la compet
fday = datetime(2016,4, 25) 
fday

datetime.datetime(2016, 4, 25, 0, 0)

In [5]:
def create_df(is_train = True, nrows = None, first_day = 1200):
    prices = pd.read_csv("../../data/sell_prices.csv", dtype = PRICE_DTYPES)
    
    # one hot encoding pour toutes les variables catégorielles
    # ex : sunday = 0 etc...
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()
            
    cal = pd.read_csv("../../data/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()
    
    
    # traiter les données depuis 5 ans en arrière serait trop long 
    # on décide donc de choisir un jour à partir duquel on va commencer à traiter les données
    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    
    # on fait une liste des colonnes (elles ont des noms de type d_1300 on les selectionnes grace à ca)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    
    dtype = {numcol:"float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    
    df = pd.read_csv("../../data/sales_train_validation.csv", 
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    
    # one hot encoding sur la df de train
    for col in catcols:
        if col != "id":
            df[col] = df[col].cat.codes.astype("int16")
            df[col] -= df[col].min()
    
    # passage d'un format wide à un format long avec les jours et le nombre de sales pour la conversion
    df = pd.melt(df,
                  id_vars = catcols,
                  value_vars = [col for col in df.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")
    
    # merge de toutes les df
    df = df.merge(cal, on= "d", copy = False)
    df = df.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    
    return df

In [6]:
def create_features(df):
    # on défini nos maximums de jours pour lesquels on va regarder
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        # on décale du nombre de jours dans le temps nos colonnes
        df[lag_col] = df[["id","sales"]].groupby("id")["sales"].shift(lag)

    # on fait la même chose mais pour les means des ventes sur chacunes des période qu'on a choisi
    means = [7, 28]
    for mean in means :
        for lag,lag_col in zip(lags, lag_cols):
            df[f"rmean_{lag}_{mean}"] = df[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(mean).mean())

    
    # traitement des dates
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }
    
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in df.columns:
            df[date_feat_name] = df[date_feat_name].astype("int16")
        else:
            df[date_feat_name] = getattr(df["date"].dt, date_feat_func).astype("int16")

In [7]:
%%time
FIRST_DAY = 350 # si 0 souvent crash à cause de la mémoire

df = create_df(is_train=True, first_day= FIRST_DAY)
df.shape

CPU times: user 33.8 s, sys: 7.11 s, total: 40.9 s
Wall time: 37.8 s


(40718219, 22)

In [8]:
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,...,1,2012,0,0,0,0,0.0,1.0,0.0,3.97
1,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,...,1,2012,0,0,0,0,0.0,1.0,0.0,4.34
2,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,...,1,2012,0,0,0,0,0.0,1.0,0.0,2.48
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,...,1,2012,0,0,0,0,0.0,1.0,0.0,0.5
4,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,...,1,2012,0,0,0,0,0.0,1.0,0.0,1.77


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40718219 entries, 0 to 40718218
Data columns (total 22 columns):
id              object
item_id         int16
dept_id         int16
store_id        int16
cat_id          int16
state_id        int16
d               object
sales           float32
date            datetime64[ns]
wm_yr_wk        int16
weekday         int16
wday            int16
month           int16
year            int16
event_name_1    int16
event_type_1    int16
event_name_2    int16
event_type_2    int16
snap_CA         float32
snap_TX         float32
snap_WI         float32
sell_price      float32
dtypes: datetime64[ns](1), float32(5), int16(14), object(2)
memory usage: 3.0+ GB


In [10]:
%%time

create_features(df)
df.shape

CPU times: user 2min 54s, sys: 22.1 s, total: 3min 16s
Wall time: 2min 56s


(40718219, 31)

In [11]:
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,...,3.97,,,,,,,2,1,13
1,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,...,4.34,,,,,,,2,1,13
2,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,...,2.48,,,,,,,2,1,13
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,...,0.5,,,,,,,2,1,13
4,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,...,1.77,,,,,,,2,1,13


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40718219 entries, 0 to 40718218
Data columns (total 31 columns):
id              object
item_id         int16
dept_id         int16
store_id        int16
cat_id          int16
state_id        int16
d               object
sales           float32
date            datetime64[ns]
wm_yr_wk        int16
weekday         int16
wday            int16
month           int16
year            int16
event_name_1    int16
event_type_1    int16
event_name_2    int16
event_type_2    int16
snap_CA         float32
snap_TX         float32
snap_WI         float32
sell_price      float32
lag_7           float32
lag_28          float32
rmean_7_7       float32
rmean_28_7      float32
rmean_7_28      float32
rmean_28_28     float32
week            int16
quarter         int16
mday            int16
dtypes: datetime64[ns](1), float32(11), int16(17), object(2)
memory usage: 4.2+ GB


In [13]:
df.dropna(inplace = True)
df.shape

(39041269, 31)

In [14]:
from fbprophet import Prophet
from tqdm import tqdm, tnrange
from multiprocessing import Pool, cpu_count

In [15]:
calendar_df = pd.read_csv('../../data/calendar.csv')
sales_train =  pd.read_csv('../../data/sales_train_validation.csv')
sell_prices = pd.read_csv('../../data/sell_prices.csv')
submission = pd.read_csv('../../data/sample_submission.csv')

In [16]:
def run_prophet(timeserie):
    model = Prophet(uncertainty_samples=False)
    model.fit(timeserie)
    future = model.make_future_dataframe(periods=28, include_history=False)
    forecast = model.predict(future)
    return forecast

In [53]:
sales_train.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [45]:
temp_series = sales_train.iloc[1,1200:]
temp_series.index = calendar_df['date'][1200:1200+len(temp_series)]
temp_series =  pd.DataFrame(temp_series)
temp_series

Unnamed: 0_level_0,1
date,Unnamed: 1_level_1
2014-05-13,0
2014-05-14,0
2014-05-15,1
2014-05-16,0
2014-05-17,0
...,...
2016-04-26,1
2016-04-27,0
2016-04-28,0
2016-04-29,0


In [24]:
start_from_ob = 1200
for i in tnrange(sales_train.shape[0]):
    temp_series = sales_train.iloc[i,start_from_ob:]
    temp_series.index = calendar_df['date'][start_from_ob:start_from_ob+len(temp_series)]
    temp_series =  pd.DataFrame(temp_series)
    temp_series = temp_series.reset_index()
    temp_series.columns = ['ds', 'y']

    #with Pool(cpu_count()) as p:
     #   forecast1 = p.map(run_prophet, [temp_series])

#    submission.iloc[i,1:] = forecast1[0]['yhat'].values

#    submission.iloc[:,1:]=submission.iloc[:,1:].where(submission.iloc[:,1:] > 0).fillna(0)

HBox(children=(IntProgress(value=0, max=30490), HTML(value='')))

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seaso

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
Process ForkPoolWorker-366:
Process ForkPoolWorker-361:
Process ForkPoolWorker-363:
Process ForkPoolWorker-364:
Process ForkPoolWorker-365:
Process ForkPoolWorker-367:
Process ForkPoolWorker-362:
Process ForkPoolWorker-368:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most re

KeyboardInterrupt: 