In [42]:
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
import os
from itertools import product,combinations
from statsmodels.tsa.statespace.sarimax import SARIMAX
from datetime import timedelta,datetime, date
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
from sklearn.metrics import r2_score,make_scorer
from sklearn.model_selection import GridSearchCV

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# Regression forecast

### Если загрузка данных из csv

In [None]:
path=r"C:\Reports\FC"
store_detail=True
data = pd.read_csv(os.path.join(path,'total_sales_day.csv'),';', index_col=['day'], parse_dates=['day'], dayfirst=True)
data=data.ix[:,:3]
data=pd.read_excel(os.path.join(path,'total_sales.xlsx'))

### Если загрузка данных из excel

In [2]:
path=r"C:\Reports\FC"
store_detail=True
data = pd.read_csv(os.path.join(path,'total_sales_new.csv'),',', dayfirst=True,encoding='cp1251')
data.columns=['store','filial','day','sales']
data.head(3)

Unnamed: 0,store,filial,day,sales
0,1001,Филиал Средне-Волжск,01.01.2014,994718.85
1,1001,Филиал Средне-Волжск,02.01.2014,2082729.68
2,1001,Филиал Средне-Волжск,03.01.2014,2159405.64


In [3]:
data['day']=pd.to_datetime(data.day,dayfirst=True)
data.set_index(['day'],inplace=True)
data['sales']=data['sales'].apply(lambda x:np.float(str(x).replace(',','')))

### Флаг детализации по магазинам
если не требуется, запускаем ячейку ниже

In [None]:
data.reset_index(inplace=True)
data=data.pivot_table(index=['day'],aggfunc=np.sum)
data['store']=1
data['filial']=1

In [6]:
%%time
moving_week_window=3
moving_month_window=3

min_date=min(data.index)
def extract_date(df):
    df.is_a_copy=False
    df['day_of_week']=df.index.dayofweek
    df['month']=df.index.month
    df['total_days']=(df.index-min_date).days
    df['num_day']=df.index.day
    df['month_part']=df.num_day.apply(lambda x:x//8 )
    df['week_num']=df.index.week

    for i in range(1,moving_week_window+3):
        df[str(i)+'_week_before']=df.index-timedelta(days=7*i)
    df['date_withot_day']=[df.index[i]-timedelta(days=int(df.num_day.values[i]-1))  for i in range(df.shape[0])]
    for i in range(1,moving_month_window+3):
        df[str(i)+'_month_before']=df['date_withot_day'].apply(lambda x:x-relativedelta(months=+1*i))
    return df
def find_in_agg_df(df,pattern):
    try:
        r=df.ix[tuple(pattern)]['sales']
        #print(r)
    except:r=np.nan
    try:    
        r=r.values[0]
    except:pass
    return r
def calc_prev_month(df):
    agg_month=df[['date_withot_day','sales','store']].pivot_table(index=['date_withot_day','store'],aggfunc=np.sum)
    for i in range(1,moving_month_window+3):
        column_name=str(i)+'_month_before'
        df[column_name]=df[[column_name,'store']].apply(lambda row:find_in_agg_df(agg_month,row),axis=1)
    return df
def create_moving_week(df):
    df.reset_index(inplace=True)
    start_date=min_date+timedelta(days=6)
    res=df[df['day']>start_date][['day','store','1_week_before']].reset_index(drop=True)

    
    res['sales']=res.apply(lambda row:np.sum(df[(df['store']==row['store'])&
                                         (df['day']>=row['1_week_before'])&
                                         (df['day']<row['day'])]['sales']),axis=1)
    
    res.drop('1_week_before',axis=1,inplace=True)
 
    res.sort_values(by=['day','store'],inplace=True)

    
    res.set_index(['day', 'store'],inplace=True)
    return res

def apply_moving_week_month(df):
    df.is_a_copy=False
    agg_week_df=create_moving_week(df)

    for i in range(1,moving_week_window+3):
        column_name=str(i)+'_week_before'
        df[column_name]=df[[column_name,'store']].apply(lambda row:find_in_agg_df(agg_week_df,row),axis=1)
        print(column_name)
   
    df.dropna(inplace=True)
    # print(df.shape)
    df.reset_index(inplace=True,drop=True)
    #return agg_week_df,df
    for i in range(1,moving_week_window+1):
    
        column_names=[str(x)+'_week_before' for x in range(i,i+3) ]
        column_name_new=str(i)+'avg_week'
        df[column_name_new]=df[column_names].apply(np.mean,axis=1)
    for i in range(1,moving_month_window+1):
        column_names=[str(x)+'_month_before' for x in range(i,i+3) ]
        column_name_new=str(i)+'avg_month'
        df[column_name_new]=df[column_names].apply(np.mean,axis=1)

    columns=[str(x)+'_week_before' for x in range(1,moving_week_window+3)]+[
        str(x)+'_month_before' for x in range(1,moving_month_window+3)]
    df.drop(columns,axis=1,inplace=True)
    return df
def generate_features(df,columns,degree,suffix):
    polynom_arr=PolynomialFeatures(degree).fit_transform(df[columns])
    polynom_df=pd.DataFrame(polynom_arr,columns=[str(el )+suffix for el in range(polynom_arr.shape[1])])
    
    for comb in combinations(range(len(columns)),2):
    
        polynom_df[suffix+columns[comb[0]]+'+'+columns[comb[1]]]=df[columns[comb[0]]]+df[columns[comb[1]]]

        polynom_df[suffix+columns[comb[0]]+'-'+columns[comb[1]]]=df[columns[comb[0]]]-df[columns[comb[1]]]
        polynom_df[suffix+columns[comb[0]]+'/'+columns[comb[1]]]=df[columns[comb[0]]]/df[columns[comb[1]]]
        polynom_df[suffix+columns[comb[0]]+'*'+columns[comb[1]]]=df[columns[comb[0]]]*df[columns[comb[1]]]
        
    return polynom_df


def transform(df,real_columns,cat_columns):
    df.is_a_copy=False

    df_real=df[real_columns]
    scaler = StandardScaler()
    df_real=pd.DataFrame(scaler.fit_transform(df_real),columns=real_columns)
    res=[df_real]
    df_cat=df[cat_columns]
    for column in df_cat.columns.values:
        data_slice=df_cat[column].astype(str)
        res.append(pd.get_dummies(data_slice,prefix =column, dummy_na=False))        
    res=pd.concat(res,axis=1)
    return res
def final_prepare_data(df):
    avg_month_cols=[str(i)+'avg_month' for i in range(1,moving_month_window+1)]
    avg_week_cols=[str(i)+'avg_week' for i in range(1,moving_week_window+1)]
    multiple_df=pd.concat([generate_features(df,avg_month_cols,2,'month'),
                generate_features(df,avg_week_cols,2,'week')],axis=1)
    avg_month_cols=[str(i)+'avg_month' for i in range(1,moving_month_window+1)]
    avg_week_cols=[str(i)+'avg_week' for i in range(1,moving_week_window+1)]

    real_features=['sales','total_days']+list(multiple_df.columns.values)#+avg_month_cols+avg_week_cols
    cat_features=['filial','store','day_of_week', 'month', 'month_part','week_num']
    
    res=pd.concat([df,multiple_df],axis=1)
    res.drop(avg_month_cols+avg_week_cols,inplace=True,axis=1)
    
    res=transform(res,real_features,cat_features)
    return res

Wall time: 168 ms


### Готовим данные

In [7]:
%%time
# извлекаем  части дат из данных
data=extract_date(data)
# добавляем данные за пред месяцы 
data=calc_prev_month(data)
# добавляем данные за пред недели
data=apply_moving_week_month(data)

1_week_before
2_week_before
3_week_before
4_week_before
5_week_before
Wall time: 17min 56s


In [8]:
data.columns

Index(['day', 'store', 'filial', 'sales', 'day_of_week', 'month', 'total_days',
       'num_day', 'month_part', 'week_num', 'date_withot_day', '1avg_week',
       '2avg_week', '3avg_week', '1avg_month', '2avg_month', '3avg_month'],
      dtype='object')

In [9]:
data.head(3)

Unnamed: 0,day,store,filial,sales,day_of_week,month,total_days,num_day,month_part,week_num,date_withot_day,1avg_week,2avg_week,3avg_week,1avg_month,2avg_month,3avg_month
0,2014-06-01,1001,Филиал Средне-Волжск,1897393.15,6,6,151,1,0,22,2014-06-01,13166810.0,13714080.0,13639930.0,60963675.9,60439630.0,60271910.0
1,2014-06-02,1001,Филиал Средне-Волжск,1652624.08,0,6,152,2,0,23,2014-06-01,13207150.0,13644270.0,13676720.0,60963675.9,60439630.0,60271910.0
2,2014-06-03,1001,Филиал Средне-Волжск,1543799.17,1,6,153,3,0,23,2014-06-01,13197460.0,13590680.0,13746560.0,60963675.9,60439630.0,60271910.0


In [None]:

#final_prepare_data(data)

In [10]:
data.to_csv(os.path.join(path,'data.csv'),sep=';')

### Строим прогноз

In [11]:
max_date=max(data.day)

def generate_day_df(df):
    df.is_a_copy=False
    start_date=max(df.day)
    temp_df=df[df['day']>max_date-timedelta(days=(moving_month_window+2)*31)][['day','store','filial','sales']]
    print(start_date)
    res_df=temp_df[['store','filial']].drop_duplicates().reset_index(drop=True)
    res_df['day']=start_date+timedelta(days=1)
    res_df['sales']=0
    res_df=pd.concat([temp_df,res_df],axis=0)
    
    #res_arr_element=[max_date+timedelta(days=1)]
    
    res_df.set_index(['day'],inplace=True)
    res_df=extract_date(res_df)
    #res_df.reset_index(inplace=True)
    #res_df=pd.concat([temp_df,res_df],axis=0)
    #
    res_df=calc_prev_month(res_df)
    
    #res_df.drop('index',axis=1,inplace=True)
    #res_df.set_index(['day'],inplace=True)
    #return res_df
    res_df=apply_moving_week_month(res_df)
    #res_df.reset_index(inplace=True)
    
    res_df=res_df[res_df['day']>start_date].reset_index(drop=True)
    
    return res_df[data.columns]

# Tuning ensemble

In [17]:
merged_data=final_prepare_data(data)
cv_split_test=15
n_folds=3
max_day=max(data.day)
train_index=data[data['day']<max_day-timedelta(days=cv_split_test*n_folds)].index
X_train=merged_data.ix[train_index].drop('sales',axis=1)
y_train=merged_data.ix[train_index]['sales'].values.ravel()

In [21]:
myCViterator = []
train_index=data[data['day']<max_day-timedelta(days=cv_split_test*n_folds)].index.values.astype(int)
test_index=data[data['day']>=max_day-timedelta(days=cv_split_test*n_folds)].index.values.astype(int)
for i in range(n_folds):
    trainIndices = train_index
    testIndices =  data[(data['day']>max_day-timedelta(days=cv_split_test*(i+1)))&
                        (data['day']<=max_day-timedelta(days=cv_split_test*i))].index.values.astype(int)
    myCViterator.append( (trainIndices, testIndices) )

In [56]:
%%time
models=[('Lasso',Lasso(),{'alpha':np.power(10.0, np.arange(-5, 5))}),
       ('Ridge',Ridge(),{'alpha':np.power(10.0, np.arange(-5, 5))}),
       ('RandomForest',RandomForestRegressor(n_estimators=100),{'min_samples_split':[3,2]}),
       ('ExtraTrees',ExtraTreesRegressor(n_estimators=100),{'bootstrap':[True,False] })]

def create_ensemble(X,cv,models,test):
    res=[]
    for model in models:
        X_train=X.drop('sales',axis=1)
        y_train=X['sales'].values.ravel()
        X_test=X.drop('sales',axis=1).ix[test]
        gs=GridSearchCV(model[1],model[2],scoring= make_scorer(r2_score),cv=cv).fit(X_train,y_train)
        best_score=gs.best_score_
        print (model[0],best_score ,gs.best_params_)
        res.append([gs.best_estimator_ ,best_score])
    return res
weighted_ensemble=create_ensemble(merged_data,myCViterator,models,test_index)



Lasso 0.826652590278 {'alpha': 0.001}
Ridge 0.826816512806 {'alpha': 1000.0}
RandomForest 0.83901319528 {'min_samples_split': 2}
ExtraTrees 0.867243696308 {'bootstrap': True}
Wall time: 1h 44min 57s


In [57]:


#ensemble=[LinearRegression(),RandomForestRegressor(n_estimators=100)]
#ensemble=[LinearRegression(),LinearRegression()]
#ensemble=[RandomForestRegressor(n_estimators=50),RandomForestRegressor(n_estimators=50)]


fitted_ensemble=lambda ensemble,data,target:[(model[0].fit(data,target),model[1]) for model in weighted_ensemble]
#fitted_ensemble=lambda ensemble,data,target:[model.fit(data,target) for model in ensemble]
#predict_ensemble=lambda fit_enseble,data:pd.DataFrame(
#    np.array([model.predict(data) for model in fit_enseble]).T).apply(np.mean,axis=1).values
predict_ensemble=lambda fit_enseble,data:pd.DataFrame(
    np.array([model[1]*model[0].predict(data) for model in fit_enseble]).T).apply(np.sum,axis=1).values/np.sum([model[1] for model in fit_enseble])

In [58]:
%%time
y_test=data.ix[test_index]['sales'].values.ravel()
X_test=merged_data.ix[test_index].drop('sales',axis=1)
X_train=merged_data.ix[trainIndices].drop('sales',axis=1)
y_train=data.ix[trainIndices]['sales'].values.ravel()
y_pred=predict_ensemble(fitted_ensemble(weighted_ensemble,X_train,y_train),X_test)



Wall time: 13min 55s


In [13]:
def forecast_day(df):
    df.reset_index(drop=True,inplace=True)
    
    add_day=generate_day_df(df)


    union_df_origin=pd.concat([df,add_day],axis=0).reset_index(drop=True)
    union_df=final_prepare_data(union_df_origin)
    
    
    train_data=union_df.ix[:df.shape[0]].drop('sales',axis=1)
    y_train=union_df_origin.ix[:df.shape[0]]['sales'].values.ravel()

    test_data=union_df.ix[df.shape[0]:].drop('sales',axis=1)
    y_test=union_df_origin.ix[df.shape[0]:]['sales'].values.ravel() 

    
    fitted_models=fitted_ensemble(ensemble,train_data,y_train)
    y_pred=predict_ensemble(fitted_models,test_data)
    add_day['sales']=y_pred
    
    
    return add_day

In [14]:
%%time
def forecast_period(df,n_days=14):
    df.is_a_copy=False
    max_date=max(df.day)
    for i in range(n_days):
        add_day=forecast_day(df)
        print(add_day.shape)
        print(add_day.sales.sum())
        df=pd.concat([df,add_day],axis=0).reset_index(drop=True)
        print(i)
        df.to_excel(os.path.join(path,'final_fc_ens.xlsx'),index=False)
    return df
forecast_df=forecast_period (data)  

2017-03-28 00:00:00
1_week_before
2_week_before
3_week_before
4_week_before
5_week_before
(86, 17)
204694565.7727001
0
2017-03-29 00:00:00
1_week_before
2_week_before
3_week_before
4_week_before
5_week_before
(86, 17)
221693292.2447471
1
2017-03-30 00:00:00
1_week_before
2_week_before
3_week_before
4_week_before
5_week_before
(86, 17)
295004251.66319996
2
2017-03-31 00:00:00
1_week_before
2_week_before
3_week_before
4_week_before
5_week_before
(86, 17)
343068431.7875
3
2017-04-01 00:00:00
1_week_before
2_week_before
3_week_before
4_week_before
5_week_before
(86, 17)
270010601.6885
4
2017-04-02 00:00:00
1_week_before
2_week_before
3_week_before
4_week_before
5_week_before
(86, 17)
216674454.07349503
5
2017-04-03 00:00:00
1_week_before
2_week_before
3_week_before
4_week_before
5_week_before
(86, 17)
212697725.74207166
6
2017-04-04 00:00:00
1_week_before
2_week_before
3_week_before
4_week_before
5_week_before
(86, 17)
212861950.44738495
7
2017-04-05 00:00:00
1_week_before
2_week_before
3_

In [None]:
forecast_df.to_excel(os.path.join(path,'fc_sales.xlsx'))

In [None]:
os.path.join(path,'total_sales.csv'

In [None]:
fitted_models=fitted_ensemble(ensemble,t[0],t[1])
y_pred=predict_ensemble(fitted_models,t[2])

In [None]:
predict_ensemble(fitted_models,t[2])

In [None]:
%%time
fitted_models=fitted_ensemble(ensemble,X.drop('sales',axis=1),data['sales'].values.ravel())

In [None]:

t=generate_day_df(data)
t.head(3)

In [None]:
max(t.day)

In [None]:
t[data.columns]

In [None]:


multiple_df=pd.concat([generate_features(data,avg_month_cols,2,'month'),
                generate_features(data,avg_week_cols,2,'week')],axis=1)

In [None]:
avg_month_cols=[str(i)+'avg_month' for i in range(1,moving_month_window+1)]
avg_week_cols=[str(i)+'avg_week' for i in range(1,moving_week_window+1)]

real_features=['sales','total_days']+list(multiple_df.columns.values)#+avg_month_cols+avg_week_cols
cat_features=['filial','store','day_of_week', 'month', 'month_part','week_num']

In [None]:
data=pd.concat([data,multiple_df],axis=1)
data.drop(avg_month_cols+avg_week_cols,inplace=True,axis=1)

In [None]:
1==1

In [None]:

X=transform(data,real_features,cat_features)

In [None]:
X.head()

In [None]:
#year=data['day'].apply(lambda t:t.year)
#train_index=year[year<2017].index
#test_index=year[year>2016].index
train_index=data[data['day']<date(2017, 3, 1)].index
test_index=data[data['day']>=date(2017, 3, 1)].index

In [None]:
train_data=X.ix[train_index].drop('sales',axis=1)
y_train=data.ix[train_index]['sales'].values.ravel()
test_data=X.ix[test_index].drop('sales',axis=1)
y_test=data.ix[test_index]['sales'].values.ravel()

In [None]:
%%time
lr=LinearRegression()
rf=RandomForestRegressor(n_estimators=100)
lr.fit(train_data,y_train)
rf.fit(train_data,y_train)
y_pred_lr=lr.predict(test_data)
y_pred_rf=rf.predict(test_data)

## Финальный прогноз

In [None]:
ensemble=[LinearRegression(),RandomForestRegressor(n_estimators=100)]
fitted_ensemble=lambda ensemble,data,target:[model.fit(data,target) for model in ensemble]
predict_ensemble=lambda fit_enseble,data:[model.predict(data) for model in fit_enseble].mean()

In [None]:
fitted_models=fitted_ensemble(ensemble,X.drop('sales',axis=1),data['sales'].values.ravel())

In [None]:
t=pd.DataFrame(np.hstack([y_pred_rf.reshape(-1,1),y_pred_lr.reshape(-1,1)]),columns=['lr'])
t

In [None]:
y_pred_rf.sum(),y_test.sum()

In [None]:
(y_pred[10]-y_test[10])/y_test[10]

In [None]:
r2_score(y_test,y_pred_lr),r2_score(y_test,y_pred_rf)

In [None]:
r2_score(y_test,y_pred_lr),r2_score(y_test,y_pred_rf)

In [None]:
((y_test+y_pred_rf)/2).sum(),y_test.sum()

In [None]:
plt.figure(figsize(15,7))
data.sales.plot()
#plt.ylabel(u'')
#plt.title('')
pylab.show()

In [None]:
data