In [1]:
from __future__ import division, print_function

from itertools import product
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import tqdm
import warnings

import seaborn as sns

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,8)

# Для кириллицы на графиках
font = {'family': 'Verdana',
        'weight': 'normal'}
plt.rc('font', **font)

import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm

import scipy.stats as scs
from scipy.optimize import minimize

from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

warnings.filterwarnings('default')

## Задача:
    Расчитать продажы для магазинов в будущем
    Использую те фичи которые представленные в teste

In [238]:
data_train = pd.read_csv('train.csv.zip')
data_test = pd.read_csv('test.csv.zip')
data_store = pd.read_csv('store.csv.zip')

  interactivity=interactivity, compiler=compiler, result=result)


In [431]:
df = data_store.merge(data_train,on='Store')
df_test = data_test.merge(data_store, on = 'Store')

# На обучение учитываем только открытые магазины
df = df[df['Open'] != 0] 

## Собираю Pipline

In [430]:
from sklearn.preprocessing import  LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, Imputer
from sklearn.preprocessing import CategoricalEncoder
from sklearn.base import BaseEstimator, TransformerMixin


class Simple_pipeline(BaseEstimator):
    
    """
    Класс обработки данных и применения к ним линейной регрессии
    
    Объект использует метод get_pipline
    Принимает DataFrame
    Возвращает обработанную разряженную матрицу 
    
    """
    
    def __init__(self, df):
        self.df = df
        
        

    def get_date(self,df):
        
        """
        Функция создает новые фьючерсы по отдельности дни недели и месяцы
        Использует фьючерс Day
        
        """
        
        df['Year'] = df['Date'].apply(lambda x: int(x[:4]))
        df['Month'] = df['Date'].apply(lambda x: int(x[5:7]))
        df['Day'] = df['Date'].apply(lambda x: int(x[8:]))
        
        
        
        return df[['Month','Day','Year']]
    
    def fill_nan_future_float(self,df):
        
        """
        Функция принимает DataFrame 
        
        Берет признаки с типом float
        
        Заполняет все пропуски 0
        
        Возвращает эти признаки в пайплайн 
        
        - Важно указать Стандартизацию данных
        
        """
        # В тесте есть баг его надо поправить
        df['Open'].fillna(1,inplace=True)
        df['Open'] = df['Open'].astype('int64')
            
        name_float = []
        for name in df.columns:
    
            if df[name].dtype == 'float64':
            
                name_float.append(name)
                df[name].fillna(df[name].mean(),inplace=True)
        
        return df[name_float]
    
    def work_object_future(self,df):
        
        """
        Функция принимает DataFrame
        Работаем здесь с Фичами объекты заполнение и подготовка к onehot encoding
        Возвращаю Фьючи для Categorical Encoding
        
        - Для линейной регресси делаем onehotencoding
        
        """
        
        # Сначала обработаем фичу 'PromoInterval'] Она содержит и пропуски и строковые обозначения с запятыми
        # преобразуем этот признак
        
        # уберем запятые чтобы Mar,Jun,Sept,Dec принял вид MarJunSeptDec и заполним пропуски "0"
        df['PromoInterval'] = df['PromoInterval'].apply(lambda x: x.replace(',','') if x == x else '0')
        
        # Подправим небольшой баг в признаке  StateHoliday
        df['StateHoliday'] = df['StateHoliday'].replace(0, '0')
        
        return df[['Store','StoreType', 'Assortment','Promo2', 'PromoInterval','DayOfWeek','Open','Promo','StateHoliday','SchoolHoliday']]
    
    
    
    
    def get_pipline(self):

        pipeline = make_union(*[
            
            # 1 Создаем Новые признаки День и месяц
            make_pipeline(FunctionTransformer(self.get_date, validate=False)),
                          
            # 2 Заполняем все числовые признаки 0
            make_pipeline(FunctionTransformer(self.fill_nan_future_float, validate=False),StandardScaler()),
            
            # 3 Преобразуем категориальные признаки
            make_pipeline(FunctionTransformer(self.work_object_future, validate=False),CategoricalEncoder(encoding='ordinal'))
            
            
        ])
        
        return pipeline.fit_transform(self.df)

In [432]:
train_pipline = Simple_pipeline(df).get_pipline()

In [433]:
test_pipline = Simple_pipeline(df_test).get_pipline()

In [434]:

train_pipline.shape, test_pipline.shape

((844392, 18), (41088, 18))

In [435]:
y = df['Sales']
y.shape

(844392,)

In [436]:
# Учтем временной период! Обучим на начале выборки протестируем в конце
train_size = int(train_pipline.shape[0]*.98)
X_train, y_train = train_pipline[:train_size], y[:train_size]
X_test, y_test = train_pipline[train_size:], y[train_size:]

In [366]:
# Следубщий код вщять с кэгла! Метрика 

def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w


def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe


def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

In [367]:
# Обучим XGBoost

import xgboost as xgb

params = {"objective": "reg:linear",
          "eta": 0.3,
          "max_depth": 8,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1
          }
num_trees = 300

In [437]:
dtrain = xgb.DMatrix(X_train, np.log(y_train.values + 1))
dvalid = xgb.DMatrix(X_test, np.log(y_test.values + 1))
dtest = xgb.DMatrix(test_pipline)

watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)



[0]	eval-rmse:5.76525	train-rmse:5.79447	eval-rmspe:0.996648	train-rmspe:0.9968
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 50 rounds.
[1]	eval-rmse:4.03497	train-rmse:4.06451	eval-rmspe:0.980543	train-rmspe:0.981398
[2]	eval-rmse:2.8316	train-rmse:2.8555	eval-rmspe:0.935398	train-rmspe:0.937844
[3]	eval-rmse:1.98833	train-rmse:2.01298	eval-rmspe:0.851412	train-rmspe:0.856237
[4]	eval-rmse:1.40901	train-rmse:1.42875	eval-rmspe:0.740795	train-rmspe:0.743241
[5]	eval-rmse:1.00732	train-rmse:1.02738	eval-rmspe:0.629199	train-rmspe:0.619027
[6]	eval-rmse:0.748134	train-rmse:0.752721	eval-rmspe:0.55119	train-rmspe:0.505526
[7]	eval-rmse:0.581209	train-rmse:0.569934	eval-rmspe:0.513984	train-rmspe:0.417714
[8]	eval-rmse:0.485106	train-rmse:0.450467	eval-rmspe:0.527615	train-rmspe:0.356284
[9]	eval-rmse:0.436194	train-rmse:0.377074	eval-rmspe:0.554924	train-rmspe:0.323405
[10]	eval-rmse:0.405674	train-

[96]	eval-rmse:0.418408	train-rmse:0.139059	eval-rmspe:0.742592	train-rmspe:0.173195
[97]	eval-rmse:0.41857	train-rmse:0.138655	eval-rmspe:0.744178	train-rmspe:0.174669
[98]	eval-rmse:0.418437	train-rmse:0.138441	eval-rmspe:0.743982	train-rmspe:0.174469
[99]	eval-rmse:0.418368	train-rmse:0.138004	eval-rmspe:0.743145	train-rmspe:0.17415
[100]	eval-rmse:0.418242	train-rmse:0.137593	eval-rmspe:0.74305	train-rmspe:0.173835
[101]	eval-rmse:0.418219	train-rmse:0.137234	eval-rmspe:0.743139	train-rmspe:0.173481
[102]	eval-rmse:0.418081	train-rmse:0.136809	eval-rmspe:0.742678	train-rmspe:0.172532
[103]	eval-rmse:0.417977	train-rmse:0.136571	eval-rmspe:0.742512	train-rmspe:0.172423
[104]	eval-rmse:0.418711	train-rmse:0.13648	eval-rmspe:0.744884	train-rmspe:0.172332
[105]	eval-rmse:0.418801	train-rmse:0.135905	eval-rmspe:0.745234	train-rmspe:0.171894
[106]	eval-rmse:0.418597	train-rmse:0.135559	eval-rmspe:0.744529	train-rmspe:0.171505
[107]	eval-rmse:0.418603	train-rmse:0.135254	eval-rmspe:0.7447

[192]	eval-rmse:0.421688	train-rmse:0.119722	eval-rmspe:0.735834	train-rmspe:0.150235
[193]	eval-rmse:0.421612	train-rmse:0.119664	eval-rmspe:0.73473	train-rmspe:0.150187
[194]	eval-rmse:0.42158	train-rmse:0.1196	eval-rmspe:0.734475	train-rmspe:0.150158
[195]	eval-rmse:0.421804	train-rmse:0.119356	eval-rmspe:0.734539	train-rmspe:0.149866
[196]	eval-rmse:0.421711	train-rmse:0.119247	eval-rmspe:0.734313	train-rmspe:0.149792
[197]	eval-rmse:0.421821	train-rmse:0.119193	eval-rmspe:0.73441	train-rmspe:0.149739
[198]	eval-rmse:0.42196	train-rmse:0.119039	eval-rmspe:0.733919	train-rmspe:0.14829
[199]	eval-rmse:0.421886	train-rmse:0.118943	eval-rmspe:0.734268	train-rmspe:0.148218
[200]	eval-rmse:0.421997	train-rmse:0.118726	eval-rmspe:0.744022	train-rmspe:0.148758
[201]	eval-rmse:0.421934	train-rmse:0.118543	eval-rmspe:0.744041	train-rmspe:0.148536
[202]	eval-rmse:0.421931	train-rmse:0.118422	eval-rmspe:0.743951	train-rmspe:0.148449
[203]	eval-rmse:0.421911	train-rmse:0.118376	eval-rmspe:0.743

[288]	eval-rmse:0.424062	train-rmse:0.110594	eval-rmspe:0.741768	train-rmspe:0.140941
[289]	eval-rmse:0.424009	train-rmse:0.110536	eval-rmspe:0.739429	train-rmspe:0.140879
[290]	eval-rmse:0.424006	train-rmse:0.110489	eval-rmspe:0.739393	train-rmspe:0.141541
[291]	eval-rmse:0.424037	train-rmse:0.110449	eval-rmspe:0.739931	train-rmspe:0.141511
[292]	eval-rmse:0.424197	train-rmse:0.110408	eval-rmspe:0.740286	train-rmspe:0.141453
[293]	eval-rmse:0.424257	train-rmse:0.110321	eval-rmspe:0.740323	train-rmspe:0.141447
[294]	eval-rmse:0.424141	train-rmse:0.11026	eval-rmspe:0.740013	train-rmspe:0.141407
[295]	eval-rmse:0.424231	train-rmse:0.110049	eval-rmspe:0.737973	train-rmspe:0.14132
[296]	eval-rmse:0.424259	train-rmse:0.109904	eval-rmspe:0.738106	train-rmspe:0.141244
[297]	eval-rmse:0.424166	train-rmse:0.109807	eval-rmspe:0.738134	train-rmspe:0.139727
[298]	eval-rmse:0.424203	train-rmse:0.109678	eval-rmspe:0.738126	train-rmspe:0.139425
[299]	eval-rmse:0.424165	train-rmse:0.109611	eval-rmspe:

In [418]:
test_probs = gbm.predict(xgb.DMatrix(test_pipline))

submission = pd.DataFrame({"Id": df_test["Id"], "Sales": np.exp(test_probs) - 1})
submission.to_csv("xgboost_submission_1.csv", index=False)