In [1]:
# load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from sklearn import linear_model
import datetime
from itertools import product
import statsmodels.api as sm
import warnings
from sklearn import preprocessing

%matplotlib inline

In [2]:
def regression(res, alpha = 0.1, plot = False,verbose = False):
    X = res.drop('trip_count',axis = 1)        
    y = res.loc[:,'trip_count'];
    lassoReg = linear_model.Lasso(alpha = alpha, max_iter = 1e5,fit_intercept = True,random_state = 0);
    lassoReg.fit(X,y)
    y_pr = pd.Series(data = lassoReg.predict(X), index = res.index)
    R = lassoReg.score(X,y);
    print 'R factor is ', R
    
    if plot:
        plt.figure(figsize = [15,10])
        plt.subplot(211)
        plt.plot(y)
        plt.plot(y_pr)
        plt.legend(['Original data','Predicted'])

        plt.subplot(212)
        plt.plot(y-y_pr)
        plt.legend(['Residuals'])
        
    return [y_pr, y-y_pr, lassoReg]

In [3]:
def getRegressor(regressor, start_date = '2016-05-15 00:00:00', end_date = '2016-05-20 23:00:00'):
    predictionStart = datetime.datetime.strptime(start_date,'%Y-%m-%d %H:%M:%S')
    predictionEnd = datetime.datetime.strptime(end_date,'%Y-%m-%d %H:%M:%S')
    date_index = pd.date_range(predictionStart, predictionEnd, freq='H')
   
    #какой-то пипец. Должен быть способ сделать это проще.
    features = date_index.to_series().to_frame()
    features = addFeatures(features,verbose = True)
    features = features.drop(0,axis = 1)
    exog = regressor.predict(features)
    #print 
    #exog = np.expand_dims(,axis = 1)
    #print exog
    return pd.Series(exog,index = date_index)

In [4]:
def addFeatures(res, Kw = 6, Ka = 3,verbose = False):    
    # add linear feature
    res = res.assign(hours = (res.index - datetime.datetime(2014,1,1,0,0,0))/np.timedelta64(1, 'h'))
    
    # добавляем гармонические фичи
    for ind in range(1,Kw+1):
        res['weekCos'+str(ind)]= np.cos(np.pi*res.hours*ind/168);
        res['weekSin'+str(ind)]= np.sin(np.pi*res.hours*ind/168);
    for ind in range(1,Ka+1):
        res['yearCos'+str(ind)]= np.cos(2*np.pi*res.hours*ind/8766);
        res['yearSin'+str(ind)]= np.sin(2*np.pi*res.hours*ind/8766);
        
    # добавляем dummy variables для дней недели
    lbDays = preprocessing.LabelBinarizer()
    lbDays.fit(list(np.arange(6)))
    DoW = pd.DataFrame(lbDays.transform(res.index.dayofweek),columns = ['DayOfWeek_'+str(x) for x in np.arange(6)],
                       index = res.index)      
    res = res.merge(DoW,left_index=True,right_index=True)
 
    # добавляем dummy variables для месяца
    lbMonths = preprocessing.LabelBinarizer()
    lbMonths.fit(list(np.arange(12)))
    Months = pd.DataFrame(lbMonths.transform(res.index.month),columns = ['Month_'+str(x) for x in np.arange(12)],index = res.index)      
    res = res.merge(Months,left_index=True,right_index=True);
    return res

In [6]:
# id нужных регионов
regsDf = pd.read_csv('../crowdRegs.csv',names=['id','regId']);  

# временные ряды для этих регионов
df = pd.read_pickle('../crowdRegs2.pcl')
df.columns = regsDf.regId.values.astype('str')

# словарь с группировкой рядов
tsGroups = np.load('tsGroups.npy').item()

# словарь с оптимальными параметрами для каждой группы
paramsGroups = np.load('paramsGroups.npy').item()

*Логика скрипта:*
<ol>
<li> Выбираем одну группу
<li> В группе выбираем один ряд
<li> По номеру группы подгружаем оптимальные параметры
<li> Обучаем регрессор
<li> Обучаем SARIMAX модель
<li> Сохраняем модель (??? Может быть без данных, чтобы сэкономить место).
<li> Делаем предсказание
<li> Сохраняем предсказение
<li> Идём на второй или первый шаг
<ol>
    

In [7]:
# тестируем процедуру на одном ряду. Например, группа gr0, ряд 1283

In [None]:
# диапазон дат для обучения
startFit = '2016-01-01 0:0:0'
endFit = '2016-04-30 23:59:59'

# диапазон дат для предсказания
startPrediction = '2016-01-01 00:00:00'
endPrediction = '2016-05-30 00:00:00'
predictionRange = pd.date_range(startPrediction, endPrediction, freq='H')

# create array to save prediction results
mIndex = pd.MultiIndex.from_product([df.columns.values, predictionRange])
resDf = pd.DataFrame(index = mIndex, columns = ['y','err'])


grId = 'gr0'

for grId, ts in tsGroups.iteritems():
    print 'Group ID is', grId
    
    # получаем параметры SARIMAX модели
    params = paramsGroups.get(grId)[1] 
    
    for tsId in ts:
        print 'Regions is ', tsId
        # получаем временной ряд
        ts = df.loc[startFit:endFit,tsId] #

        # обучаем регрессор
        ts = ts.to_frame(name = 'trip_count')
        [r_pr, res, regressor] = regression(addFeatures(ts),verbose = True)

        # обучаем SARIMAX модель
        print 'Learn SARIMAX'
        try:
            mSARIMA=sm.tsa.statespace.SARIMAX(ts, order=[params[0], 1, params[1]],
                                              seasonal_order=(params[2], 1, params[3], 24),
                                              exog = r_pr, enforce_invertibility = True).fit(disp=1);
        except Exception as inst:
            print type(inst)     
            print inst          

        # получаем предсказания регрессора на весь диапазон дат (обучение+предсказание)
        exog = getRegressor(regressor,startFit,endPrediction)
        # получаем данные о поездкахы на весь диапазон дат
        endog = df.loc[startFit:endPrediction,tsId]
       
        # создаём новую модель, которую будет использовать для предсказания
        # Для чего такой финт ушами - не понимаю до сих пор
        model_fitted = sm.tsa.statespace.SARIMAX(endog, order=[params[0], 1, params[1]],
                                                 seasonal_order=(params[2], 1, params[3], 24),
                                                 exog = exog).filter(mSARIMA.params)
         
        # проходим по всему диапазону дат предсказаний
        print 'Make prediction'
        for firstLag in predictionRange:
            lastLag = firstLag+datetime.timedelta(hours = 5)
            # prediction
            try:
                predicted_data = model_fitted.predict(firstLag, lastLag, dynamic=True, exog = exog[firstLag:lastLag])
                # save results
                resDf.loc[tsId,startPrediction].y = predicted_data
                resDf.loc[tsId,startPrediction].err = (df.loc[startPrediction:endPrediction,tsId]-predicted_data).abs().mean()
            except Exception as inst:
                print 'Prediction error'
                print inst
                
    # save results
    resDf.to_pickle('predictionResults.pcl')

Group ID is gr18
Regions is  1273
R factor is  0.0348605313164
Learn SARIMAX




Make prediction


  return np.abs(self)


Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Regions is  1274
R factor is  0.0871641080872
Learn SARIMAX




Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Group ID is gr19
Regions is  1434
R factor is  0.018278110283
Learn SARIMAX




Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Regions is  1435
R factor is  0.101333276704
Learn SARIMAX




Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Regions is  1437
R factor is  0.0880380582527
Learn SARIMAX




Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Regions is  1438
R factor is  0.0473341631545
Learn SARIMAX




Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Regions is  1630
R factor is  0.0654912922419
Learn SARIMAX




Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Regions is  1684
R factor is  0.118775426739
Learn SARIMAX




Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Group ID is gr10
Regions is  1333
R factor is  0.0442996592015
Learn SARIMAX
Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not 



Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Regions is  1733
R factor is  0.0433368135015
Learn SARIMAX
Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriat



0.0854515765533
Learn SARIMAX
Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Regions is  1223
R factor is  0.0940933039937
Learn SARIMAX




Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Regions is  1224
R factor is  0.113294640454
Learn SARIMAX


