In [1]:
# load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, model_selection, preprocessing

import datetime
from itertools import product
import statsmodels.api as sm

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [22]:
def regression(res, alpha = 0.1, plot = False,verbose = False, searchBestFit = False):
    X = res.drop('trip_count',axis = 1)        
    y = res.loc[:,'trip_count'];
    
    if searchBestFit:
        # создать словарь параметров
        param_grid = {'alpha': [x for x in np.linspace(10,100,10)],'l1_ratio': [0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 1]}
        # создать кросс-валидацию для временных рядов
        tscv = model_selection.TimeSeriesSplit()
        
        # запустить поиск оптимальных параметров
        regressor = linear_model.ElasticNet()
        clf = model_selection.GridSearchCV(regressor, param_grid, n_jobs=4, cv=tscv, verbose=1)
        clf.fit(X,y)
        regressor = clf.best_estimator_        
        print 'Best params is', clf.best_params_ 
        
        
    else:
        regressor = linear_model.Lasso(alpha = alpha, max_iter = 1e5,fit_intercept = True,random_state = 0);
        regressor.fit(X,y)
        
    y_pr = pd.Series(data = regressor.predict(X), index = res.index)
    R = regressor.score(X,y);
    print 'R factor is ', R
    
    if plot:
        plt.figure(figsize = [15,10])
        plt.subplot(211)
        plt.plot(y)
        plt.plot(y_pr)
        plt.legend(['Original data','Predicted'])

        plt.subplot(212)
        plt.plot(y-y_pr)
        plt.legend(['Residuals'])
        
    return [y_pr, y-y_pr, regressor]

In [4]:
def getRegressor(regressor, start_date = '2016-05-15 00:00:00', end_date = '2016-05-20 23:00:00'):
    predictionStart = datetime.datetime.strptime(start_date,'%Y-%m-%d %H:%M:%S')
    predictionEnd = datetime.datetime.strptime(end_date,'%Y-%m-%d %H:%M:%S')
    date_index = pd.date_range(predictionStart, predictionEnd, freq='H')
   
    #какой-то пипец. Должен быть способ сделать это проще.
    features = date_index.to_series().to_frame()
    features = addFeatures(features,verbose = True)
    features = features.drop(0,axis = 1)
    exog = regressor.predict(features)
    return pd.Series(exog,index = date_index)

In [5]:
def addFeatures(res, Kw = 6, Ka = 3,verbose = False):    
    # add linear feature
    res = res.assign(hours = (res.index - datetime.datetime(2014,1,1,0,0,0))/np.timedelta64(1, 'h'))
    
    # добавляем гармонические фичи
    for ind in range(1,Kw+1):
        res['weekCos'+str(ind)]= np.cos(np.pi*res.hours*ind/168);
        res['weekSin'+str(ind)]= np.sin(np.pi*res.hours*ind/168);
    for ind in range(1,Ka+1):
        res['yearCos'+str(ind)]= np.cos(2*np.pi*res.hours*ind/8766);
        res['yearSin'+str(ind)]= np.sin(2*np.pi*res.hours*ind/8766);
        
    # добавляем dummy variables для дней недели
    lbDays = preprocessing.LabelBinarizer()
    lbDays.fit(list(np.arange(6)))
    DoW = pd.DataFrame(lbDays.transform(res.index.dayofweek),columns = ['DayOfWeek_'+str(x) for x in np.arange(6)],
                       index = res.index)      
    res = res.merge(DoW,left_index=True,right_index=True)
 
    # добавляем dummy variables для месяца
    lbMonths = preprocessing.LabelBinarizer()
    lbMonths.fit(list(np.arange(12)))
    Months = pd.DataFrame(lbMonths.transform(res.index.month),columns = ['Month_'+str(x) for x in np.arange(12)],index = res.index)      
    res = res.merge(Months,left_index=True,right_index=True);
    return res

In [6]:
def findHyperParams(ts,pList = []):
    
    if (len(pList) ==0):
        # create list of parameters
        ps = range(2, 7)
        d  = 1
        qs = range(2, 7)

        Ps = range(1, 3)
        D  = 1
        Qs = range(1, 3)

        pList = list(product(ps, qs, Ps, Qs))
    
    results = []
    best_aic = float("inf")
    
    # add features
    ts = addFeatures(ts, Kw = 6, Ka = 3)
    # regression
    [s, r, lasso] = regression(ts,verbose = True, searchBestFit = True)
    
    # loop over parameters' list
    for param in pList:
        #try except нужен, потому что на некоторых наборах параметров модель не обучается
        print('Parameters:', param)
        try:
            mSARIMA=sm.tsa.statespace.SARIMAX(ts.loc[:,'trip_count'], order=[param[0], 1, param[1]],
                                          seasonal_order=(param[2], 1, param[3], 24),exog = r).fit(disp=1);
        #выводим параметры, на которых модель не обучается и переходим к следующему набору
        except Exception as inst:
            print inst           # __str__ allows args to be printed directly
            continue
        else:     
            aic = mSARIMA.aic
            print('AIC',aic) 
            #сохраняем лучшую модель, aic, параметры
            if aic < best_aic:
                best_model = mSARIMA
                best_aic = aic
                best_param = param
            results.append([param, mSARIMA.aic])
    return [best_aic,best_param]

In [7]:
# id нужных регионов
regsDf = pd.read_csv('../crowdRegs.csv',names=['id','regId']);  

# временные ряды для этих регионов
df = pd.read_pickle('../loadData/crowdRegs3.pcl')
df.columns = regsDf.regId.values.astype('str')

# словарь с группировкой рядов
tsGroups = np.load('tsGroups.npy').item()

# словарь с оптимальными параметрами для каждой группы
paramsGroups = np.load('paramsGroups.npy').item()

*Логика скрипта:*
<ol>
<li> Выбираем одну группу
<li> В группе выбираем один ряд
<li> По номеру группы подгружаем оптимальные параметры
<li> Обучаем регрессор
<li> Обучаем SARIMAX модель
<li> Сохраняем модель (??? Может быть без данных, чтобы сэкономить место).
<li> Делаем предсказание
<li> Сохраняем предсказение
<li> Идём на второй или первый шаг
<ol>
    

In [None]:
# диапазон дат для обучения
startFit = '2016-01-01 0:0:0'
endFit = '2016-04-30 23:00:00'

err = 0

# диапазон дат для предсказания
startPrediction = '2016-05-01 00:00:00'
endPrediction = '2016-05-31 23:00:00'
predictionRange = pd.date_range(startPrediction, endPrediction, freq='H')

# create array to save prediction results
mIndex = pd.MultiIndex.from_product([df.columns.values, predictionRange])
resDf = pd.DataFrame(index = mIndex, columns = ['y','err'])

for grId, ts in tsGroups.iteritems():
    print 'Group ID is', grId
    
    # получаем параметры SARIMAX модели
    params = paramsGroups.get(grId)[1] 
    
    for tsId in ts:
        print 'Regions is ', tsId
        # получаем временной ряд
        ts = df.loc[startFit:endFit,tsId] #

        # обучаем регрессор
        ts = ts.to_frame(name = 'trip_count')
        [r_pr, res, regressor] = regression(addFeatures(ts),verbose = True)

        # обучаем SARIMAX модель
        print 'Learn SARIMAX'
        try:
            mSARIMA=sm.tsa.statespace.SARIMAX(ts, order=[params[0], 2, params[1]],
                                              seasonal_order=(params[2], 1, params[3], 24),
                                              exog = r_pr, enforce_invertibility = True).fit(disp=1);
        except Exception as inst:
            print type(inst)     
            print inst          

        # получаем предсказания регрессора на весь диапазон дат (обучение+предсказание)
        exog = getRegressor(regressor,startFit,endPrediction)
        # получаем данные о поездкахы на весь диапазон дат
        endog = df.loc[startFit:endPrediction,tsId]
       
        # создаём новую модель, которую будет использовать для предсказания
        # Для чего такой финт ушами - не понимаю до сих пор
        try:
            model_fitted = sm.tsa.statespace.SARIMAX(endog, order=[params[0], 1, params[1]],
                                                 seasonal_order=(params[2], 1, params[3], 24),
                                                 exog = exog).filter(mSARIMA.params)
        except Exception as inst:
            print 'Can not create the model'
            print inst
            continue
            
        # проходим по всему диапазону дат предсказаний
        print 'Make prediction'
        for firstLag in predictionRange[:-5]:
            lastLag = firstLag+datetime.timedelta(hours = 5)
            # prediction
            try:
                predicted_data = model_fitted.predict(firstLag, lastLag, dynamic=True, exog = exog[firstLag:lastLag])
            except Exception as inst:
                print 'Prediction error'
                print inst
            else:
                # save results
                #resDf.loc[tsId,firstLag].y = predicted_data
                err += (df.loc[startPrediction:endPrediction,tsId]-predicted_data).abs().sum()
                #resDf.loc[tsId,firstLag].err = (df.loc[startPrediction:endPrediction,tsId]-predicted_data).abs().mean()
                    
    # save results
    #resDf.to_pickle('predictionResults.pcl')
    
print 'Total error is', err    

In [6]:
resDf = pd.read_pickle('predictionResults.pcl')

In [16]:
# выберу настроечные ряды руками
fitSeries = {'gr18':'1274','gr19':'1684','gr10':'1333','gr11':'1075','gr12':'2118','gr13':'1387','gr14':'1384','gr15':'1174'}
fitSeries.update({'gr16':'1483','gr17':'1282','gr21':'1184','gr20':'1131','gr23':'1332','gr22':'1580','gr6':'1177','gr7':'1388'})
fitSeries.update({'gr4':'1128','gr2':'1234','gr3':'1231','gr0':'1286','gr1':'1125','gr8':'1181','gr9':'1532'})

In [57]:
# подбираем параметры для группы 'gr2'
startDate = '2016-01-01 0:0:0'
endDate = '2016-04-30 23:59:59'

grName = 'gr2'
newParams = findHyperParams(df.loc[startDate:endDate,fitSeries.get(grName)].to_frame(name = 'trip_count'))
paramsGroups.update({grName:newParams})
np.save('paramsGroups.npy', paramsGroups)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:    2.3s finished


Best params is {'alpha': 0.6}
R factor is  0.0260003769959
('Parameters:', (2, 2, 1, 1))
Non-stationary starting autoregressive parameters found with `enforce_stationarity` set to True.
('Parameters:', (2, 2, 1, 2))
Non-stationary starting autoregressive parameters found with `enforce_stationarity` set to True.
('Parameters:', (2, 2, 2, 1))
Non-stationary starting autoregressive parameters found with `enforce_stationarity` set to True.
('Parameters:', (2, 2, 2, 2))
Non-stationary starting autoregressive parameters found with `enforce_stationarity` set to True.
('Parameters:', (2, 3, 1, 1))
Non-stationary starting autoregressive parameters found with `enforce_stationarity` set to True.
('Parameters:', (2, 3, 1, 2))
Non-stationary starting autoregressive parameters found with `enforce_stationarity` set to True.
('Parameters:', (2, 3, 2, 1))
Non-stationary starting autoregressive parameters found with `enforce_stationarity` set to True.
('Parameters:', (2, 3, 2, 2))
Non-stationary startin

In [58]:
newParams

[22302.510866289762, (6, 2, 2, 1)]

In [47]:
# диапазон дат для обучения
startFit = '2015-01-01 0:0:0'
endFit = '2016-05-31 23:00:00'

err = 0

# диапазон дат для предсказания
startPrediction = '2016-05-31 18:00:00'
endPrediction   = '2016-06-30 23:00:00'
predictionRange = pd.date_range(startPrediction, endPrediction, freq='H')

# словарь с оптимальными параметрами для каждой группы
paramsGroups = np.load('paramsGroups.npy').item()

# create array to save prediction results
mIndex = pd.MultiIndex.from_product([df.columns.values, predictionRange])
#resDf = pd.DataFrame(index = mIndex, columns = ['y','err'])
# load data
#resDf = pd.read_pickle('predictionResults5.pcl')
recalcRegions = [1272,1377]


for grId, ts in tsGroups.iteritems(): #{k: v for k, v in tsGroups.iteritems() if k in recalcGroups}.iteritems():
#for grId, ts in {k: v for k, v in tsGroups.iteritems() if v in recalcRegions}.iteritems():
    
    print 'Group ID is', grId
    
    # получаем параметры SARIMAX модели
    params = paramsGroups.get(grId)[1] 
        
    for tsId in ts:
        
        # 
        #if ~np.isnan(resDf.loc[tsId,'2016-06-15'].err):
        #    print 'Already done!'
        #    continue
        
        if tsId not in ['1272', '1377']:
            continue
    
        print 'Regions is ', tsId
        # получаем временной ряд
        ts = df.loc[startFit:endFit,tsId] #

        # обучаем регрессор
        ts = ts.to_frame(name = 'trip_count')
        [r_pr, res, regressor] = regression(addFeatures(ts),verbose = True, searchBestFit = True)

        # обучаем SARIMAX модель
        print 'Learn SARIMAX'
        try:
            mSARIMA=sm.tsa.statespace.SARIMAX(ts, order=[params[0]+1, 2, params[1]+1],
                                              seasonal_order=(params[2]+1, 1, params[3]+1, 24),
                                              exog = r_pr).fit(disp=1);
        except Exception as inst:
            print type(inst)     
            print inst          

        # получаем предсказания регрессора на весь диапазон дат (обучение+предсказание)
        exog = getRegressor(regressor,startFit,endPrediction)
        # получаем данные о поездкахы на весь диапазон дат
        endog = df.loc[startFit:endPrediction,tsId]
       
        # создаём новую модель, которую будет использовать для предсказания
        # Для чего такой финт ушами - не понимаю до сих пор
        try:
            model_fitted = sm.tsa.statespace.SARIMAX(endog, order=[params[0]+1, 2, params[1]+1],
                                                 seasonal_order=(params[2]+1, 1, params[3]+1, 24),
                                                 exog = exog).filter(mSARIMA.params)
        except Exception as inst:
            print 'Can not create the model'
            print inst
            continue
        else:    
            # проходим по всему диапазону дат предсказаний
            print 'Make prediction'
            for firstLag in predictionRange:
                lastLag = firstLag+datetime.timedelta(hours = 5)
                # prediction
                try:
                    predicted_data = model_fitted.predict(firstLag, lastLag, dynamic=True, exog = exog[firstLag:lastLag])
                except Exception as inst:
                    print 'Prediction error'
                    print inst
                else:
                    # save results
                    resDf.loc[tsId,firstLag].y = predicted_data
                    err += (df.loc[startPrediction:endPrediction,tsId]-predicted_data).abs().sum()
                    resDf.loc[tsId,firstLag].err = (df.loc[startPrediction:endPrediction,tsId]-predicted_data).abs().mean()

            # save results
            resDf.to_pickle('predictionResults5.pcl')
    
print 'Total error is', err    

Group ID is gr18
Group ID is gr19
Group ID is gr10
Group ID is gr11
Group ID is gr12
Group ID is gr13
Group ID is gr14
Group ID is gr15
Group ID is gr16
Group ID is gr17
Group ID is gr21
Group ID is gr20
Regions is  1272
Fitting 3 folds for each of 70 candidates, totalling 210 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   14.1s
[Parallel(n_jobs=4)]: Done 210 out of 210 | elapsed:   15.1s finished


Best params is {'alpha': 100.0, 'l1_ratio': 1}
R factor is  0.011156219673
Learn SARIMAX
Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Group ID is gr23
Regions is  1377
Fitting 3 folds for each of 70 candidates, totalling 210 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   11.2s
[Parallel(n_jobs=4)]: Done 210 out of 210 | elapsed:   12.1s finished


Best params is {'alpha': 100.0, 'l1_ratio': 1}
R factor is  0.0151016653811
Learn SARIMAX
<class 'numpy.linalg.linalg.LinAlgError'>
Singular matrix
Make prediction
Prediction error
Provided exogenous values are not of the appropriate shape. Required (1, 1), got (5L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (2, 1), got (4L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (3, 1), got (3L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (4, 1), got (2L,).
Prediction error
Provided exogenous values are not of the appropriate shape. Required (5, 1), got (1L,).
Group ID is gr22
Group ID is gr6
Group ID is gr7
Group ID is gr4
Group ID is gr2
Group ID is gr3
Group ID is gr0
Group ID is gr1
Group ID is gr8
Group ID is gr9
Total error is 220683.25


In [48]:
resDf = pd.read_pickle('predictionResults7.pcl')

In [3]:
def saveResults(df, fName):
    """
    Save dataframe df to file fName
    """
    f = open(fName, 'w')
    for ts in df.index.levels[0]:
        for lag in df.index.levels[1][6:-5]:
            for i in np.arange(6):
                try:
                    historyStart = lag - datetime.timedelta(hours = 1)
                    res = df.loc[ts,lag].y[i]
                    if res<0:
                        res = 0
                    s =  str(ts)+'_'+datetime.datetime.strftime(historyStart, "%Y-%m-%d") +'_'+ str(historyStart.hour)+'_'+str(i+1)+','+str(res)+'\n'
                except Exception as ins:
                    print lag, ts, i
                    print ins
                else:
                    f.write(s)

    f.close()

In [6]:
# load data
#resDf = pd.read_pickle('predictionResults6.pcl')
saveResults(resDf,'m9.csv')