In [1]:
%pylab inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import datetime
from sklearn.linear_model import Ridge
from IPython.display import display

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read the comma-separated, header-less file listing the significant region IDs.
significant_regions = pd.read_csv('SignificantRegions.txt', header=None, sep=',')
# The IDs sit in the first row; cast them to str because they are used below
# as column labels when selecting from the trip-count table.
reg_list = significant_regions.values[0].astype(str)
reg_list.shape

(102L,)

In [3]:
data_dir = "./trip_count"

# os.listdir() returns names in arbitrary order, so sort them to make the
# load order reproducible; the lag features built later depend on row order.
# NOTE(review): assumes the file names sort chronologically - confirm against
# the actual contents of data_dir.
# A real list is built (not a lazy filter object) so it can be iterated more
# than once on Python 3 as well as Python 2.
file_list = os.listdir(data_dir)
data_files = sorted(name for name in file_list if name.startswith('trip_count_'))

Загрузим все данные и оставим только колонки с данными 102 значимых районов. Индексом таблицы остаётся метка времени (дата и час) записи, прочитанная из первого столбца файлов.

In [4]:
# Read every trip-count file (its first CSV column becomes the index) and
# stack them with a single concat: growing a frame with DataFrame.append in a
# loop copies all accumulated rows on every iteration, and DataFrame.append
# no longer exists in pandas 2.x.
loaded_tables = [pd.read_csv(data_dir + '/' + fname, index_col=0) for fname in data_files]
region_data = pd.concat(loaded_tables)

# Keep only the columns of the significant regions.
region_data = region_data[reg_list]
region_data.head()

Unnamed: 0,1075,1076,1077,1125,1126,1127,1128,1129,1130,1131,...,1630,1684,1733,1734,1783,2068,2069,2118,2119,2168
2013-01-01 00:00:00,75,149,88,95,297,538,594,769,952,267,...,12,0,9,113,27,19,6,68,18,67
2013-01-01 01:00:00,108,204,81,126,402,572,518,623,655,220,...,22,0,3,7,5,11,3,82,1,15
2013-01-01 02:00:00,82,162,83,135,361,471,470,495,484,156,...,23,0,1,6,3,2,0,28,0,16
2013-01-01 03:00:00,79,122,24,106,289,482,448,467,337,102,...,25,0,1,1,7,2,0,6,0,1
2013-01-01 04:00:00,35,89,20,59,181,332,377,343,354,92,...,26,0,2,5,2,3,0,30,2,16


Составим обучающую выборку следующим образом:
- Сформируем таблицу синусов и косинусов с К=36. Это  самое большое К из использованных на прошлой неделе. Эти признаки будут одинаковыми для каждого района, т.к. зависят от номера измерения.
- Сформируем таблицу признаков 12 месяцев года, 31 дня месяца, 7 дней недели, 24 часов. Добавим также эти признаки в категориальном виде. Эти признаки будут одинаковыми для каждого района, т.к. зависят от даты и времени измерения.

Далее будем формировать таблицы с уникальными признаками для каждого района:
- Сформируем таблицу признаков y(T), y(T-1), ..., y(T-12), y(T-24), y(T-48).
- Сформируем признаки суммарного количества поездок за предыдущие 3, 6, 12, 24 и 48 часов.
- Сюда же добавим векторы правильных ответов y(T+1), y(T+2), ..., y(T+6).

In [5]:
def _add_indicator_columns(frame, prefix, values, levels):
    """Append one 0/1 column named ``prefix + str(level)`` per level, in order."""
    for level in levels:
        frame[prefix + str(level)] = (values == level).astype(np.int64)


def CreateFeatures(source, region):
    """Build the feature/target table for a single region.

    Parameters
    ----------
    source : pandas.DataFrame
        Trip counts, one column per region, one row per time step. The index
        must be convertible with ``pd.to_datetime``. The period constants
        below (24*7 and 24*365) assume one row per hour.
    region : str
        Label of the column of ``source`` to build features for.

    Returns
    -------
    pandas.DataFrame
        Indexed by timestamp, with columns in this order:

        * ``trend``;
        * ``ws<k>``, ``wc<k>``, ``ys<k>``, ``yc<k>`` for k = 0..36 (weekly and
          yearly sine/cosine harmonics);
        * 0/1 indicators ``m1..m12``, ``dom1..dom31``, ``dow1..dow7``
          (Monday = 1 ... Sunday = 7) and ``h0..h23``;
        * the same calendar values as numeric columns ``m``, ``dom``, ``dow``, ``h``;
        * ``y_t``, lags ``y_t-1..y_t-12``, ``y_t-24``, ``y_t-48``;
        * ``sum_3h``, ``sum_6h``, ``sum_12h``, ``sum_24h``, ``sum_48h``
          (sums over the preceding hours, current hour excluded);
        * targets ``y_t+1..y_t+6`` as the last six columns (not normalized).

        Rows with missing lags/targets (start and end of the sample) are
        dropped. Every column except the indicators and the targets is
        standardized to zero mean / unit standard deviation.
    """
    hinw = 24*7      # hours in a week
    hiny = 24*365    # hours in a (non-leap) year
    n_obj = source.shape[0]

    # names of the columns that have to be standardized at the end
    to_normalize = []

    stamps = pd.to_datetime(source.index)
    common_features = pd.DataFrame(index=stamps)
    steps = np.arange(n_obj)
    common_features['trend'] = steps
    to_normalize.append('trend')

    # sines and cosines
    # NOTE(review): for k = 0 all four columns are constant, so after
    # standardization (0/0 -> NaN -> fillna(0)) they are all-zero columns.
    # They are kept so the column layout stays unchanged.
    for k in range(37):
        suffix = str(k)
        # weekly seasonality
        common_features['ws' + suffix] = np.sin(steps*(2.0*np.pi*k/hinw))
        common_features['wc' + suffix] = np.cos(steps*(2.0*np.pi*k/hinw))
        # yearly seasonality
        common_features['ys' + suffix] = np.sin(steps*(2.0*np.pi*k/hiny))
        common_features['yc' + suffix] = np.cos(steps*(2.0*np.pi*k/hiny))
        to_normalize.extend(['ws' + suffix, 'wc' + suffix, 'ys' + suffix, 'yc' + suffix])

    # Calendar components, taken from the index in one vectorized call each
    # instead of a Python-level loop over every timestamp for every column.
    months = np.asarray(stamps.month)
    days = np.asarray(stamps.day)
    weekdays = np.asarray(stamps.dayofweek)   # pandas convention: Monday=0 ... Sunday=6
    hours = np.asarray(stamps.hour)

    _add_indicator_columns(common_features, 'm', months, range(1, 13))
    _add_indicator_columns(common_features, 'dom', days, range(1, 32))
    # dayofweek is 0-based while the dummies are named dow1..dow7, hence the +1;
    # comparing the raw 0..6 value with 1..7 left dow7 always zero and gave
    # Monday no indicator at all.
    _add_indicator_columns(common_features, 'dow', weekdays + 1, range(1, 8))
    _add_indicator_columns(common_features, 'h', hours, range(0, 24))

    # the same calendar features as plain numeric columns
    common_features['m'] = months
    common_features['dom'] = days
    common_features['dow'] = weekdays
    common_features['h'] = hours
    to_normalize.extend(['m', 'dom', 'dow', 'h'])

    # Re-index the region's counts by the parsed timestamps (positional, not
    # label-based), so the result does not depend on how pandas aligns the
    # raw index labels of `source` with a DatetimeIndex.
    counts = pd.Series(np.asarray(source[region]), index=stamps)
    region_features = pd.DataFrame(index=stamps)

    # value in the current hour
    region_features['y_t'] = counts
    to_normalize.append('y_t')

    # values in the 12 preceding hours
    for i in range(1, 13):
        region_features['y_t-' + str(i)] = counts.shift(i)
        to_normalize.append('y_t-' + str(i))

    # values one and two days ago
    for i in range(1, 3):
        region_features['y_t-' + str(24*i)] = counts.shift(24*i)
        to_normalize.append('y_t-' + str(24*i))

    # trip totals over the previous 3, 6, 12, 24, 48 hours (current hour excluded)
    previous = counts.shift(1)
    for window in (3, 6, 12, 24, 48):
        name = 'sum_' + str(window) + 'h'
        region_features[name] = previous.rolling(window, min_periods=window).sum()
        to_normalize.append(name)

    # targets: values 1..6 hours ahead
    for i in range(1, 7):
        region_features['y_t+' + str(i)] = counts.shift(-1*i)

    all_features = common_features.join(region_features)

    # drop the rows at the start/end of the sample that lack lags or targets
    all_features.dropna(inplace = True)

    # standardize
    # NOTE(review): mean/std are computed over every remaining row, i.e. also
    # over rows that callers later slice off as validation/forecast periods.
    scaled = all_features[to_normalize]
    all_features[to_normalize] = (scaled - scaled.mean())/scaled.std()
    # constant columns have zero std and become NaN above; zero them out
    all_features.fillna(0, inplace=True)

    return all_features

In [6]:
def PrepareData(f, start, end):
    """Slice a feature table by time and split it into features and targets.

    Parameters
    ----------
    f : pandas.DataFrame
        Table produced by CreateFeatures (datetime index, target columns
        'y_t+1' .. 'y_t+6').
    start, end : str or datetime-like
        Bounds of the label-based row slice; both ends are included.

    Returns
    -------
    X : numpy.ndarray
        All non-target columns of the selected rows.
    y : list of numpy.ndarray
        Six 1-D arrays, y[i] holding the 'y_t+<i+1>' column.
    """
    target_columns = ['y_t+' + str(h) for h in range(1, 7)]
    window = f.loc[pd.to_datetime(start):pd.to_datetime(end)]

    # targets, one array per forecast horizon
    y = [window[name].values for name in target_columns]

    # features; .values replaces DataFrame.as_matrix(), which no longer
    # exists in pandas >= 1.0
    X = window.drop(target_columns, axis=1).values
    return X, y

In [7]:
def PrintCoefs(regressors_list, features_list):
    """Tabulate and display the coefficients of several fitted linear models.

    Parameters
    ----------
    regressors_list : sequence
        Fitted models exposing a 1-D ``coef_`` array of equal length.
    features_list : sequence
        Feature names, one per coefficient; used as the row index.

    Returns
    -------
    pandas.DataFrame
        One row per feature, one column ``reg_<i>`` per model (i is the
        position of the model in ``regressors_list``).
    """
    n_models = len(regressors_list)
    col_names = ['reg_' + str(i) for i in range(n_models)]

    data = np.zeros([regressors_list[0].coef_.shape[0], n_models])
    for i, regressor in enumerate(regressors_list):
        data[:, i] = regressor.coef_

    Coefs = pd.DataFrame(index = features_list, data = data, columns = col_names)

    # Show every row, but only for this one display call: option_context
    # restores whatever 'display.max_rows' was before, instead of forcing it
    # to a hard-coded value afterwards.
    with pd.option_context('display.max_rows', Coefs.shape[0]+1):
        display(Coefs)

    return Coefs

In [8]:
def PostProcValue(val):
    """Turn a raw forecast into a trip count.

    Returns 0 for anything that is not strictly positive (this includes NaN,
    because ``NaN > 0`` is False); otherwise the value rounded to the nearest
    integer, as an int.
    """
    if not (val > 0):
        return int(0)
    return int(round(val, 0))

In [9]:
# Build the feature table for one region and look at its first rows.
features = CreateFeatures(region_data, region='1075')
features.head()

Unnamed: 0,trend,ws0,wc0,ys0,yc0,ws1,wc1,ys1,yc1,ws2,...,sum_6h,sum_12h,sum_24h,sum_48h,y_t+1,y_t+2,y_t+3,y_t+4,y_t+5,y_t+6
2013-01-03 00:00:00,-1.731966,0.0,0.0,0.0,0.0,1.377859,-0.314306,-0.080756,1.416221,-0.612839,...,-0.403007,0.600597,-0.817118,-0.814079,8.0,2.0,3.0,4.0,6.0,18.0
2013-01-03 01:00:00,-1.731853,0.0,0.0,0.0,0.0,1.36513,-0.365645,-0.079735,1.416186,-0.706329,...,-0.700121,0.274334,-0.809947,-0.940587,2.0,3.0,4.0,6.0,18.0,38.0
2013-01-03 02:00:00,-1.731739,0.0,0.0,0.0,0.0,1.350491,-0.416472,-0.078713,1.41615,-0.795865,...,-0.992519,-0.09108,-0.792022,-1.141393,3.0,4.0,6.0,18.0,38.0,74.0
2013-01-03 03:00:00,-1.731626,0.0,0.0,0.0,0.0,1.333962,-0.466716,-0.077692,1.416113,-0.880946,...,-1.270768,-0.417343,-0.788437,-1.302038,4.0,6.0,18.0,38.0,74.0,71.0
2013-01-03 04:00:00,-1.731513,0.0,0.0,0.0,0.0,1.315566,-0.516306,-0.076671,1.416076,-0.961097,...,-1.492425,-0.870849,-0.784852,-1.454651,6.0,18.0,38.0,74.0,71.0,56.0


In [14]:
%%time
#сформируем обучающую выборку и обучим 6 моделей Ridge regression
X_train, y_train = PrepareData(features, '2013-01-03 00:00:00', '2016-04-30 22:00:00')

reg_1h  = Ridge(alpha = 0.1, random_state = 30)
reg_2h  = Ridge(alpha = 0.1, random_state = 30)
reg_3h  = Ridge(alpha = 0.1, random_state = 30)
reg_4h  = Ridge(alpha = 0.1, random_state = 30)
reg_5h  = Ridge(alpha = 0.1, random_state = 30)
reg_6h  = Ridge(alpha = 0.1, random_state = 30)

reg_1h.fit(X_train, y_train[0])
reg_2h.fit(X_train, y_train[1])
reg_3h.fit(X_train, y_train[2])
reg_4h.fit(X_train, y_train[3])
reg_5h.fit(X_train, y_train[4])
reg_6h.fit(X_train, y_train[5])

Wall time: 1.02 s


In [15]:
# Inspect the coefficients of the six fitted models side by side.
# The last six columns of `features` are the targets, so they are excluded
# from the list of feature names.
fitted_models = [reg_1h, reg_2h, reg_3h, reg_4h, reg_5h, reg_6h]
feature_names = features.columns[:-6]
Coefs = PrintCoefs(fitted_models, feature_names)

Unnamed: 0,reg_0,reg_1,reg_2,reg_3,reg_4,reg_5
trend,-0.183026,-0.274141,-0.351358,-0.418079,-0.472844,-0.516485
ws0,0.0,0.0,0.0,0.0,0.0,0.0
wc0,0.0,0.0,0.0,0.0,0.0,0.0
ys0,0.0,0.0,0.0,0.0,0.0,0.0
yc0,0.0,0.0,0.0,0.0,0.0,0.0
ws1,0.981641,6.039559,3.665459,-0.63355,2.619983,10.947115
wc1,-0.641815,-6.256411,-5.644203,3.401859,1.374307,-7.871079
ys1,-0.459383,1.061055,2.290813,3.331681,3.891032,3.96902
yc1,-0.832923,-0.386564,-0.12709,0.063109,0.287588,0.382414
ws2,0.46525,0.380252,0.426526,0.122567,0.213837,0.316151


In [16]:
# Features whose 1-hour-model coefficient exceeds 3 in absolute value, largest first.
Coefs.loc[Coefs['reg_0'].abs() > 3].sort_values(by='reg_0', ascending=False)

Unnamed: 0,reg_0,reg_1,reg_2,reg_3,reg_4,reg_5
y_t,20.871383,15.64811,13.605926,10.430706,6.284619,4.383408
h7,7.922943,0.07321,-4.095285,4.690965,2.002444,-4.808734
h10,6.913152,3.743163,-3.137508,2.531281,5.83562,-7.037767
sum_24h,6.041749,10.131751,13.336665,16.100631,17.44862,17.146228
h20,5.88881,0.369892,-0.053228,1.418606,0.391162,-1.453564
h14,4.902248,-6.915329,1.136917,4.231825,-1.733319,-1.733951
h16,4.851612,6.569302,-0.047626,-0.91393,4.631806,-1.843553
m6,4.548036,7.525796,9.89695,11.643549,13.045379,14.310823
h13,4.28506,7.515776,-5.153811,2.016303,4.207407,-2.492259
h17,4.132109,-1.294899,-1.356785,4.833269,-0.881824,-1.423586


Видно, что для разных моделей важны разные признаки. Для отбора важных признаков в каждой модели можно обучить Lasso regression.
Посмотрим, какие результаты покажет это решение.

### Проверка на данных за май

In [18]:
%%time

f_start = pd.to_datetime('2016-04-30 22:00:00')
f_end = pd.to_datetime('2016-05-31 17:00:00')
forecast = pd.DataFrame(index = features[f_start:f_end].index)
true_data = pd.DataFrame(index = features[f_start:f_end].index)

for region in region_data.columns:
    
    features = CreateFeatures(region_data, region)
    
    #обучаем модели на данных до мая на всех районах
    X_train, y_train = PrepareData(features, '2013-01-03 00:00:00', '2016-04-30 22:00:00')
    
    reg_1h.fit(X_train, y_train[0])
    reg_2h.fit(X_train, y_train[1])
    reg_3h.fit(X_train, y_train[2])
    reg_4h.fit(X_train, y_train[3])
    reg_5h.fit(X_train, y_train[4])
    reg_6h.fit(X_train, y_train[5])
    
    #строим прогноз на май
    X_test, y_test = PrepareData(features, f_start, f_end)
    
    forecast[region + '_1h'] = reg_1h.predict(X_test)
    forecast[region + '_2h'] = reg_2h.predict(X_test)
    forecast[region + '_3h'] = reg_3h.predict(X_test)
    forecast[region + '_4h'] = reg_4h.predict(X_test)
    forecast[region + '_5h'] = reg_5h.predict(X_test)
    forecast[region + '_6h'] = reg_6h.predict(X_test)
    
    true_data[region + '_1h'] = y_test[0]
    true_data[region + '_2h'] = y_test[1]
    true_data[region + '_3h'] = y_test[2]
    true_data[region + '_4h'] = y_test[3]
    true_data[region + '_5h'] = y_test[4]
    true_data[region + '_6h'] = y_test[5]

Wall time: 12min 36s


In [19]:
# First two rows of the raw (not yet rounded) forecasts.
forecast.iloc[:2]

Unnamed: 0,1075_1h,1075_2h,1075_3h,1075_4h,1075_5h,1075_6h,1076_1h,1076_2h,1076_3h,1076_4h,...,2119_3h,2119_4h,2119_5h,2119_6h,2168_1h,2168_2h,2168_3h,2168_4h,2168_5h,2168_6h
2016-04-30 22:00:00,90.990066,77.588056,54.388438,31.107135,17.540148,15.349849,110.199572,103.775044,70.361819,39.152084,...,-0.980474,-9.663606,-1.643481,-2.030068,69.612705,50.722704,30.112379,20.999158,16.822271,17.067377
2016-04-30 23:00:00,76.12129,51.554428,32.132877,23.886451,20.135193,15.989254,105.707954,68.800627,43.158065,35.545597,...,-5.181007,0.176041,0.835997,2.36783,42.862075,28.526006,21.555665,18.048004,23.431405,69.343729


In [20]:
# First two rows of the true values, for comparison with the forecasts.
true_data.iloc[:2]

Unnamed: 0,1075_1h,1075_2h,1075_3h,1075_4h,1075_5h,1075_6h,1076_1h,1076_2h,1076_3h,1076_4h,...,2119_3h,2119_4h,2119_5h,2119_6h,2168_1h,2168_2h,2168_3h,2168_4h,2168_5h,2168_6h
2016-04-30 22:00:00,92.0,71.0,37.0,10.0,14.0,6.0,118.0,64.0,39.0,46.0,...,0.0,0.0,0.0,0.0,39.0,85.0,35.0,0.0,0.0,1.0
2016-04-30 23:00:00,71.0,37.0,10.0,14.0,6.0,8.0,64.0,39.0,46.0,25.0,...,0.0,0.0,0.0,3.0,85.0,35.0,0.0,0.0,1.0,57.0


In [21]:
# Clip negative forecasts to zero and round to integers.
# Series.map is used instead of the built-in map(): the latter returns a lazy
# iterator on Python 3, which cannot be assigned as a column.
for cl in forecast.columns:
    forecast[cl] = forecast[cl].map(PostProcValue)

# Q: mean absolute error over every (time, region, horizon) cell.
# The divisor is the actual number of cells rather than a hard-coded
# 102*739*6, so Q stays correct whatever the number of regions or rows.
abs_error = (forecast - true_data).abs()
Q = abs_error.sum().sum() / float(abs_error.size)
print(Q)

20.7265249808


Оценка гораздо лучше, чем 42.9 на прошлой неделе, хотя для этого пришлось обучить 102*6 = 612 моделей. 
Построим прогноз для соревнования.

### Прогноз для kaggle на данных за июнь

In [22]:
%%time
f_start = pd.to_datetime('2016-05-31 23:00:00')
f_end = pd.to_datetime('2016-06-30 17:00:00')
forecast = pd.DataFrame(index = features[f_start:f_end].index)
true_data = pd.DataFrame(index = features[f_start:f_end].index)

for region in region_data.columns:
    
    features = CreateFeatures(region_data, region)
    
    X_train, y_train = PrepareData(features, '2013-01-03 00:00:00', '2016-05-31 22:00:00')
    
    reg_1h.fit(X_train, y_train[0])
    reg_2h.fit(X_train, y_train[1])
    reg_3h.fit(X_train, y_train[2])
    reg_4h.fit(X_train, y_train[3])
    reg_5h.fit(X_train, y_train[4])
    reg_6h.fit(X_train, y_train[5])    
    
    X_test, y_test = PrepareData(features, f_start, f_end)
    
    forecast[region + '_1h'] = reg_1h.predict(X_test)
    forecast[region + '_2h'] = reg_2h.predict(X_test)
    forecast[region + '_3h'] = reg_3h.predict(X_test)
    forecast[region + '_4h'] = reg_4h.predict(X_test)
    forecast[region + '_5h'] = reg_5h.predict(X_test)
    forecast[region + '_6h'] = reg_6h.predict(X_test)    

for cl in forecast.columns:
    forecast[cl] = map(PostProcValue, forecast[cl])

Wall time: 12min 36s


In [23]:
# Flatten the forecast table into the submission format: one row per
# (timestamp, region, horizon) with id '<region>_<date>_<hour>_<horizon>'.
# NOTE(review): the ids are generated independently of forecast.columns; they
# match the values only because the forecast columns were added region by
# region (in region_data.columns order) with horizons 1..6 inside each region.
ids = [
    region + '_' + str(time.date()) + '_' + str(time.hour) + '_' + str(x)
    for time in forecast.index
    for region in region_data.columns
    for x in range(1, 7)
]

# Row-major flattening yields the values time by time, column by column -
# the same order as the ids - in one pass instead of re-allocating the whole
# array with np.hstack for every timestamp.
vals = forecast.values.ravel().astype(int)

submission = pd.DataFrame(index = ids, columns = ['y'], data = vals)
submission.to_csv('Week5_submission.csv', index_label = 'id')

Ссылка на сабмишн: https://inclass.kaggle.com/c/yellowtaxi/leaderboard?submissionId=5001924

![title](Kaggle%20sub%204.png)