In [1]:
%pylab inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import itertools

import csv
import time

from xgboost import XGBRegressor
from matplotlib import pyplot
from xgboost import plot_importance

from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

from bayes_opt import BayesianOptimization
import xgboost as xgb
from sklearn.cross_validation import cross_val_score

from pandas.tseries.holiday import USFederalHolidayCalendar
import json
import os

import warnings
warnings.filterwarnings("ignore")

##### first of all as new additional feature I am going to use dummy variable of federal US holidays

In [2]:
# One indicator column per US federal holiday name, indexed by the holiday date.
cal = USFederalHolidayCalendar()
holiday_names = cal.holidays(
    start='2014-05-01T00:00:00.000000000',
    end='2016-06-30T00:00:00.000000000',
    return_name=True,
)
holiday = pd.get_dummies(holiday_names)
holiday

In [3]:
# Paths of the pickled inputs: the training matrix and the matrix used to verify forecasts.
# NOTE(review): file names suggest 2 years of data and May-June 2016 respectively - confirm.
train_file = "./data/reg_bin_stat_2years.pkl"
verify_file = "./data/verif_bin_stat_5_6_2016.pkl"

In [4]:
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file;
# only load these files if they come from a trusted source.
with open(train_file, "rb") as f:
    data_2year = pickle.load(f)

with open(verify_file, "rb") as f:
    data_2month = pickle.load(f)  

In [5]:
# Build the hourly frames. After transposing, row 0 of each raw matrix holds the
# region identifiers (used as column labels) and the remaining rows hold the values.
# NOTE: `time` rebinds the name of the `time` module imported at the top of the notebook.
# NOTE(review): `columns=[array]` passes a one-element list of arrays - presumably this
# yields a single-level MultiIndex, so column keys would be 1-tuples; confirm.
time = pd.date_range('2014 May 1 00:00:00', periods = data_2year.shape[1] - 1, freq = 'h')
taxi = pd.DataFrame(data_2year.T[1:, :], index = time, columns=[data_2year.T[0, :].astype(int)])

# Training frame extended with all-None rows for the hours that will be forecast.
prediction_time = pd.date_range('2016 May 1 00:00:00', '2016 June 30 23:00:00', freq = 'h')
taxi_prediction = pd.concat([taxi, pd.DataFrame(np.array([[None]*taxi.shape[1]]*prediction_time.shape[0]), 
                                                index = prediction_time, columns=[data_2year.T[0, :].astype(int)])], axis=0)

# Observed values for the forecast period, used only to score predictions.
verify_time = pd.date_range('2016 May 1 00:00:00', periods = data_2month.shape[1] - 1, freq = 'h')
verify_taxi = pd.DataFrame(data_2month.T[1:, :], index = verify_time, columns=[data_2month.T[0, :].astype(int)])

In [6]:
# Sanity check of how taxi timestamps compare with holiday dates:
# row 25*24 is midnight 25 days after the start of the taxi index.
print(taxi.index.values[25*24+5]) 
print(holiday.index.values[0])
print(taxi.index.values[25*24] == holiday.index.values[0])

##### create mapping between number of columns and number of region

In [6]:
# Column position (0..101) -> region label taken from the taxi frame's columns.
region_map = {position: label for position, label in zip(range(102), taxi.columns.values)}

In [6]:
taxi.head()

In [7]:
taxi_prediction.tail()

In [8]:
verify_taxi.head()

##### get weather data like in previous week 

#### Events:  0: nothing, 1: snow, 2: fog snow, 3: rain, 4: fog, 5: fog rain, 6: rain snow, 7: rain thunderstorm, 8: rain snow thunderstorm", 9: thunderstorm

In [7]:
# Load the hourly and daily weather tables, keeping only the listed columns.
# NOTE(review): `daily_weather` is not referenced anywhere else in the visible notebook.
hour_weather = pd.read_csv("./data/NY_hour_weather.csv", usecols=["time", "daily_temp", "windchill", "humidity", "events"], encoding ="cp1252")
daily_weather = pd.read_csv("./data/NY_daily_weather.csv", usecols=["daily_temp"], encoding ="cp1252")

In [8]:
# Encode event labels as integers 0..9 in order of first appearance in the data.
# NOTE(review): this order is data-dependent and need not match the code table in the
# heading above; it also assumes exactly 10 distinct values - confirm.
hour_weather.events = hour_weather.events.replace(hour_weather.events.unique(), range(10))
# Keep the first two characters of each humidity value (the result is still a string).
# NOTE(review): a value such as "100%" would become "10" - confirm the raw format.
hour_weather.humidity = hour_weather.humidity.map(lambda x: x[:2])
# Truncate timestamps to the hour, then keep a single row per hour and index by time.
hour_weather.time = pd.DatetimeIndex(hour_weather.time).map(lambda x: x.replace(minute=0, second=0))
hour_weather.drop_duplicates(subset=["time"], inplace=True)
hour_weather.set_index('time', inplace=True)
hour_weather.shape

""" from [2877:20386] """
# Positional row ranges: training-period weather, then the following 1464 rows
# (= 61 days x 24 h) for the forecast period.
# NOTE(review): the offsets are hard-coded and presumably align with the taxi index - confirm.
test_weather = hour_weather.iloc[2877:20386]
print(test_weather.tail())
prediction_weather = hour_weather.iloc[20386:21850]
print(prediction_weather.tail())

In [23]:
# Plot the whole training series for the region stored at column position 10.
plt.figure(figsize=(150,5))
plt.plot(taxi.index.values, taxi[region_map[10]].values, color="red", label='real value')
#double click on image makes it larger

##### some useful functions for feature creation

In [9]:
def shift(data, shift_period, column_name, external_data=pd.Series([])):
    """Write a lagged series into ``data[column_name]`` (in place).

    The lagged source is ``external_data`` when it is non-empty, otherwise the
    frame's own ``"taxi_call_num"`` column. Returns None.
    """
    source = data["taxi_call_num"] if external_data.empty else external_data
    data[column_name] = source.shift(periods=shift_period)

In [10]:
def cumsum(data, min_period, window, column_name, external_data=pd.Series([])):
    """Write a trailing rolling sum into ``data[column_name]`` (in place).

    The source series (``external_data`` when non-empty, otherwise the frame's
    ``"taxi_call_num"`` column) is first shifted by one row, so the value at a
    row only sums rows strictly before it. Returns None.
    """
    source = data["taxi_call_num"] if external_data.empty else external_data
    lagged = source.shift(1)
    data[column_name] = lagged.rolling(min_periods=min_period, window=window).sum()

In [11]:
def binary_combinations(data, fourier_shape):
    """Append pairwise product columns to ``data`` (in place).

    Every unordered pair of columns located at position ``fourier_shape`` or
    later is multiplied element-wise; the product is stored under the name
    ``"<left>_<right>"``. Returns None.
    """
    base_columns = data.columns[fourier_shape:].values
    for left, right in itertools.combinations(base_columns, 2):
        product_name = left + "_" + right
        data[product_name] = data[left] * data[right]

##### The two functions below add shifted and cumulative-sum values of past taxi calls from neighbouring regions (provided they are among the 102 considered regions), and add feature combinations whose form depends on the "type of interaction" function passed as a parameter of bivar_nonlinear

##### These two functions are not used in the current notebook. Using neighbour features implies sequential hourly prediction across regions, since predicting hour t for region N requires the previously predicted taxi call values at hour t-1 for the neighbouring regions N-1, N+1, N+50, N-50, etc.

In [135]:
def add_neighbore(data, reg_num, shift_dict, cumsum_dict):
    """Add lag and rolling-sum features taken from neighbouring regions (in place).

    Parameters
    ----------
    data : DataFrame
        Feature table to extend; new columns are named ``str(neighbour) + name``.
    reg_num : int
        Key into the module-level ``region_map``.
    shift_dict : dict or iterable of (name, period)
        Lag specifications forwarded to ``shift``.
    cumsum_dict : dict or iterable of (name, (min_period, window))
        Rolling-sum specifications forwarded to ``cumsum``.

    Neighbours are the region label offset by -51, -50, -49, -1, 1, 49, 50, 51;
    labels that are not columns of the module-level ``taxi`` frame are skipped.
    """
    def _pairs(spec):
        # Iterating a dict directly yields keys only, which breaks the
        # (name, value) unpacking below; accept dicts and pair sequences alike.
        # Materialised so the specs can be reused for every neighbour.
        return list(spec.items()) if isinstance(spec, dict) else list(spec)

    shift_specs = _pairs(shift_dict)
    cumsum_specs = _pairs(cumsum_dict)
    neighbour_offsets = np.array([-51, -50, -49, -1, 1, 49, 50, 51])

    for nghb in region_map[reg_num] + neighbour_offsets:
        if nghb in taxi.columns.values:
            external_data = taxi.loc[:, nghb]
            for name, value in shift_specs:
                shift(data, shift_period=value, column_name=(str(nghb) + name), external_data=external_data)
            for name, value in cumsum_specs:
                cumsum(data, min_period=value[0], window=value[1], column_name=(str(nghb) + name), external_data=external_data)

In [136]:
def bivar_nonlinear(data, f, start_feature, end_feature=None):
    """Append pairwise interaction columns computed with ``f`` (in place).

    Parameters
    ----------
    data : DataFrame
        Table to extend.
    f : callable
        Two-argument function applied to each pair of columns; its ``__name__``
        prefixes the new column name (``f.__name__ + left + "_" + right``).
    start_feature : int
        Position of the first column taking part in the interactions.
    end_feature : int or None
        Position one past the last participating column; None means "to the end".

    The previous body ignored both positional bounds and read an undefined
    name ``fourier_shape``; the bounds are now honoured.
    """
    base_columns = data.columns[start_feature:end_feature].values
    for left, right in itertools.combinations(base_columns, 2):
        data[f.__name__ + left + "_" + right] = f(data[left], data[right])

##### add some new features not used in last week's task: cumulative sums over the last half year and year, the number of taxi calls 7, 28 and 365 days before, and a US federal holiday dummy variable

In [12]:
def feature_constructor(region, regres, dataframe=taxi, weather=test_weather):
    """Build the feature table for a single region.

    Parameters
    ----------
    region : int
        Positional index of the region's column in ``dataframe``.
    regres : iterable of (period, component_num)
        For each entry, ``component_num`` sine/cosine harmonics with base
        period ``period`` (in rows) are generated.
    dataframe : DataFrame
        Series per region, indexed by a DatetimeIndex.
    weather : DataFrame
        Weather table; its columns at positions 0, 1 and 3 are joined on the index.

    Returns
    -------
    (data, fourier_shape)
        ``data`` has the target ``taxi_call_num`` in column 0, followed by the
        Fourier terms, lags, rolling sums, calendar fields, their pairwise
        products, weekend/holiday flags and weather columns. Missing values are
        replaced with 0.1. ``fourier_shape`` is the number of columns before the
        first lag column (target + Fourier terms).
    """
    data = pd.DataFrame(dataframe.iloc[:, region].values, index=dataframe.index, columns=["taxi_call_num"])
    time = np.arange(0, dataframe.shape[0])

    # Fourier components for each requested (period, number of harmonics) pair.
    for component in regres:
        period, component_num = component[0], component[1]
        for w in range(1, component_num + 1):
            data["sin_w_%d_%d" % (period, w)] = np.sin(2*np.pi*w*time/period)
            data["cos_w_%d_%d" % (period, w)] = np.cos(2*np.pi*w*time/period)
    data = data.round(3)
    fourier_shape = data.shape[1]

    # (column name, lag in rows)
    d = [("d_before", 24), ("two_d_before", 48), ("three_d_before", 72), ("four_d_before", 96), ("five_d_before", 120),
         ("six_d_before", 144), ("one_h_before", 1), ("two_h_before", 2), ("three_h_before", 3), ("four_h_before", 4),
         ("five_h_before", 5), ("six_h_before", 6), ("seven_h_before", 7), ("eight_h_before", 8), ("nine_h_before", 9),
         ("ten_h_before", 10), ("eleven_h_before", 11), ("twelve_h_before", 12),
         ("year_before", 8760), ("four_w_before", 672), ("week_before", 168)]

    # (column name, (min_periods, window in rows))
    dd = [("half_d_cumsum", (1, 12)), ("d_cumsum", (1, 24)), ("week_cumsum", (1, 168)), ("four_w_cumsum", (1, 168*4)),
          ("half_y_cumsum", (1, 4380)), ("year_cumsum", (1, 8760))]

    for name, value in d:
        shift(data, shift_period=value, column_name=name)

    for name, value in dd:
        cumsum(data, min_period=value[0], window=value[1], column_name=name)

    data["hour"] = data.index.hour
    data["week_day"] = data.index.dayofweek
    data["day"] = data.index.day
    data["month"] = data.index.month

    #add_neighbore(data, reg_num=region, shift_dict=d, cumsum_dict=dd)
    # Pairwise products of every column added after the Fourier block.
    binary_combinations(data, fourier_shape)

    # Saturday (5) / Sunday (6). The previous `date.weekday == 5` compared the bound
    # method itself with an int, so the flag was 0 for every row.
    data["weekend"] = (data.index.dayofweek >= 5).astype(int)
    # 1 for every hour whose calendar date appears in the holiday index.
    data["federal_holiday"] = data.index.normalize().isin(holiday.index).astype(int)

    data = pd.concat([data, weather.iloc[:, [0, 1, 3]]], axis=1)
    data = data.fillna(0.1)

    return data, fourier_shape

In [13]:
def predict(region, model, end_t_prediction, features):
    """Forecast one region hour by hour, feeding each prediction back as input.

    Parameters
    ----------
    region : int
        Positional index of the region column (forwarded to ``feature_constructor``).
    model : fitted regressor
        Object exposing ``predict`` on a 2-D array.
    end_t_prediction : timestamp-like
        Last hour to forecast; forecasting starts at '2016 May 1 00:00:00'.
    features : list of str
        Column names passed to the model, in training order.

    Returns
    -------
    DataFrame
        The feature table up to ``end_t_prediction`` with the forecast hours of
        ``taxi_call_num`` replaced by model predictions.

    Uses the module-level ``taxi_prediction``, ``test_weather`` and
    ``prediction_weather`` frames.
    """
    # Number of non-interaction columns following the Fourier block in
    # feature_constructor: 21 lags + 6 rolling sums + 4 calendar fields.
    feature_mixed = 31

    data, fourier_shape = feature_constructor(region=region, regres=((168, 70), (8760, 20)), dataframe=taxi_prediction,
                               weather=pd.concat([test_weather, prediction_weather], axis=0))
    # Explicit copy: the loop writes cell by cell and must not target a view of the full table.
    data = data.loc[:end_t_prediction].copy()
    # The forecast rows of taxi_prediction are padded with None, so this column may
    # arrive as object dtype; make it numeric before rolling sums are recomputed.
    data["taxi_call_num"] = data["taxi_call_num"].astype(float)

    # Loop-invariant specifications (same values as in feature_constructor).
    lags = [("d_before", 24), ("two_d_before", 48), ("three_d_before", 72), ("four_d_before", 96), ("five_d_before", 120),
            ("six_d_before", 144), ("one_h_before", 1), ("two_h_before", 2), ("three_h_before", 3), ("four_h_before", 4),
            ("five_h_before", 5), ("six_h_before", 6), ("seven_h_before", 7), ("eight_h_before", 8), ("nine_h_before", 9),
            ("ten_h_before", 10), ("eleven_h_before", 11), ("twelve_h_before", 12),
            ("year_before", 8760), ("four_w_before", 672), ("week_before", 168)]
    windows = [("half_d_cumsum", (1, 12)), ("d_cumsum", (1, 24)), ("week_cumsum", (1, 168)), ("four_w_cumsum", (1, 168*4)),
               ("half_y_cumsum", (1, 4380)), ("year_cumsum", (1, 8760))]
    # Only the single (non-product) columns take part in the interactions.
    single_columns = data.columns[fourier_shape : fourier_shape + feature_mixed].values
    interaction_pairs = list(itertools.combinations(single_columns, 2))

    for h in pd.date_range('2016 May 1 00:00:00', end_t_prediction, freq='h'):
        # Refresh lag features for this hour from already known / predicted values.
        for name, value in lags:
            data.loc[h, name] = data.loc[h - pd.Timedelta(hours=value), 'taxi_call_num']

        # Rolling sums are recomputed over the whole column so they include
        # the predictions written in earlier iterations.
        for name, value in windows:
            data[name] = data["taxi_call_num"].shift(1).rolling(min_periods=value[0], window=value[1]).sum()

        for left, right in interaction_pairs:
            data.loc[h, left + "_" + right] = data.loc[h, left] * data.loc[h, right]

        row = data.loc[h, features].values.astype(float).reshape(1, -1)
        # model.predict returns a length-1 array; store the scalar.
        data.loc[h, 'taxi_call_num'] = float(model.predict(row)[0])

    return data

##### try on one region using XGBRegressor with default parameters and plot a histogram of the features with their significance

In [89]:
# Baseline: default-parameter XGBRegressor fitted on the region at column position 11.
mod = XGBRegressor()
eleven, _ = feature_constructor(region=11, regres=((168, 70), (8760, 20)))
# Column 0 is the target (taxi_call_num); every remaining column is used as a feature.
mod.fit(eleven.iloc[:, 1:], eleven.taxi_call_num.values)

In [90]:
# Plot the importance of every feature (including pairwise interactions) for the
# region at column position 11.
names = eleven.columns
# Column 0 is the target, so the fitted model has one importance per remaining column.
feature_names = names[1:]

pyplot.figure(figsize=(eleven.shape[1]//4, 7))
pyplot.bar(range(len(mod.feature_importances_)), mod.feature_importances_)
# Tick positions and labels must have equal length (previously positions had one extra entry).
pyplot.xticks(np.arange(len(feature_names)), feature_names, rotation=90)
# Features 0..179 are the 2 * (70 + 20) Fourier terms; the dashed line marks where they end.
pyplot.axvline(180, color='k', linestyle='dashed', linewidth=1)
pyplot.grid()
pyplot.show()
#double click on image makes it larger

In [49]:
names.shape

##### select features by the r-2 value xgboost, write them into file

In [68]:
def find_best_features(regions, threshold=0.95):
    """Select the most important features per region with a default XGBRegressor.

    For every region a model is fitted on all features; features with non-zero
    importance are ranked in descending order and the shortest prefix whose
    cumulative importance exceeds ``threshold`` is kept.

    Parameters
    ----------
    regions : iterable of int
        Positional region indices passed to ``feature_constructor``.
    threshold : float
        Cumulative importance to reach (default 0.95).

    Returns
    -------
    list of list of str
        One list of feature names per entry of ``regions``, in the same order.

    Compared with the previous version the feature that crosses the threshold is
    now included (a single dominant feature used to give an empty selection), and
    all ranked features are kept when the threshold is never exceeded instead of
    raising IndexError.
    """
    features_good_regions = []
    for reg in regions:
        print("REGION: ", reg, end=" ")
        model = XGBRegressor()
        region, _ = feature_constructor(reg, regres=((168, 70), (8760, 20)), dataframe=taxi, weather=test_weather)
        model.fit(region.iloc[:, 1:], region.taxi_call_num.values)

        ranked = sorted(
            ((name, importance)
             for name, importance in zip(region.columns[1:], model.feature_importances_)
             if importance > 0),
            key=lambda item: item[1], reverse=True)
        cumulative = np.cumsum([importance for _, importance in ranked])
        crossing = np.flatnonzero(cumulative > threshold)
        num_of_features = crossing[0] + 1 if crossing.size else len(ranked)
        features_good_regions.append([name for name, _ in ranked[:num_of_features]])
    return features_good_regions

In [70]:
# Run the selection one region at a time. find_best_features returns a list with one
# entry per requested region, so each value here is a one-element list and the
# selected names are read later as features[reg][0].
features = {} 
for reg in range(102):
    features[reg] = find_best_features([reg, ])

In [86]:
#with open("./data/best_features_week6.pickle", 'wb') as handle:
#    pickle.dump(features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Reload the cached feature selection; this rebinds `features`, replacing any
# selection computed earlier in the session.
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file;
# only load files from a trusted source.
with open("./data/best_features_week6.pickle", 'rb') as handle:
    features = pickle.load(handle)

In [94]:
features.keys()

##### take selected parameters for xgboost regressor found last week

In [15]:
# Shared XGBRegressor keyword arguments, used for every region that has no
# individually tuned parameter set.
# NOTE(review): 'reg:linear' is the legacy objective name; newer xgboost releases
# call it 'reg:squarederror' - confirm against the installed version.
p={'learning_rate':0.1, 'n_estimators':400, 'max_depth':7, 'min_child_weight':5, 'gamma':0.1, 'subsample':0.9, 
            'colsample_bytree':0.6, 'objective':'reg:linear', 'scale_pos_weight':1, 'seed':27, 'reg_alpha':0}

In [16]:
# Positional region indices in the order they are processed by the evaluation loop
# that iterates over this list.
# NOTE(review): the trailing group on the last line is out of numeric order - presumably
# regions appended after a separate run; confirm.
xgb_comb_bestparams = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 
                          30, 31, 32, 33, 34, 35, 36, 37, 38, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 
                          58, 59, 60, 61, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 76, 77, 78, 79, 81, 82, 83, 84, 88, 90, 
                          91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 
                       39, 40, 52, 80, 85, 87, 89, 24, 64, 75, 10, 25, 62, 86, 94]

##### plot prediction - may 2016

In [104]:
# Evaluation window (the same literals were previously repeated on every slice).
may_start = '2016-05-01T00:00:00.000000000'
may_end = '2016-05-30T23:00:00.000000000'

for reg in xgb_comb_bestparams:
    print("REGION: ", reg)
    df, _ = feature_constructor(region=reg, regres=((168, 70), (8760, 20)))
    selected = features[reg][0]
    x_fit, y_fit = df[selected].values, df.taxi_call_num.values

    mod = XGBRegressor(**p)
    mod.fit(x_fit, y_fit)
    predicted_df = predict(region=reg, model=mod, end_t_prediction=may_end, features=selected)

    # Observed and forecast values over the evaluation window, sliced once.
    actual = verify_taxi.loc[may_start: may_end][region_map[reg]]
    forecast = predicted_df.loc[may_start: may_end]['taxi_call_num'].values

    # The training score was previously computed and printed twice with the same value.
    print("train r2: ", mod.score(x_fit, y_fit))
    print("test r2: ", r2_score(actual.values, forecast))

    plt.figure(figsize=(80,10))
    plt.plot(actual.index.values, actual.values, color="red", alpha = 0.5, label='real value')
    plt.plot(actual.index.values, forecast, color="green", alpha = 0.7, label='predicted')
    plt.xlabel('time (hour)')
    plt.legend()
    plt.grid()
    plt.show()

##### Try to select the best parameters in another way. Last week we chose the best parameters by tuning them sequentially, one after another. Now we use the BayesianOptimization package.
##### pip install bayesian-optimization
##### You can read about the method here: http://xgboost.readthedocs.io/en/latest/python/python_api.html

##### Prediction results for almost all regions look better (and are better when comparing r2 values) than the results of the previous week. But overfitting is observed for many regions: the r2 value on the 2-year training data is close to 1.0, while on the test data set (May 2016) the result is much worse. Tune the xgboost regressor parameters for regions that showed an r2 score below 0.8 on May 2016.

In [17]:
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              gamma,
              min_child_weight,
              subsample,
              colsample_bytree,
              reg_alpha,
              reg_lambda,
              silent=True,
              nthread=-1):
    """Objective for the Bayesian optimiser: mean 5-fold cross-validated R².

    ``max_depth`` and ``n_estimators`` arrive as floats from the optimiser and
    are truncated to int; the other arguments are passed to XGBRegressor as-is.
    The training data is read from the module-level ``x_train`` / ``y_train``,
    which must be set before the optimiser calls this function.
    """
    estimator = xgb.XGBRegressor(max_depth=int(max_depth),
                                 learning_rate=learning_rate,
                                 n_estimators=int(n_estimators),
                                 silent=silent,
                                 nthread=nthread,
                                 gamma=gamma,
                                 min_child_weight=min_child_weight,
                                 subsample=subsample,
                                 colsample_bytree=colsample_bytree,
                                 reg_alpha=reg_alpha,
                                 reg_lambda=reg_lambda)
    # `scoring` is passed by keyword: its positional slot differs between
    # sklearn.cross_validation and sklearn.model_selection.
    scores = cross_val_score(estimator, x_train, y_train, scoring='r2', cv=5)
    return scores.mean()

In [18]:
bayes_obj_dict = {}

In [None]:
# Positional indices of the regions whose hyper-parameters are tuned individually.
region_tune = [10, 11, 12, 13, 14, 15, 22, 23, 24, 25, 26 ,27, 28, 38, 39, 40, 41, 42, 52, 60, 61, 64, 65, 70, 71, 72, 73, 74, 
               75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 98, 100, 101]

In [19]:
# Bounds explored by the optimiser for each XGBRegressor hyper-parameter.
search_space = {'max_depth': (3, 14), 'learning_rate': (0.01, 0.2), 'n_estimators': (50, 1000),
                'gamma': (0.01, 1.), 'min_child_weight': (1, 10), 'subsample': (0.5, 1), 'colsample_bytree': (0.5, 1),
                'reg_alpha': (1e-5, 10), 'reg_lambda': (1e-5, 10)}

for reg in region_tune:
    print(reg)
    train_df, _ = feature_constructor(region=reg, regres=((168, 70), (8760, 20)))
    # x_train / y_train are module-level on purpose: xgboostcv reads them as globals.
    # `.loc` replaces the removed `.ix` indexer (the selection is by column label).
    x_train, y_train = train_df.loc[:, features[reg][0]].values, train_df.taxi_call_num.values

    xgboostBO = BayesianOptimization(xgboostcv, search_space)

    xgboostBO.maximize(init_points=2, n_iter = 28)
    bayes_obj_dict[reg] = xgboostBO

In [56]:
#with open("H:/Yandex machine learning/finall course coursera/bayes_optimize_obj_week6_24.pickle", 'wb') as handle:
#    pickle.dump(bayes_obj_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [109]:
#with open("F:/Yandex machine learning/finall course coursera/bayes_optimize_obj_week6_74_41_42_65.pickle", 'rb') as handle:
#    optimized = pickle.load(handle)
#optimized

##### save tune results as json files

In [116]:
# Write the full optimisation history of every tuned region to <region>.json.
output_dir = './data/week_6_bayes_optimization/'
os.makedirs(output_dir, exist_ok=True)

for region, bayes_obj in bayes_obj_dict.items():
    with open(output_dir + str(region) + '.json', 'w') as outfile:
        # Dump the object being iterated; the previous code read from `optimized`,
        # which is only assigned in a commented-out cell.
        json.dump(bayes_obj.res["all"], outfile)

In [21]:
# Load the saved optimisation results of region 10 for inspection.
# (`outfile` is a read handle here despite its name.)
with open('./data/week_6_bayes_optimization/10.json') as outfile:
    data = json.load(outfile)

In [None]:
#https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.from_dict.html
#http://qaru.site/questions/355841/unable-to-load-files-using-pickle-and-multipile-modules

##### For each region, sort the list of cross-validation scores ("values") and select the seven largest; then fit the prediction model with each of the corresponding parameter sets and make a prediction for the test data (May 2016). Select the parameters that show the best result.

In [20]:
start_prediction = '2016-05-01T00:00:00.000000000'
end_prediction = '2016-05-30T23:00:00.000000000'

def explore_params(folder_path, top_k=7):
    """Pick, per region, the tuned parameter set that scores best on the test window.

    Every ``<region>.json`` file directly inside ``folder_path`` is read. Its
    ``"values"`` list holds cross-validation scores and ``"params"`` the matching
    parameter dicts. The ``top_k`` best-scoring sets (ties included) are each
    refitted on the region's training data and scored with R² on
    ``start_prediction``..``end_prediction``.

    Parameters
    ----------
    folder_path : str
        Directory containing the JSON result files.
    top_k : int
        Number of best cross-validation scores to re-evaluate (default 7).

    Returns
    -------
    dict
        region index -> position (in the JSON lists) of the winning parameter set.

    Files whose name is not ``<digits>.json`` are skipped, as are files with no
    recorded scores.
    """
    result = {}

    # Sorted listing gives a deterministic order; sub-directories are not descended into
    # (os.walk combined with join(folder_path, name) built wrong paths for nested files).
    for filename in sorted(os.listdir(folder_path)):
        stem, extension = os.path.splitext(filename)
        if extension != '.json' or not stem.isdigit():
            continue
        reg_num = int(stem)
        print("REGION: ", reg_num)

        with open(os.path.join(folder_path, filename)) as infile:
            bayes_optimize = json.load(infile)

        scores = np.array(bayes_optimize["values"])
        if scores.size == 0:
            continue
        # Score of the top_k-th best trial; clamp so short histories do not raise IndexError.
        cutoff = np.sort(scores)[-min(top_k, scores.size)]
        best_param_set = np.argwhere(scores >= cutoff).flatten()

        df, _ = feature_constructor(region=reg_num, regres=((168, 70), (8760, 20)))
        selected = features[reg_num][0]
        x_fit, y_fit = df[selected].values, df.taxi_call_num.values
        # Same window for observed and predicted values.
        actual = verify_taxi.loc[start_prediction: end_prediction][region_map[reg_num]].values

        res = {}
        for param_idx in best_param_set:
            # The optimiser stores every parameter as float; these two must be integers.
            params = {key: (int(round(value)) if key in ('n_estimators', 'max_depth') else value)
                      for key, value in bayes_optimize["params"][param_idx].items()}

            print("cross val score r2: ", bayes_optimize["values"][param_idx], end=" ")
            mod = XGBRegressor(**params)
            mod.fit(x_fit, y_fit)
            r2_train = mod.score(x_fit, y_fit)

            print("train r2: ", r2_train, end=" ")
            predicted_df = predict(region=reg_num, model=mod, end_t_prediction=end_prediction, features=selected)
            rs_test = r2_score(actual,
                               predicted_df.loc[start_prediction: end_prediction]['taxi_call_num'].values)
            print("test r2: ", rs_test)

            res[int(param_idx)] = rs_test

        # Index of the parameter set with the best test-window score.
        result[reg_num] = max(res, key=res.get)
    return result

In [18]:
%%time
best_params = explore_params("./data/week_6_bayes_optimization")

##### write out best parameters (number of region: counter number of best parameters in json)

In [20]:
''' (key: value) -> (region: counter number of best parameters in json file) '''
best_params = {64: 25, 65: 7, 11: 24, 86: 18, 70: 9, 71: 4, 72: 23, 76: 4, 74: 5, 75: 10, 12: 0, 13: 4, 14: 16, 15: 5, 80: 13, 
               81: 3, 82: 26, 90: 20, 78: 5, 22: 13, 23: 14, 88: 17, 25: 23, 26: 9, 91: 24, 93: 11, 94: 23, 100: 10, 98: 19, 
               77: 9, 101: 23, 38: 12, 39: 6, 40: 24, 41: 3, 42: 26, 10: 2, 79: 21, 52: 10, 73: 9, 60: 17, 61: 15, 87: 22, 
               84: 11, 85: 22, 87: 22, 92: 7, 24: 12, 27: 16, 28: 1}

In [25]:
#best_params.update(best_params_2)

In [62]:
print(sorted(best_params.keys()))

##### Make a prediction for the first week of June 2016. For regions with an initially good r2 score (> 0.8) on the test dataset (May 2016), the universal parameter set found in the previous week is used; for the other regions, the individual parameter sets found by the Bayesian optimization method (stored in the json files and referenced by the best_params dictionary) are used. The parameters selected with Bayesian optimization help to improve the r2 score. For some regions this improvement looks impressive.

##### In future it may be useful to add additional regression features - such as taxi calls from adjacent sectors or take advice indicated in the task pdf document - like number of passengers or type of payment from raw data. 

##### create pandas dataframe for prediction values

In [24]:
""" dataframe contains final prediction for all 102 regions from may 2016 - till june 2016 1-st week"""
# Empty (all-None, object dtype) frame covering May-June 2016, one column per region;
# it is filled region by region with the rounded forecasts.
prediction_time = pd.date_range('2016 May 1 00:00:00', '2016 June 30 23:00:00', freq='h')
full_pred_df = pd.DataFrame(
    np.full((prediction_time.shape[0], taxi.shape[1]), None, dtype=object),
    index=prediction_time,
    columns=[data_2year.T[0, :].astype(int)],
)

##### gather all parameters from json files into dictionary

In [25]:
# region index -> full optimisation history loaded from <region>.json
parameters_dict = {}
folder_path = "./data/week_6_bayes_optimization"

# Only files named <digits>.json directly inside folder_path are read; anything else
# (checkpoints, notes, sub-directories) is skipped instead of crashing int().
for filename in sorted(os.listdir(folder_path)):
    stem, extension = os.path.splitext(filename)
    if extension != '.json' or not stem.isdigit():
        continue
    with open(os.path.join(folder_path, filename)) as infile:
        parameters_dict[int(stem)] = json.load(infile)

##### it is possible just choose parameters that showed best result on bayes optimizer cross validation

In [None]:
'''
parameters_dict_ = {}
folder_path = "./data/week_6_bayes_optimization"

for _, _, files in os.walk(folder_path):  
    for filename in files:
        with open(os.path.join(folder_path, filename).replace("\\","/")) as outfile:
            reg_num = int(filename.split('.')[0])
            values_params = json.load(outfile)
            best_index = np.argmax(np.array(values_params["values"]))
            parameters_dict_[reg_num] = values_params["params"][best_index]
'''            

##### make prediction on first week of june 2016 save it into the file and plot prediction

In [26]:
# Scoring window (May 2016) and forecast window (first week of June 2016).
start_test = '2016-05-01T00:00:00.000000000'
end_test = '2016-05-30T23:00:00.000000000'
start_pred = '2016-06-01T00:00:00.000000000'
end_pred = '2016-06-07T23:00:00.000000000'

for reg in range(102):
    print("REGION: ", reg)
    df, _ = feature_constructor(region=reg, regres=((168, 70), (8760, 20)))
    region_features = features[reg][0]
    region_label = region_map[reg]

    # Individually tuned parameters where available, the shared set `p` otherwise.
    if reg in best_params.keys():
        n = best_params[reg]
        p_select = {}
        for key, value in parameters_dict[reg]["params"][n].items():
            # The optimiser stores floats; these two must be integers.
            if (key == 'n_estimators') or (key == 'max_depth'):
                p_select[key] = int(round(value))
            else:
                p_select[key] = value
        mod = XGBRegressor(**p_select)
    else:
        mod = XGBRegressor(**p)

    mod.fit(df[region_features].values, df.taxi_call_num.values)
    predicted_df = predict(region=reg, model=mod, end_t_prediction=end_pred, features=region_features)

    train_score = mod.score(df[region_features].values, df.taxi_call_num.values)
    print("train r2: ", train_score)

    test_actual = verify_taxi.loc[start_test: end_test][region_label].values
    test_forecast = np.rint(predicted_df.loc[start_test: end_test]['taxi_call_num'].values)
    print("test round to nearest int r2: ", r2_score(test_actual, test_forecast))

    pred_actual = verify_taxi.loc[start_pred: end_pred][region_label].values
    pred_forecast = np.rint(predicted_df.loc[start_pred: end_pred]['taxi_call_num'].values)
    print("predict round to nearest int r2: ", r2_score(pred_actual, pred_forecast))
    print(" ")

    # Store the rounded forecast for this region in the combined output frame.
    full_pred_df[region_label] = np.rint(predicted_df['taxi_call_num'])

    # Observed vs (unrounded) predicted values over the forecast window.
    hours = verify_taxi.loc[start_pred: end_pred].index.values
    plt.figure(figsize=(25,10))
    plt.plot(hours, pred_actual, color="red", alpha = 0.5, label='real value')
    plt.plot(hours, predicted_df.loc[start_pred: end_pred]['taxi_call_num'].values,
             color="green", alpha = 0.7, label='predicted')
    plt.xlabel('time (hour)')
    plt.legend()
    plt.grid()
    plt.show()

In [30]:
full_pred_df.head()

In [28]:
full_pred_df.tail()

In [31]:
mypath = "./data"

In [32]:
full_pred_df.to_csv(mypath + "/" + "final_taxi_prediction_2015-05-06.csv", sep='\t', columns=list(full_pred_df.columns))

In [33]:
# Read the saved forecast back with its first column (the timestamps written by to_csv)
# restored as a DatetimeIndex; with the default integer index a timestamp-label slice
# of `table` cannot select the forecast rows.
table = pd.read_csv(mypath + "/" + "final_taxi_prediction_2015-05-06.csv", sep='\t',
                    index_col=0, parse_dates=True)

In [35]:
table.loc[start_test: end_pred, :].isnull().values.any()