In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from datetime import datetime
from math import ceil

from sklearn.metrics import mean_absolute_error

In [2]:
# Both CSV files use the same dialect: ';' separated, ISO-8859-1 encoded, and the
# first line of each file is skipped so the column names supplied below are used.
_read_options = {'encoding': "ISO-8859-1", 'sep': ';', 'skiprows': 1}

data = pd.read_csv(
    'data/train.csv',
    names=[
        'Дата',
        'Количество поступивших вызовов',
        'Количество принятых вызовов',
        'Количество потерянных вызовов',
        'Ср. скорость ответа оператора (сек.)',
        'Ср. время разговора (сек.)',
        'Ср. число операторов',
        'Макс. число операторов',
    ],
    **_read_options
)
print(data.shape)

target = pd.read_csv(
    'data/submission_sample.csv',
    names=['Дата', 'Количество поступивших вызовов'],
    **_read_options
)
print(target.shape)

# Dates are stored as day.month.year strings; convert them to datetime64 columns.
_date_format = '%d.%m.%Y'
data['Дата'] = pd.to_datetime(data['Дата'], format=_date_format)
target['Дата'] = pd.to_datetime(target['Дата'], format=_date_format)

print(data['Дата'].dtype)
print(target['Дата'].dtype)

(549, 8)
(61, 2)
datetime64[ns]
datetime64[ns]


In [3]:
# Working frame: keep only the date and the incoming-calls count from the
# training data, under the short names 'date' and 'target'.
df = data[['Дата', 'Количество поступивших вызовов']].rename(
    columns={'Дата': 'date', 'Количество поступивших вызовов': 'target'}
)

In [4]:
def get_parsed_data(time):
    """Split a date-like value into its calendar components.

    Parameters
    ----------
    time : object exposing ``month``, ``day`` and ``weekday()``
        For example a ``datetime.datetime`` instance.

    Returns
    -------
    tuple
        ``(month, day, weekday)`` where ``weekday`` is the result of
        ``time.weekday()``.
    """
    return (time.month, time.day, time.weekday())

# Each row's date becomes a (month, day, weekday) tuple; zip(*...) transposes the
# per-row tuples into three per-column sequences that are assigned in one go.
df['month'], df['day'], df['weekday'] = zip(*df['date'].map(get_parsed_data))

def week_of_month(dt):
    """Return the 1-based week-of-month number for ``dt``.

    The day of the month is shifted by ``weekday()`` of the first day of that
    month, and the shifted value is divided by 7, rounding up.

    Parameters
    ----------
    dt : object exposing ``day``, ``replace(day=...)`` and ``weekday()``
        For example a ``datetime.datetime`` instance.

    Returns
    -------
    int
        Week index within the month, starting at 1.
    """
    month_start_offset = dt.replace(day=1).weekday()
    shifted_day = dt.day + month_start_offset

    # Integer ceiling division: equivalent to ceil(shifted_day / 7).
    return -(-shifted_day // 7)

# Week-of-month index for every row, computed element-wise from the date column.
df['week_of_month'] = df['date'].map(week_of_month)

def is_in_13_19(dt):
    """Return 1 when the day of month of ``dt`` is 13 through 19 inclusive, else 0."""
    return int(dt.day in range(13, 20))

def is_in_14_17(dt):
    """Return 1 when the day of month of ``dt`` is 14 through 17 inclusive, else 0."""
    return int(dt.day in range(14, 18))

# 0/1 indicator columns marking rows whose day of month falls in each window.
df['13_19'] = df['date'].map(is_in_13_19)
df['14_17'] = df['date'].map(is_in_14_17)

In [5]:
# Preview the first rows to check the date-derived columns added above.
df.head()

Unnamed: 0,date,target,month,day,weekday,week_of_month,13_19,14_17
0,2017-03-01,2142,3,1,2,1,0,0
1,2017-03-02,1781,3,2,3,1,0,0
2,2017-03-03,1785,3,3,4,1,0,0
3,2017-03-04,1073,3,4,5,1,0,0
4,2017-03-05,572,3,5,6,1,0,0


In [None]:
# Scratch cell intentionally left without code: a bare ``pd.get_dummies()`` call
# raises TypeError (the required ``data`` argument is missing) and would stop a
# Restart & Run All. The actual one-hot encoding is done in the next cell.

In [8]:
# Date-derived columns that are one-hot encoded rather than used as numbers.
categorical = ['month', 'day', 'weekday', 'week_of_month']

# Replace each listed column with one indicator column per distinct value;
# the encoded frame is bound back to ``df``.
df = pd.get_dummies(data=df, columns=categorical)
df.head()

Unnamed: 0,date,target,13_19,14_17,month_1,month_2,month_3,month_4,month_5,month_6,...,weekday_3,weekday_4,weekday_5,weekday_6,week_of_month_1,week_of_month_2,week_of_month_3,week_of_month_4,week_of_month_5,week_of_month_6
0,2017-03-01,2142,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2017-03-02,1781,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,2017-03-03,1785,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
3,2017-03-04,1073,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,2017-03-05,572,0,0,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0


In [15]:
# Model inputs: every column except the date and the label.
features = [name for name in df.columns if name not in ('date', 'target')]
# Name of the label column.
t = 'target'

In [30]:
import lightgbm
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [16]:
# Fit a LightGBM regressor on all rows of ``df``.
lgb_train_data = lightgbm.Dataset(df[features], label=df[t])
lgb_parameters = {
    'objective': 'regression',
    'metric': 'mse',
    'boosting': 'gbdt',
    # NOTE(review): per the LightGBM parameter docs, bagging is only enabled when
    # bagging_fraction < 1.0 is set as well, so this value alone looks like a
    # no-op; confirm whether bagging was intended.
    'bagging_freq': 20,
    'max_depth': 3,
}

# The number of boosting rounds goes through the ``num_boost_round`` argument of
# ``lightgbm.train`` rather than the ``n_estimators`` alias inside the params dict.
lgb_model = lightgbm.train(lgb_parameters, lgb_train_data, num_boost_round=100)

# Predictions are made on the same rows the model was fitted on, so the value
# printed below is an in-sample (training) MAE, not a cross-validated score.
pred = lgb_model.predict(df[features])
real = df[t].values

print(mean_absolute_error(real, pred))



252.59394702428577


In [28]:
# Hyper-parameter grid explored for the XGBoost regressor.
parameters = {
    'n_estimators': [50, 100],
    'max_depth': [10, 5, 3],
    'learning_rate': [.5, 1],
    'objective': ['reg:linear', 'reg:gamma'],
}

xg_model = XGBRegressor(random_state=42,
                        n_jobs=-1,
                        booster='dart')

# return_train_score=True is requested explicitly because a later cell reads
# cv_results_['mean_train_score']; scikit-learn only computes train scores by
# default up to version 0.20, so without the flag that lookup fails on newer
# releases.
# NOTE(review): cv=10 uses the default fold splitter on rows that appear to be in
# date order; confirm whether a time-aware split is wanted.
xg_gscv = GridSearchCV(xg_model, parameters, cv=10,
                       scoring='neg_mean_absolute_error',
                       return_train_score=True)
xg_gscv.fit(df[features], df[t])



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 100], 'max_depth': [10, 5, 3], 'learning_rate': [0.5, 1], 'objective': ['reg:linear', 'reg:gamma']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [29]:
# The scorer is the negated MAE, so flip the sign to report a plain MAE.
print(-xg_gscv.best_score_)
# Per-candidate mean scores over the folds (negated MAE), train then test.
print(xg_gscv.cv_results_['mean_train_score'])
print(xg_gscv.cv_results_['mean_test_score'])
print(xg_gscv.best_params_)
print()
print('Feature importances')
# Ten largest importances of the refitted best model, indexed by feature name.
pd.DataFrame(
    data=xg_gscv.best_estimator_.feature_importances_,
    index=features,
).sort_values(by=0, ascending=False).head(10)

272.6286995198123
[-3.68802192e+00 -2.20537598e+01 -1.34053435e-01 -3.11919600e+00
 -7.00531846e+01 -1.03186919e+02 -3.15555814e+01 -5.49824311e+01
 -1.50089172e+02 -1.70598586e+02 -1.06681013e+02 -1.25095480e+02
 -5.08311527e-02 -1.83437197e+00 -1.06006557e-03 -1.83331279e+00
 -2.83262839e+01 -5.53910880e+01 -6.45005349e+00 -2.04700935e+01
 -1.06633866e+02 -1.27760930e+02 -6.41845851e+01 -8.40444680e+01]
[-302.15439627 -296.89041032 -302.58181818 -301.27630682 -286.93894255
 -278.89560943 -298.02926607 -285.96097335 -276.05469556 -272.62869952
 -283.02934635 -281.52210106 -310.68835327 -322.12592056 -310.69163249
 -322.12676377 -317.61417443 -297.52449083 -319.71063872 -306.53184028
 -296.09434418 -289.77749856 -299.08434628 -296.06407391]
{'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 50, 'objective': 'reg:gamma'}

Feature importances




Unnamed: 0,0
weekday_5,0.079498
weekday_6,0.07113
month_6,0.058577
13_19,0.050209
weekday_4,0.033473
weekday_0,0.033473
month_5,0.029289
weekday_1,0.029289
month_8,0.029289
day_8,0.029289


In [50]:
# Import the submodule explicitly so the attribute access below does not depend
# on an earlier cell having loaded ``sklearn.metrics``.
import sklearn.metrics

# List the scoring names accepted by ``scoring=...``. Newer scikit-learn exposes
# them through get_scorer_names() and no longer ships the SCORERS dict; the dict
# is kept as a fallback for old installations.
if hasattr(sklearn.metrics, 'get_scorer_names'):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sklearn.metrics.SCORERS.keys()
scorer_names

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])

In [57]:
# Single-candidate grid: the search is used here only to get cross-validated scores.
parameters = {'n_neighbors': [2]}

model = KNeighborsRegressor(n_jobs=-1)

# return_train_score=True is requested explicitly because the next cell reads
# cv_results_['mean_train_score']; scikit-learn only computes train scores by
# default up to version 0.20, so without the flag that lookup fails on newer
# releases.
kn_gscv = GridSearchCV(model, parameters, cv=5,
                       scoring='neg_mean_absolute_error',
                       return_train_score=True)
kn_gscv.fit(df[features], df[t])

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [2]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [58]:
# The scorer is the negated MAE, so flip the sign to report a plain MAE.
print(-kn_gscv.best_score_)
# Mean train and test scores over the folds (negated MAE) for the single candidate.
print(kn_gscv.cv_results_['mean_train_score'])
print(kn_gscv.cv_results_['mean_test_score'])
print(kn_gscv.best_params_)

331.04280510018214
[-183.13773918]
[-331.0428051]
{'n_neighbors': 2}


