In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Shared dense one-hot encoder used by get_x_y.
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# (the old spelling was removed in 1.4). Try the new name first and fall back,
# so the notebook runs on both old and new releases.
try:
    OHE = OneHotEncoder(sparse_output=False)
except TypeError:
    OHE = OneHotEncoder(sparse=False)

# Cap how many rows pandas prints for a frame.
pd.set_option('display.max_rows', 30)

In [2]:
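# Using month/day would require one-hot encoding, which would create too many features and seems unnecessary; an is_special flag is probably a better fit.
# Open question: how should the dependence on earlier/later time steps be handled here?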
# train[['month','day']] = train.date.str.split('-', expand=True)
def get_x_y(df, fit=True):
    """Split a prepared frame into a feature matrix and the label column.

    The bookkeeping columns ('Unnamed: 0', 'route', 'calculated_day', 'date')
    are dropped, 'hr' and 'weekday' are one-hot encoded with the module-level
    ``OHE`` encoder, and 'label' is separated from the remaining features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above plus 'hr', 'weekday' and 'label'.
    fit : bool, default True
        When True (the original behaviour) ``OHE`` is re-fitted on ``df``.
        Pass False for held-out data to reuse the categories learned from the
        training frame, so both feature matrices have identical columns even
        when the held-out frame is missing some hours or weekdays.

    Returns
    -------
    x : pandas.DataFrame
        Features with 'hr' and 'weekday' replaced by their one-hot columns.
    y : pandas.Series
        The 'label' column.
    """
    categorical = ['hr', 'weekday']
    data = df.drop(columns=['Unnamed: 0', 'route', 'calculated_day', 'date'])

    if fit:
        encoded = OHE.fit_transform(data[categorical])
    else:
        encoded = OHE.transform(data[categorical])

    # get_feature_names was superseded by get_feature_names_out in
    # scikit-learn 1.0 and removed in 1.2; support both.
    if hasattr(OHE, 'get_feature_names_out'):
        encoded_names = OHE.get_feature_names_out(categorical)
    else:
        encoded_names = OHE.get_feature_names(input_features=categorical)

    # Reuse the input index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex here would scatter NaNs whenever df is not indexed 0..n-1.
    ohe_df = pd.DataFrame(encoded, columns=encoded_names, index=data.index)
    data = pd.concat([data.drop(columns=categorical), ohe_df], axis=1)

    x = data.drop(columns=['label'])
    y = data.label
    return x, y

In [3]:
def mape_error(y_true, y_pred):
    """Mean absolute percentage error, as a fraction (0.1 means 10 %).

    Values are compared positionally, so pandas index labels on either input
    are ignored. Rows whose true value is NaN, infinite or zero are excluded,
    because the percentage error is undefined there; this gives the same
    result for Series and ndarray inputs.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (the denominator of the relative error).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / |y_true|`` over the valid rows, or NaN
        when no row has a usable true value.
    """
    true_values, pred_values = np.broadcast_arrays(
        np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    )
    # Keep only rows where the relative error is defined.
    usable = np.isfinite(true_values) & (true_values != 0)
    if not usable.any():
        return np.nan
    relative_error = (true_values[usable] - pred_values[usable]) / true_values[usable]
    return np.mean(np.abs(relative_error))


def scoring(reg, x, y):
    """Return the negated MAPE of ``reg`` on ``(x, y)``, so larger is better.

    NOTE(review): the (estimator, X, y) signature looks intended for use as a
    scikit-learn scoring callable - confirm against callers.

    Parameters
    ----------
    reg : fitted estimator exposing ``predict``.
    x : feature matrix passed to ``reg.predict``.
    y : true target values.

    Returns
    -------
    float
        ``-mape_error(y, reg.predict(x))``.
    """
    pred = reg.predict(x)
    # mape_error takes (y_true, y_pred). The arguments were previously passed
    # the other way round, which normalised the error by the predictions.
    return -mape_error(y, pred)

In [4]:
# A_2: fit a gradient-boosting regressor and report train/test MAPE.
# NOTE(review): the paths below are absolute and machine-specific.
train_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_train/A_2.csv'
test_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_test/A_2.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

train_x, train_y = get_x_y(train)
test_x, test_y = get_x_y(test)

# loss is left at its default (least squares): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gdbt = GradientBoostingRegressor(n_estimators=30, learning_rate=0.1,
                                 max_depth=5, random_state=0).fit(train_x, train_y)

# Predict once per split and reuse the result for the metric and the table.
train_pred = gdbt.predict(train_x)
test_pred = gdbt.predict(test_x)

print(mape_error(train_y, train_pred))
print(mape_error(test_y, test_pred))

# Build the prediction Series on the label's index so the two columns line up
# row by row even if the labels are not indexed 0..n-1.
train_compare = pd.concat([train_y, pd.Series(train_pred, index=train_y.index)], axis=1)
print(train_compare)

result = test_pred  # keep the notebook-level name pointing at the test predictions
test_compare = pd.concat([test_y, pd.Series(result, index=test_y.index)], axis=1)
print(test_compare)

0.08255334084087726
0.08460062583348252
          label           0
0     75.086667   75.326313
1    148.053333  125.947091
2    109.863333  107.327638
3     74.633333   84.768029
4     88.636667   93.840482
..          ...         ...
346   67.673333   69.155669
347   67.823333   69.855846
348   69.840000   70.624971
349   62.610000   66.679704
350   58.156667   64.151552

[351 rows x 2 columns]
        label          0
0   75.250000  75.674498
1   95.980000  97.067129
2   80.773333  89.788348
3   62.000000  75.733621
4   63.230000  67.754606
5   62.546667  63.572396
6   73.080000  87.937307
7   80.073333  75.073820
8   80.103333  81.392324
9   89.450000  76.328087
10  85.500000  74.204509
11  71.763333  71.303731
12  71.486667  65.363734
13  68.963333  75.036181
14  68.896667  78.591293
15  61.296667  64.469059
16  77.910000  72.707433
17  81.890000  78.071813
18  70.700000  74.972945
19  75.013333  73.107701
20  69.283333  72.484336
21  66.796667  82.077155
22  67.853333  73.589466


In [5]:

# A_3: fit a gradient-boosting regressor and report train/test MAPE.
# NOTE(review): the paths below are absolute and machine-specific.
train_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_train/A_3.csv'
test_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_test/A_3.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

train_x, train_y = get_x_y(train)
test_x, test_y = get_x_y(test)

# loss is left at its default (least squares): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gdbt = GradientBoostingRegressor(n_estimators=30, learning_rate=0.1,
                                 max_depth=6, random_state=0).fit(train_x, train_y)

# Predict once per split and reuse the result for the metric and the table.
train_pred = gdbt.predict(train_x)
test_pred = gdbt.predict(test_x)

print(mape_error(train_y, train_pred))
print(mape_error(test_y, test_pred))

# Build the prediction Series on the label's index so the two columns line up
# row by row even if the labels are not indexed 0..n-1.
train_compare = pd.concat([train_y, pd.Series(train_pred, index=train_y.index)], axis=1)
print(train_compare)

result = test_pred  # keep the notebook-level name pointing at the test predictions
test_compare = pd.concat([test_y, pd.Series(result, index=test_y.index)], axis=1)
print(test_compare)

0.07277340260432219
0.16275976279740192
          label           0
0    146.513333  137.850583
1    203.340000  205.284957
2    168.586667  168.347214
3    139.636667  138.907044
4    108.560000  128.038185
..          ...         ...
342  110.613333  116.605373
343  140.670000  122.101508
344  257.630000  217.245434
345  137.190000  133.402179
346  129.810000  129.062094

[347 rows x 2 columns]
         label           0
0   206.073333  223.778420
1   324.796667  227.957055
2   217.020000  168.248670
3   256.776667  157.821267
4   143.293333  136.307842
5    97.700000  123.897955
6   227.373333  233.167015
7   185.443333  228.670717
8   285.266667  190.825971
9   215.743333  171.549402
10  227.686667  154.158896
11  129.380000  141.922173
12  105.200000  144.586451
13  167.313333  210.546945
14  126.723333  125.815555
15  129.363333  128.958324
16  130.300000  143.816085
17  121.636667  121.333598
18  116.326667  118.005880
19  110.940000  126.067752
20  113.813333  140.628703
21  11

In [11]:
# B_1: fit a gradient-boosting regressor and report train/test MAPE.
# NOTE(review): the paths below are absolute and machine-specific.
train_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_train/B_1.csv'
test_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_test/B_1.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

train_x, train_y = get_x_y(train)
test_x, test_y = get_x_y(test)

# loss is left at its default (least squares): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gdbt = GradientBoostingRegressor(n_estimators=20, learning_rate=0.1,
                                 max_depth=5, random_state=0).fit(train_x, train_y)

# Predict once per split and reuse the result for the metric and the table.
train_pred = gdbt.predict(train_x)
test_pred = gdbt.predict(test_x)

print(mape_error(train_y, train_pred))
print(mape_error(test_y, test_pred))

# Build the prediction Series on the label's index so the two columns line up
# row by row even if the labels are not indexed 0..n-1.
train_compare = pd.concat([train_y, pd.Series(train_pred, index=train_y.index)], axis=1)
print(train_compare)

result = test_pred  # keep the notebook-level name pointing at the test predictions
test_compare = pd.concat([test_y, pd.Series(result, index=test_y.index)], axis=1)
print(test_compare)

0.13131533602914258
0.13338188158616418
          label           0
0    100.320000  119.373973
1    212.310000  176.440073
2    129.945000  128.993017
3    118.616667  121.298575
4    118.785000  116.370566
..          ...         ...
288  148.810000  133.332265
289  203.920000  182.323066
290  111.363333  121.580259
291  117.240000  131.986381
292  120.330000  124.695296

[293 rows x 2 columns]
         label           0
0   111.033333  120.704641
1   127.316667  132.120867
2   110.166667  120.594447
3   135.886667  123.851094
4   109.650000  118.157664
5   125.720000  102.734853
6   112.630000  125.248746
7   111.406667  125.262227
8   132.586667  137.438328
9   119.023333  119.554817
10  111.993333  130.471694
11  133.820000  119.205362
12  148.145000  105.568240
13  110.536667  129.030810
14  141.666667  116.351726
15  117.930000  128.544325
16  133.916667  125.124187
17  198.110000  156.666204
18  142.356667  127.893709
19  105.203333  127.498075
20  122.140000  117.276505
21  14

In [12]:
# Read the two time-shifted B_1 training files (+5 min and -5 min, going by
# the file names), in the same order as before.
train_p5, train_m5 = (
    pd.read_csv(shifted_csv)
    for shifted_csv in (
        '/Users/vayne/Desktop/dm_pro_engin/used_data_train/5min_shifted_with_10.1_B_1.csv',
        '/Users/vayne/Desktop/dm_pro_engin/used_data_train/-5min_shifted_with_10.1_B_1.csv',
    )
)


In [25]:
# Give every frame a fresh 0..n-1 index (in place, as before).
train.reset_index(inplace=True, drop=True)
train_p5.reset_index(inplace=True, drop=True)
train_m5.reset_index(inplace=True, drop=True)

# Only the +5 min frame is appended to the base training data; the variant
# that also adds train_m5 (pd.concat([train, train_p5, train_m5])) is disabled.
train1 = pd.concat([train, train_p5], ignore_index=True)

In [35]:
# B_1 with the augmented training frame (train1): refit and report train/test MAPE.
train_x, train_y = get_x_y(train1)
test_x, test_y = get_x_y(test)

# loss is left at its default (least squares): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gdbt = GradientBoostingRegressor(n_estimators=5, learning_rate=0.1,
                                 max_depth=3, random_state=0).fit(train_x, train_y)

# Predict once per split and reuse the result for the metric and the table.
train_pred = gdbt.predict(train_x)
test_pred = gdbt.predict(test_x)

print(mape_error(train_y, train_pred))
print(mape_error(test_y, test_pred))

# Build the prediction Series on the label's index so the two columns line up
# row by row even if the labels are not indexed 0..n-1.
train_compare = pd.concat([train_y, pd.Series(train_pred, index=train_y.index)], axis=1)
print(train_compare)

result = test_pred  # keep the notebook-level name pointing at the test predictions
test_compare = pd.concat([test_y, pd.Series(result, index=test_y.index)], axis=1)
print(test_compare)

0.22132865969004645
0.13766766452545837
          label           0
0    100.320000  123.883530
1    212.310000  139.851950
2    129.945000  123.883530
3    118.616667  123.883530
4    118.785000  123.883530
..          ...         ...
581  148.810000  129.096955
582  208.920000  131.459080
583  112.780000  130.875510
584  117.950000  129.096955
585  120.330000  127.445567

[586 rows x 2 columns]
         label           0
0   111.033333  126.211375
1   127.316667  146.082444
2   110.166667  127.038939
3   135.886667  127.038939
4   109.650000  127.276838
5   125.720000  127.038939
6   112.630000  127.038939
7   111.406667  126.211375
8   132.586667  146.082444
9   119.023333  127.038939
10  111.993333  127.038939
11  133.820000  127.276838
12  148.145000  127.038939
13  110.536667  127.038939
14  141.666667  118.883816
15  117.930000  127.862763
16  133.916667  128.075112
17  198.110000  130.077236
18  142.356667  128.075112
19  105.203333  127.862763
20  122.140000  127.862763
21  14

In [9]:
# B_3: fit a gradient-boosting regressor and report train/test MAPE.
# NOTE(review): the paths below are absolute and machine-specific.
train_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_train/B_3.csv'
test_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_test/B_3.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

train_x, train_y = get_x_y(train)
test_x, test_y = get_x_y(test)

# loss is left at its default (least squares): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gdbt = GradientBoostingRegressor(n_estimators=20, learning_rate=0.1,
                                 max_depth=5, random_state=0).fit(train_x, train_y)

# Predict once per split and reuse the result for the metric and the table.
train_pred = gdbt.predict(train_x)
test_pred = gdbt.predict(test_x)

print(mape_error(train_y, train_pred))
print(mape_error(test_y, test_pred))

# Build the prediction Series on the label's index so the two columns line up
# row by row even if the labels are not indexed 0..n-1.
train_compare = pd.concat([train_y, pd.Series(train_pred, index=train_y.index)], axis=1)
print(train_compare)

result = test_pred  # keep the notebook-level name pointing at the test predictions
test_compare = pd.concat([test_y, pd.Series(result, index=test_y.index)], axis=1)
print(test_compare)

0.10756328601224646
0.13220243072473067
          label           0
0    147.723333  127.267599
1    148.253333  141.846739
2    107.110000  114.405636
3    155.416667  140.574057
4    103.586667  126.071271
..          ...         ...
344  102.470000  108.592797
345  118.543333  112.058245
346   94.883333  103.088640
347  113.926667  106.591125
348   79.440000  101.265893

[349 rows x 2 columns]
         label           0
0   113.593333   97.332757
1   111.976667  103.867678
2    94.890000  104.251552
3    95.633333  105.432438
4    93.423333  115.068698
5    94.453333  101.977422
6   108.876667  107.123063
7   133.596667  101.824463
8   116.496667  106.807329
9    93.263333  113.178156
10   97.956667  109.882123
11  107.416667  127.380681
12   96.690000  105.433921
13  136.133333  112.174408
14   94.173333  101.709626
15  104.376667  100.497580
16   99.193333  141.071738
17  107.746667  104.077517
18  104.146667  101.445199
19   94.710000  111.647408
20   95.560000  108.359930
21  11

In [65]:
# C_1: fit a gradient-boosting regressor and report train/test MAPE.
# NOTE(review): the paths below are absolute and machine-specific.
train_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_train/C_1.csv'
test_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_test/C_1.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

train_x, train_y = get_x_y(train)
test_x, test_y = get_x_y(test)

# loss is left at its default (least squares): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gdbt = GradientBoostingRegressor(n_estimators=8, learning_rate=0.1,
                                 max_depth=5, random_state=0).fit(train_x, train_y)

# Predict once per split and reuse the result for the metric and the table.
train_pred = gdbt.predict(train_x)
test_pred = gdbt.predict(test_x)

print(mape_error(train_y, train_pred))
print(mape_error(test_y, test_pred))

# Build the prediction Series on the label's index so the two columns line up
# row by row even if the labels are not indexed 0..n-1.
train_compare = pd.concat([train_y, pd.Series(train_pred, index=train_y.index)], axis=1)
print(train_compare)

result = test_pred  # keep the notebook-level name pointing at the test predictions
test_compare = pd.concat([test_y, pd.Series(result, index=test_y.index)], axis=1)
print(test_compare)

0.13230485192406172
0.19453825303148206
          label           0
0    140.245000  176.741228
1    231.923333  196.696115
2    146.070000  174.691842
3    190.573333  169.631021
4    207.370000  198.804284
..          ...         ...
270  282.260000  207.299104
271  230.510000  216.360554
272  198.123333  186.413262
273  266.680000  223.140409
274  267.110000  213.325970

[275 rows x 2 columns]
         label           0
0   158.785000  183.628196
1   162.683333  178.591640
2   193.376667  181.290942
3   170.245000  170.812322
4   171.540000  187.730217
5   166.753333  177.119906
6   158.060000  185.567379
7   146.846667  183.628196
8   210.020000  178.591640
9   157.720000  196.689989
10  168.473333  170.812322
11  187.665000  187.730217
12  270.883333  177.119906
13  171.320000  185.567379
14  280.493333  195.236339
15  207.370000  173.943350
16  269.365000  190.148895
17  404.243333  194.380886
18  266.843333  193.499907
19  226.350000  203.376835
20  168.280000  177.004005
21  27

In [66]:
# Read the two time-shifted C_1 training files (+5 min and -5 min, going by
# the file names), in the same order as before.
train_p5, train_m5 = (
    pd.read_csv(shifted_csv)
    for shifted_csv in (
        '/Users/vayne/Desktop/dm_pro_engin/used_data_train/5min_shifted_C_1.csv',
        '/Users/vayne/Desktop/dm_pro_engin/used_data_train/-5min_shifted_C_1.csv',
    )
)

In [67]:
# Give every frame a fresh 0..n-1 index (in place, as before).
train.reset_index(inplace=True, drop=True)
train_p5.reset_index(inplace=True, drop=True)
train_m5.reset_index(inplace=True, drop=True)

# Only the +5 min frame is appended to the base training data; the variant
# that also adds train_m5 (pd.concat([train, train_p5, train_m5])) is disabled.
train1 = pd.concat([train, train_p5], ignore_index=True)

In [68]:
# C_1 with the augmented training frame (train1): refit and report train/test MAPE.
train_x, train_y = get_x_y(train1)
test_x, test_y = get_x_y(test)

# loss is left at its default (least squares): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gdbt = GradientBoostingRegressor(n_estimators=30, learning_rate=0.1,
                                 max_depth=4, random_state=0).fit(train_x, train_y)

# Predict once per split and reuse the result for the metric and the table.
train_pred = gdbt.predict(train_x)
test_pred = gdbt.predict(test_x)

print(mape_error(train_y, train_pred))
print(mape_error(test_y, test_pred))

# Build the prediction Series on the label's index so the two columns line up
# row by row even if the labels are not indexed 0..n-1.
train_compare = pd.concat([train_y, pd.Series(train_pred, index=train_y.index)], axis=1)
print(train_compare)

result = test_pred  # keep the notebook-level name pointing at the test predictions
test_compare = pd.concat([test_y, pd.Series(result, index=test_y.index)], axis=1)
print(test_compare)

0.1090407309927677
0.1936652854065166
          label           0
0    140.245000  166.330834
1    231.923333  197.703157
2    146.070000  162.850243
3    190.573333  159.720588
4    207.370000  201.276029
..          ...         ...
543  283.800000  231.149707
544  230.510000  239.951900
545  198.123333  227.890675
546  261.010000  239.891815
547  267.110000  235.434175

[548 rows x 2 columns]
         label           0
0   158.785000  178.241258
1   162.683333  173.201385
2   193.376667  187.668158
3   170.245000  161.509270
4   171.540000  165.759641
5   166.753333  169.378819
6   158.060000  170.919259
7   146.846667  180.816640
8   210.020000  175.946243
9   157.720000  201.735223
10  168.473333  168.222627
11  187.665000  171.920564
12  270.883333  173.739059
13  171.320000  170.282751
14  280.493333  187.773710
15  207.370000  177.714726
16  269.365000  188.100980
17  404.243333  185.924575
18  266.843333  193.268456
19  226.350000  186.674408
20  168.280000  180.640762
21  277.

In [69]:
# C_3: fit a gradient-boosting regressor and report train/test MAPE.
# NOTE(review): the paths below are absolute and machine-specific.
train_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_train/C_3.csv'
test_path = '/Users/vayne/Desktop/dm_pro_engin/used_data_test/C_3.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

train_x, train_y = get_x_y(train)
test_x, test_y = get_x_y(test)

# loss is left at its default (least squares): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gdbt = GradientBoostingRegressor(n_estimators=30, learning_rate=0.1,
                                 max_depth=5, random_state=0).fit(train_x, train_y)

# Predict once per split and reuse the result for the metric and the table.
train_pred = gdbt.predict(train_x)
test_pred = gdbt.predict(test_x)

print(mape_error(train_y, train_pred))
print(mape_error(test_y, test_pred))

# Build the prediction Series on the label's index so the two columns line up
# row by row even if the labels are not indexed 0..n-1.
train_compare = pd.concat([train_y, pd.Series(train_pred, index=train_y.index)], axis=1)
print(train_compare)

result = test_pred  # keep the notebook-level name pointing at the test predictions
test_compare = pd.concat([test_y, pd.Series(result, index=test_y.index)], axis=1)
print(test_compare)

0.10786627146418085
0.16136689463949525
          label           0
0    132.050000  152.067457
1    219.150000  191.975288
2    166.150000  172.776136
3    161.786667  191.049192
4    207.950000  204.496302
..          ...         ...
251  178.660000  186.444201
252  152.910000  188.984838
253  250.595000  195.149026
254  142.770000  175.902284
255  350.115000  230.203302

[256 rows x 2 columns]
         label           0
0   194.430000  190.127255
1   188.860000  198.920339
2   168.170000  189.462436
3   169.465000  175.393228
4   196.430000  169.789290
5          NaN  185.882848
6   141.630000  181.997898
7   179.806667  176.649185
8   173.563333  201.185904
9   178.855000  176.205615
10  131.273333  176.444455
11  147.816667  173.413913
12  152.610000  185.654517
13  155.760000  183.049125
14  189.830000  168.015589
15  147.035000  180.761487
16  151.936667  193.960496
17  224.380000  178.851561
18  204.046667  188.254823
19  199.775000  180.766729
20  146.396667  176.583275
21  14

In [70]:
# Read the two time-shifted C_3 training files (+5 min and -5 min, going by
# the file names), in the same order as before.
train_p5, train_m5 = (
    pd.read_csv(shifted_csv)
    for shifted_csv in (
        '/Users/vayne/Desktop/dm_pro_engin/used_data_train/5min_shifted_C_3.csv',
        '/Users/vayne/Desktop/dm_pro_engin/used_data_train/-5min_shifted_C_3.csv',
    )
)

In [71]:
# Give every frame a fresh 0..n-1 index (in place, as before).
train.reset_index(inplace=True, drop=True)
train_p5.reset_index(inplace=True, drop=True)
train_m5.reset_index(inplace=True, drop=True)

# Only the +5 min frame is appended to the base training data; the variant
# that also adds train_m5 (pd.concat([train, train_p5, train_m5])) is disabled.
train1 = pd.concat([train, train_p5], ignore_index=True)

In [84]:
# C_3 with the augmented training frame (train1): refit and report train/test MAPE.
train_x, train_y = get_x_y(train1)
test_x, test_y = get_x_y(test)

# loss is left at its default (least squares): the explicit 'ls' alias was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gdbt = GradientBoostingRegressor(n_estimators=28, learning_rate=0.1,
                                 max_depth=5, random_state=0).fit(train_x, train_y)

# Predict once per split and reuse the result for the metric and the table.
train_pred = gdbt.predict(train_x)
test_pred = gdbt.predict(test_x)

print(mape_error(train_y, train_pred))
print(mape_error(test_y, test_pred))

# Build the prediction Series on the label's index so the two columns line up
# row by row even if the labels are not indexed 0..n-1.
train_compare = pd.concat([train_y, pd.Series(train_pred, index=train_y.index)], axis=1)
print(train_compare)

result = test_pred  # keep the notebook-level name pointing at the test predictions
test_compare = pd.concat([test_y, pd.Series(result, index=test_y.index)], axis=1)
print(test_compare)

0.12164487806141888
0.14916580370620297
          label           0
0    132.050000  153.946590
1    219.150000  202.972433
2    166.150000  177.973910
3    161.786667  196.964503
4    207.950000  201.497248
..          ...         ...
506  242.460000  194.150603
507  152.910000  189.106681
508  250.595000  199.833973
509  138.943333  182.259279
510  286.130000  206.104448

[511 rows x 2 columns]
         label           0
0   194.430000  166.002734
1   188.860000  187.275097
2   168.170000  173.661384
3   169.465000  165.699750
4   196.430000  193.291889
5          NaN  162.690654
6   141.630000  187.210574
7   179.806667  170.004715
8   173.563333  186.401284
9   178.855000  172.806315
10  131.273333  170.537555
11  147.816667  198.526976
12  152.610000  163.453082
13  155.760000  185.215104
14  189.830000  191.099721
15  147.035000  187.790587
16  151.936667  180.819540
17  224.380000  179.970521
18  204.046667  187.627105
19  199.775000  184.416156
20  146.396667  173.104521
21  14