In [57]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib.pyplot import rcParams
# Default size (width, height in inches) for every figure drawn in this notebook.
rcParams['figure.figsize'] = (16, 8)

In [58]:
# Load the readings and keep the untouched frame under its own name so this
# cell can be re-run without re-filtering an already filtered frame.
energy_0_raw = pd.read_csv('../../dataset/train.0.0.csv.gz')

# Keep only strictly positive readings. The explicit .copy() makes the result
# an independent frame, so the column assignments below cannot trigger
# SettingWithCopyWarning.
energy_0 = energy_0_raw[energy_0_raw['meter_reading'] > 0].copy()

# Parse the timestamp strings and derive the hour of day used for grouping later.
energy_0['timestamp'] = pd.to_datetime(energy_0['timestamp'])
energy_0['hour'] = energy_0['timestamp'].dt.hour
print(energy_0.head())

      building_id  meter           timestamp  meter_reading  hour
704             0      0 2016-01-30 08:00:00        43.6839     8
725             0      0 2016-01-31 05:00:00        37.5408     5
737             0      0 2016-01-31 17:00:00        52.5571    17
2366            0      0 2016-04-08 14:00:00        59.3827    14
2923            0      0 2016-05-01 19:00:00       448.0000    19


In [59]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that
# "Restart & Run All" reproduces the same split and therefore the same metrics.
energy_0_train, energy_0_test = train_test_split(
    energy_0, test_size=0.2, random_state=42
)
print(energy_0_train.head())

      building_id  meter           timestamp  meter_reading  hour
8063            0      0 2016-12-01 23:00:00        186.339    23
5514            0      0 2016-08-17 18:00:00        310.565    18
5296            0      0 2016-08-08 16:00:00        316.025    16
8778            0      0 2016-12-31 18:00:00        215.006    18
7255            0      0 2016-10-29 07:00:00        210.911     7


In [60]:
# Per-hour baseline: mean and median meter reading over the training rows.
energy_0_train_hours = energy_0_train.groupby('hour')

# Select the column BEFORE aggregating so only meter_reading is reduced;
# aggregating the whole frame first also averages ids and timestamps and
# fails if any column cannot be aggregated.
hourly_meter_reading = energy_0_train_hours['meter_reading']
energy_0_train_averages = pd.DataFrame({
    "Среднее": hourly_meter_reading.mean(),
    "Медиана": hourly_meter_reading.median(),
})
print(energy_0_train_averages)

         Среднее   Медиана
hour                      
0     238.836757  239.5790
1     239.147773  240.2610
2     240.638328  242.3090
3     239.609180  244.3570
4     239.022647  246.4045
5     240.733725  246.0630
6     237.274081  245.0390
7     238.628381  246.4040
8     242.581172  239.5790
9     234.781514  235.4830
10    234.518965  238.2140
11    235.880467  241.6260
12    235.628473  242.6500
13    237.244396  245.0390
14    234.629169  241.6260
15    235.713419  242.3090
16    234.273124  241.6260
17    236.338387  240.6025
18    236.169610  238.8960
19    237.659319  237.8725
20    236.432311  236.8480
21    235.460808  236.8480
22    240.226905  238.8960
23    240.348907  239.5790


In [61]:
def calculate_model(x, averages=None):
    """Return a copy of row ``x`` extended with squared log-errors of three baselines.

    The baselines predict the meter reading for the row's hour as the training
    mean, the training median, or zero. Each error is
    ``(log(actual + 1) - log(predicted + 1)) ** 2``.

    Parameters
    ----------
    x : pandas.Series
        One row with at least ``meter_reading`` and ``hour`` entries.
    averages : pandas.DataFrame, optional
        Table indexed by hour with columns ``'Среднее'`` (mean) and
        ``'Медиана'`` (median). When omitted, the notebook-level
        ``energy_0_train_averages`` table is used.

    Returns
    -------
    pandas.Series
        A new Series holding every entry of ``x`` plus
        ``meter_reading_mean_q``, ``meter_reading_median_q`` and
        ``meter_reading_zero_q``. The input row is left untouched.

    Raises
    ------
    KeyError
        If the row's hour is not present in the index of ``averages``.
    """
    if averages is None:
        # Fall back to the table built earlier in the notebook.
        averages = energy_0_train_averages

    hour = x['hour']
    meter_reading_log = np.log(x['meter_reading'] + 1)
    meter_reading_mean = np.log(averages.loc[hour, 'Среднее'] + 1)
    meter_reading_median = np.log(averages.loc[hour, 'Медиана'] + 1)

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    scored = x.copy()
    scored['meter_reading_mean_q'] = (meter_reading_log - meter_reading_mean)**2
    scored['meter_reading_median_q'] = (meter_reading_log - meter_reading_median)**2
    # log(0 + 1) == 0, so the zero baseline's error is just the squared log reading.
    scored['meter_reading_zero_q'] = meter_reading_log**2
    return scored

# Score every held-out row: each row goes through calculate_model and the
# returned Series are expanded back into DataFrame columns.
energy_0_test = energy_0_test.apply(
    calculate_model,
    axis=1,
    result_type="expand",
)
print(energy_0_test.head())

      building_id  meter           timestamp  meter_reading  hour  \
3464            0      0 2016-05-24 08:00:00       256.6430     8   
8543            0      0 2016-12-21 23:00:00       176.7830    23   
5262            0      0 2016-08-07 06:00:00       322.1680     6   
6655            0      0 2016-10-04 07:00:00       249.1350     7   
725             0      0 2016-01-31 05:00:00        37.5408     5   

      meter_reading_mean_q  meter_reading_median_q  meter_reading_zero_q  
3464              0.003150                0.004696             30.819984  
8543              0.093440                0.091497             26.838240  
5262              0.092873                0.074356             33.387275  
6655              0.001841                0.000121             30.492493  
725               3.371334                3.451889             13.335040  


In [62]:
# RMSLE of each baseline on the held-out rows: square root of the mean of the
# per-row squared log-errors stored in the *_q columns.
energy_0_test_rows = len(energy_0_test)
energy_0_test_mean_rmsle = np.sqrt(energy_0_test['meter_reading_mean_q'].sum() / energy_0_test_rows)
energy_0_test_median_rmsle = np.sqrt(energy_0_test['meter_reading_median_q'].sum() / energy_0_test_rows)
energy_0_test_zero_rmsle = np.sqrt(energy_0_test['meter_reading_zero_q'].sum() / energy_0_test_rows)

# These values are quality scores (lower is better), not counts, so the
# labels read "Качество" (quality) rather than "Количество" (quantity).
print(f'Качество среднего: {energy_0_test_mean_rmsle}')
print(f'Качество медианы: {energy_0_test_median_rmsle}')
print(f'Качество нуля: {energy_0_test_zero_rmsle}')

Количество среднего: 0.2448269278489756
Количество медианы: 0.2464298984016869
Количество нуля: 5.458440306885124
