In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib.pyplot import rcParams
# Default size (width, height in inches) for every matplotlib figure created in this notebook.
rcParams['figure.figsize'] = 16, 8

In [2]:
# Load the readings, keep only rows with a strictly positive meter reading,
# parse the timestamps and derive the hour of day from them.
# The steps are chained so that each one returns a new frame: adding columns
# to a boolean-filtered slice of another frame is the pattern that triggers
# pandas' SettingWithCopyWarning.
energy_0 = (
    pd.read_csv('../../dataset/train.0.0.csv.gz')
    .loc[lambda df: df['meter_reading'] > 0]
    .assign(timestamp=lambda df: pd.to_datetime(df['timestamp']))
    .assign(hour=lambda df: df['timestamp'].dt.hour)
)
print(energy_0.head())

      building_id  meter           timestamp  meter_reading  hour
704             0      0 2016-01-30 08:00:00        43.6839     8
725             0      0 2016-01-31 05:00:00        37.5408     5
737             0      0 2016-01-31 17:00:00        52.5571    17
2366            0      0 2016-04-08 14:00:00        59.3827    14
2923            0      0 2016-05-01 19:00:00       448.0000    19


In [3]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the split (and everything computed from it) is
# the same on every run over the same data.
energy_0_train, energy_0_test = train_test_split(
    energy_0, test_size=0.2, random_state=42
)
print(energy_0_train.head())

      building_id  meter           timestamp  meter_reading  hour
7612            0      0 2016-11-13 04:00:00        221.149     4
5945            0      0 2016-09-04 17:00:00        236.848    17
5316            0      0 2016-08-09 12:00:00        303.739    12
6618            0      0 2016-10-02 18:00:00        235.483    18
4308            0      0 2016-06-28 12:00:00        247.769    12


In [4]:
# Per-hour mean and median of the training meter readings, indexed by hour.
energy_0_train_hours = energy_0_train.groupby('hour')
# Select the column before aggregating so that only 'meter_reading' is
# reduced, instead of aggregating every column and discarding the rest.
hourly_meter_reading = energy_0_train_hours['meter_reading']
energy_0_train_averages = pd.DataFrame(
    {"Среднее": hourly_meter_reading.mean(),
     "Медиана": hourly_meter_reading.median()})
print(energy_0_train_averages)

         Среднее   Медиана
hour                      
0     237.162856  239.5790
1     240.222883  240.9440
2     240.701830  243.6740
3     241.029573  244.3570
4     241.839398  246.4040
5     236.547261  245.3805
6     237.693697  243.6740
7     240.096361  245.3805
8     243.595687  239.5790
9     234.679617  235.1420
10    236.532546  238.8960
11    236.190235  241.6260
12    237.110643  243.6740
13    237.500814  243.6740
14    235.562514  241.6260
15    237.028661  242.3090
16    238.992558  242.9910
17    234.929695  240.2610
18    234.608409  238.8960
19    237.564556  237.5310
20    236.560229  236.8480
21    238.091554  238.8960
22    238.494608  238.2140
23    238.447223  239.5790


In [5]:
def calculate_model(x, averages=None):
    """Add squared log-errors of three baseline predictions to one row.

    For the row's ``meter_reading`` the function compares ``log(1 + value)``
    against ``log(1 + prediction)`` for three predictions: the per-hour mean,
    the per-hour median and a constant zero.

    Parameters
    ----------
    x : pandas.Series
        One row providing ``meter_reading`` and ``hour``.
    averages : pandas.DataFrame, optional
        Table indexed by hour with the columns ``"Среднее"`` and
        ``"Медиана"``. Defaults to the notebook-level
        ``energy_0_train_averages``.

    Returns
    -------
    pandas.Series
        A copy of ``x`` extended with ``meter_reading_mean_q``,
        ``meter_reading_median_q`` and ``meter_reading_zero_q``.

    Raises
    ------
    KeyError
        If the row's hour is not present in the index of ``averages``.
    """
    if averages is None:
        averages = energy_0_train_averages

    hour = x['hour']
    # log1p(v) is log(1 + v), computed without forming the intermediate sum.
    actual_log = np.log1p(x['meter_reading'])
    mean_log = np.log1p(averages.loc[hour, 'Среднее'])
    median_log = np.log1p(averages.loc[hour, 'Медиана'])

    # Work on a copy: functions passed to DataFrame.apply should not mutate
    # the object they receive.
    result = x.copy()
    result['meter_reading_mean_q'] = (actual_log - mean_log) ** 2
    result['meter_reading_median_q'] = (actual_log - median_log) ** 2
    # log1p(0) == 0, so the error of the constant-zero prediction is just
    # the log-reading itself.
    result['meter_reading_zero_q'] = actual_log ** 2
    return result

# Run calculate_model on every test row (axis=1). The resulting frame, which
# carries the three added *_q columns, replaces energy_0_test.
energy_0_test = energy_0_test.apply(calculate_model, axis=1, result_type="expand")
print(energy_0_test.head())

      building_id  meter           timestamp  meter_reading  hour  \
5940            0      0 2016-09-04 12:00:00       242.3090    12   
8244            0      0 2016-12-09 12:00:00        95.5584    12   
8090            0      0 2016-12-03 02:00:00       169.2750     2   
4113            0      0 2016-06-20 09:00:00       260.7380     9   
4336            0      0 2016-06-29 16:00:00       247.7690    16   

      meter_reading_mean_q  meter_reading_median_q  meter_reading_zero_q  
5940              0.000466                0.000031             30.187687  
8244              0.814664                0.864488             20.886253  
8090              0.122703                0.131415             26.393031  
4113              0.010998                0.010591             30.995319  
4336              0.001290                0.000376             30.432045  


In [6]:
# Square root of the average squared log-error over the test rows, computed
# separately for the mean, median and zero error columns.
test_row_count = len(energy_0_test)
energy_0_test_mean_rmsle = np.sqrt(
    energy_0_test['meter_reading_mean_q'].sum() / test_row_count)
energy_0_test_median_rmsle = np.sqrt(
    energy_0_test['meter_reading_median_q'].sum() / test_row_count)
energy_0_test_zero_rmsle = np.sqrt(
    energy_0_test['meter_reading_zero_q'].sum() / test_row_count)
# NOTE(review): the labels say "Количество" (quantity); "Качество" (quality)
# may have been intended - confirm before changing the printed text.
print(f'Количество среднего: {energy_0_test_mean_rmsle}')
print(f'Количество медианы: {energy_0_test_median_rmsle}')
print(f'Количество нуля: {energy_0_test_zero_rmsle}')

Количество среднего: 0.2629727225722524
Количество медианы: 0.26540346271989473
Количество нуля: 5.449382285232245
