In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
building = pd.read_csv("../../dataset/building_metadata.csv.gz")
weather = pd.read_csv("../../dataset/weather_train.csv.gz")
energy_0 = pd.read_csv("../../dataset/train.0.0.csv.gz")
print(energy_0.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   building_id    8784 non-null   int64  
 1   meter          8784 non-null   int64  
 2   timestamp      8784 non-null   object 
 3   meter_reading  8784 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 274.6+ KB
None


In [3]:
energy_0 = pd.merge(left=energy_0, right=building, how='left', left_on='building_id', right_on='building_id')
energy_0.set_index(['timestamp', 'site_id'], inplace=True)
weather.set_index(['timestamp', 'site_id'], inplace=True)
energy_0 = pd.merge(left=energy_0, right=weather, how='left', left_index=True, right_index=True)
energy_0.reset_index(inplace=True)
energy_0 = energy_0[energy_0['meter_reading'] > 0]
energy_0['timestamp'] = pd.to_datetime(energy_0['timestamp'])
energy_0['hour'] = energy_0['timestamp'].dt.hour
print(energy_0.head())

               timestamp  site_id  building_id  meter  meter_reading  \
704  2016-01-30 08:00:00        0            0      0        43.6839   
725  2016-01-31 05:00:00        0            0      0        37.5408   
737  2016-01-31 17:00:00        0            0      0        52.5571   
2366 2016-04-08 14:00:00        0            0      0        59.3827   
2923 2016-05-01 19:00:00        0            0      0       448.0000   

     primary_use  square_feet  year_built  floor_count  air_temperature  \
704    Education         7432      2008.0          NaN              8.3   
725    Education         7432      2008.0          NaN             12.8   
737    Education         7432      2008.0          NaN             20.6   
2366   Education         7432      2008.0          NaN             21.7   
2923   Education         7432      2008.0          NaN             31.1   

      cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
704              NaN              6.

In [4]:
energy_0_train, energy_0_test = train_test_split(energy_0, test_size=0.2)
print(energy_0_train.head())

               timestamp  site_id  building_id  meter  meter_reading  \
5679 2016-08-24 15:00:00        0            0      0        243.674   
6935 2016-10-15 23:00:00        0            0      0        264.833   
4454 2016-07-04 14:00:00        0            0      0        253.230   
6053 2016-09-09 05:00:00        0            0      0        253.230   
6410 2016-09-24 02:00:00        0            0      0        238.896   

     primary_use  square_feet  year_built  floor_count  air_temperature  \
5679   Education         7432      2008.0          NaN             26.1   
6935   Education         7432      2008.0          NaN             26.1   
4454   Education         7432      2008.0          NaN             30.6   
6053   Education         7432      2008.0          NaN             24.4   
6410   Education         7432      2008.0          NaN             27.2   

      cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
5679             NaN             23.

In [5]:
energy_0_train_averages = energy_0_train.groupby('hour')['meter_reading'].mean()
energy_0_train_lr = pd.DataFrame(energy_0_train, columns=['meter_reading', 'air_temperature', 'dew_temperature'])
y = energy_0_train_lr['meter_reading']
x = energy_0_train_lr.drop(labels=['meter_reading'], axis=1)
model = LinearRegression().fit(x,y)
print(model.coef_, model.intercept_)

[2.27726879 4.09098108] 101.55332808469674


In [6]:
def calculate_model(x):
    meter_reading_log = np.log(x.meter_reading + 1)
    meter_reading_mean= np.log(energy_0_train_averages[x.hour]+1)
    meter_reading_lr = np.log(1 + x.air_temperature * model.coef_[0] +
                                  x.dew_temperature * model.coef_[1] +
                                  model.intercept_)
    x['meter_reading_lr_q'] = (meter_reading_log - meter_reading_lr)**2
    x['meter_reading_mean_q'] = (meter_reading_log - meter_reading_mean)**2
    return x
energy_0_test = energy_0_test.apply(calculate_model, axis=1, result_type='expand')
energy_0_test_lr_rmsle = np.sqrt(energy_0_test['meter_reading_lr_q'].sum()/len(energy_0_test))
energy_0_test_mean_rmsle = np.sqrt(energy_0_test['meter_reading_mean_q'].sum()/len(energy_0_test))
print(f'Количество среднего: {energy_0_test_mean_rmsle}')
print(f'Количество линейной регрессии: {energy_0_test_lr_rmsle}')

Количество среднего: 0.2683832121191359
Количество линейной регрессии: 0.23163338137714495
