 ### Задание
 Разделите набор данных на обучающие/проверочные в пропорции 80/20.
Загрузите данные и заполните пропущенные значения (нулями и средними). Постройте модель линейной регрессии для каждого часа в отдельности, используя: 
* температуру воздуха (air_temperature), 
* влажность (точка росы, dew_temperature), 
* атмосферное давление (sea_level_pressure),
* скорость ветра (wind_speed)
* облачность (cloud_coverage).


Рассчитайте качество построенной модели по проверочным данным. Используйте данные:
* http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

загрузка данных

In [9]:
# Load the three source tables: building metadata, weather observations
# and meter readings.
build = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy_ = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
energy_.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   building_id    8784 non-null   int64  
 1   meter          8784 non-null   int64  
 2   timestamp      8784 non-null   object 
 3   meter_reading  8784 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 274.6+ KB
None


объединение данных

In [10]:
# Attach the building metadata to the meter readings. how="left" keeps every
# reading row; both tables share the key column name, so on= is sufficient.
energy_ = pd.merge(left=energy_, right=build, how="left", on="building_id")

# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" lines that print(df.info()) produces.
weather.info()
energy_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   site_id             139773 non-null  int64  
 1   timestamp           139773 non-null  object 
 2   air_temperature     139718 non-null  float64
 3   cloud_coverage      70600 non-null   float64
 4   dew_temperature     139660 non-null  float64
 5   precip_depth_1_hr   89484 non-null   float64
 6   sea_level_pressure  129155 non-null  float64
 7   wind_direction      133505 non-null  float64
 8   wind_speed          139469 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 9.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8784 entries, 0 to 8783
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   building_id    8784 non-null   int64  
 1   meter          8784 non-null  

In [11]:
# Key both tables by (timestamp, site_id) so the weather can be joined by index
join_keys = ["timestamp", "site_id"]
energy_.set_index(join_keys, inplace=True)
weather.set_index(join_keys, inplace=True)
# Left join: every meter reading is kept and gets the matching weather columns
energy_ = energy_.merge(weather, how="left", left_index=True, right_index=True)
# Turn the key levels back into ordinary columns
energy_.reset_index(inplace=True)


очищаем данные

In [12]:
# Keep only rows with a positive meter reading. The explicit .copy() makes the
# result an independent frame, so the assignments below are not writes into a
# slice of the unfiltered data.
energy_ = energy_[energy_["meter_reading"] > 0].copy()

# Missing air temperature, dew point (the humidity feature), cloud coverage and
# wind speed become 0. The filled columns are assigned back to the frame
# instead of calling fillna(inplace=True) on a column selection: that chained
# form raises chained-assignment warnings and is a no-op under pandas
# Copy-on-Write.
zero_fill_columns = ["air_temperature", "dew_temperature",
                     "cloud_coverage", "wind_speed"]
energy_[zero_fill_columns] = energy_[zero_fill_columns].fillna(0)

# Missing sea level pressure becomes the mean of the observed values
# (Series.mean() skips NaN by default).
energy_sea_level_pressure_mean = energy_["sea_level_pressure"].mean()
energy_["sea_level_pressure"] = energy_["sea_level_pressure"].fillna(
    energy_sea_level_pressure_mean)

# info() prints its own report and returns None, so no print() wrapper.
energy_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5411 entries, 704 to 8783
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   timestamp           5411 non-null   object 
 1   site_id             5411 non-null   int64  
 2   building_id         5411 non-null   int64  
 3   meter               5411 non-null   int64  
 4   meter_reading       5411 non-null   float64
 5   primary_use         5411 non-null   object 
 6   square_feet         5411 non-null   int64  
 7   year_built          5411 non-null   float64
 8   floor_count         0 non-null      float64
 9   air_temperature     5411 non-null   float64
 10  cloud_coverage      5411 non-null   float64
 11  dew_temperature     5411 non-null   float64
 12  precip_depth_1_hr   5411 non-null   float64
 13  sea_level_pressure  5411 non-null   float64
 14  wind_direction      5236 non-null   float64
 15  wind_speed          5411 non-null   float64
dtypes: f

добавляем час в данные

In [13]:
# Parse the text timestamps into datetime values
parsed_timestamps = pd.to_datetime(energy_["timestamp"])
energy_["timestamp"] = parsed_timestamps
# Hour component of each timestamp; the regressions are fitted per hour
energy_["hour"] = parsed_timestamps.dt.hour
print(energy_.head())

               timestamp  site_id  building_id  meter  meter_reading  \
704  2016-01-30 08:00:00        0            0      0        43.6839   
725  2016-01-31 05:00:00        0            0      0        37.5408   
737  2016-01-31 17:00:00        0            0      0        52.5571   
2366 2016-04-08 14:00:00        0            0      0        59.3827   
2923 2016-05-01 19:00:00        0            0      0       448.0000   

     primary_use  square_feet  year_built  floor_count  air_temperature  \
704    Education         7432      2008.0          NaN              8.3   
725    Education         7432      2008.0          NaN             12.8   
737    Education         7432      2008.0          NaN             20.6   
2366   Education         7432      2008.0          NaN             21.7   
2923   Education         7432      2008.0          NaN             31.1   

      cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
704              0.0              6.

отделяем проверочные данные

In [14]:
# Hold out 20% of the rows for validation. A fixed random_state makes the
# shuffle, and therefore the reported model quality, reproducible between runs.
energy_learn, energy_test = train_test_split(energy_, test_size=0.2,
                                             random_state=42)
print(energy_test.head())

               timestamp  site_id  building_id  meter  meter_reading  \
8552 2016-12-22 08:00:00        0            0      0        176.783   
7564 2016-11-11 04:00:00        0            0      0        176.783   
5765 2016-08-28 05:00:00        0            0      0        236.166   
3550 2016-05-27 22:00:00        0            0      0        174.735   
4208 2016-06-24 08:00:00        0            0      0        302.374   

     primary_use  square_feet  year_built  floor_count  air_temperature  \
8552   Education         7432      2008.0          NaN             11.1   
7564   Education         7432      2008.0          NaN             16.7   
5765   Education         7432      2008.0          NaN             26.7   
3550   Education         7432      2008.0          NaN             27.8   
4208   Education         7432      2008.0          NaN             25.0   

      cloud_coverage  dew_temperature  precip_depth_1_hr  sea_level_pressure  \
8552             4.0              9.

### линейная регрессия
подготовим датафрейм (80%) для линейной регрессии

In [15]:

# Columns kept for training: the target, the five weather features and the
# hour that selects which per-hour model a row belongs to
lr_columns = [
    "meter_reading",
    "air_temperature",
    "dew_temperature",
    "sea_level_pressure",
    "wind_speed",
    "cloud_coverage",
    "hour",
]
energy_learn_80 = pd.DataFrame(data=energy_learn, columns=lr_columns)

создаем список параметров, рассчитываем коэффициенты для каждого часа

In [16]:
# Fit one linear regression per hour of day. energy_lr[hour] ends up holding
# that hour's feature coefficients followed by the intercept as the last item.
hours = range(0, 24)
energy_lr = []
for hour in hours:
    hour_mask = energy_learn_80["hour"] == hour
    hourly_rows = energy_learn_80[hour_mask]
    y = hourly_rows["meter_reading"]
    # Everything except the target and the hour itself is a feature
    x = hourly_rows.drop(columns=["meter_reading", "hour"])
    model = LinearRegression().fit(x, y)
    energy_lr.append(np.append(model.coef_, model.intercept_))
print(energy_lr)

[array([ 4.90530759e+00,  2.05811152e+00, -5.92535214e-01, -1.90001040e+00,
        1.31024942e-03,  6.85581464e+02]), array([ 6.58329478e+00,  2.47889439e-01, -1.16865187e+00, -4.18515159e+00,
        1.31122153e+00,  1.27513797e+03]), array([ 8.22856283e+00, -1.22217405e-01, -7.03103124e-02, -1.91380021e+00,
       -3.79775567e-01,  1.24382487e+02]), array([ 6.22773195e+00,  9.07892780e-01, -9.60442741e-01, -2.54643811e+00,
        5.00817039e+00,  1.05594634e+03]), array([ 7.92171743e+00, -1.36137190e+00, -6.83013916e-01, -3.72681931e+00,
        3.31819409e+00,  7.87257624e+02]), array([ 9.89700625e+00, -2.37664629e+00,  2.42904064e-03, -2.79431674e+00,
        7.38481624e-01,  6.48627762e+01]), array([ 8.89406033e+00, -1.70353620e+00, -5.27579232e-01, -1.55989986e+00,
       -1.40964059e+00,  6.18578226e+02]), array([   4.22411452,    2.7184644 ,    0.61022765,   -1.1603929 ,
          0.95172375, -524.17860824]), array([ 9.98257670e+00, -2.39536715e+00, -9.88590530e-02, -3.791443

оцениваем модель

In [17]:
def calc_model(x):
    """Add the squared log error of the hourly linear model to one row.

    Looks up the coefficient vector for the row's hour in the module-level
    ``energy_lr`` list. Positions 0-4 of that vector are the weights for
    air_temperature, dew_temperature, sea_level_pressure, wind_speed and
    cloud_coverage; position 5 is the intercept.

    Args:
        x: a row (e.g. a pandas Series passed by ``DataFrame.apply(axis=1)``)
            exposing ``hour``, ``meter_reading`` and the five weather fields.

    Returns:
        The same row with an extra ``meter_reading_lr_q`` entry holding
        ``(log(1 + actual) - log(1 + predicted)) ** 2``.
    """
    # int() keeps the list lookup working when the hour arrives as a float
    model = energy_lr[int(x.hour)]
    meter_reading_log = np.log1p(x.meter_reading)
    prediction = (x.air_temperature * model[0] +
                  x.dew_temperature * model[1] +
                  x.sea_level_pressure * model[2] +
                  x.wind_speed * model[3] +
                  x.cloud_coverage * model[4] +
                  model[5])
    # A linear model can predict negative consumption. For values <= -1 the
    # logarithm is undefined (NaN/-inf), which would corrupt or silently drop
    # the row from the aggregated error, so the prediction is clipped at 0.
    meter_reading_lr = np.log1p(np.maximum(prediction, 0))
    x["meter_reading_lr_q"] = (meter_reading_log - meter_reading_lr) ** 2
    return x

# Score every validation row, then fold the squared log errors into RMSLE:
# sqrt(sum of squared log errors / number of validation rows)
energy_test = energy_test.apply(calc_model, axis=1, result_type="expand")
squared_log_error_total = energy_test["meter_reading_lr_q"].sum()
energy_test_lr_rmsle = np.sqrt(squared_log_error_total / len(energy_test))
print("Качество почасовой линейной регрессии, 5 параметров:",
      energy_test_lr_rmsle, round(energy_test_lr_rmsle, 1))

Качество почасовой линейной регрессии, 5 параметров: 0.21777970895488122 0.2
