In [1]:
#Получите данные по энергопотреблению первых 20 зданий (building_id от 0 до 19).

#Заполните отсутствующие значения по погоде интерполяционными данными.

#Разделите данные на обучающие/проверочные в пропорции 80/20.

#Постройте и найдите общее качество модели линейной регрессии, построенной по часам для каждого из первых 20 зданий по следующим параметрам: air_temperature, dew_temperature, cloud_coverage, wind_speed, precip_depth_1_hr, sea_level_pressure, is_holiday. Всего требуется построить 480 моделей линейной регрессии, вычислить по ним проверочные значения энергопотребления и получить итоговую оценку качества такой модели.

#Для расчета последнего параметра (is_holiday) используйте график публичных выходных в США: USFederalHolidayCalendar

#Исходные данные:

#video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz

#video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz

#video.ittensive.com/machine-learning/ashrae/train.0.csv.gz

In [2]:

import pandas as pd 
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
# Загрузка данных, отсечение 20 зданий, объединение и оптимизация
def reduce_mem_usage (df):
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == "float":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo("f2").min and c_max < np.finfo("f2").max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo("f4").min and c_max < np.finfo("f4").max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo("i1").min and c_max < np.iinfo("i1").max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo("i2").min and c_max < np.iinfo("i2").max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo("i4").min and c_max < np.iinfo("i4").max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo("i8").min and c_max < np.iinfo("i8").max:
                df[col] = df[col].astype(np.int64)
        elif col == "timestamp":
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != "datetime":
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    print('Потребление памяти меньше на', round(start_mem - end_mem, 2), 'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

In [4]:

buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")
# отсечение 20ти зданий
energy = energy[energy["building_id"]<20]

energy = pd.merge(left=energy, right=buildings, how="left",
                   left_on="building_id", right_on="building_id")
energy = energy.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy = pd.merge(left=energy, right=weather, how="left",
                  left_index=True, right_index=True)
energy.reset_index(inplace=True)
energy = energy.drop(columns=["meter", "site_id", "floor_count", "year_built", "square_feet", "floor_count"], axis=1)
del buildings
del weather
print (energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   timestamp           175680 non-null  object 
 1   building_id         175680 non-null  int64  
 2   meter_reading       175680 non-null  float64
 3   primary_use         175680 non-null  object 
 4   air_temperature     175620 non-null  float64
 5   cloud_coverage      99080 non-null   float64
 6   dew_temperature     175620 non-null  float64
 7   precip_depth_1_hr   175660 non-null  float64
 8   sea_level_pressure  173980 non-null  float64
 9   wind_direction      170680 non-null  float64
 10  wind_speed          175680 non-null  float64
dtypes: float64(8), int64(1), object(2)
memory usage: 14.7+ MB
None


In [5]:
# добавление часа и праздников в данные используя дату и время
energy["hour"] = energy["timestamp"].dt.hour.astype("int8") #pd.to_datetime(energy["timestamp"], format='%d-%m-%Y %H:%M', errors='coerce') 
dates_range = pd.date_range(start='2015-12-31', end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())
energy["date"] = pd.to_datetime(energy["timestamp"].dt.date) #format='%d-%m-%Y %H:%M', errors='coerce' strftime("%d/%m/%Y %H:%M")
energy["is_holiday"] = energy["date"].isin(us_holidays).astype("int8")
print(energy)


AttributeError: Can only use .dt accessor with datetimelike values

In [6]:
# интерполяция метеорологических данных
energy["precip_depth_1_hr"] = energy["precip_depth_1_hr"].apply(lambda x:x if x>0 else 0)
interpolate_columns = ["air_temperature", "dew_temperature",
                       "cloud_coverage", "wind_speed",
                       "precip_depth_1_hr", "sea_level_pressure"]
for col in interpolate_columns:
    energy[col] = energy[col].interpolate(limit_direction='both',
                            kind='cubic')

In [7]:
# разделение данных на обучающие и проверочные
energy_train, energy_test = train_test_split(energy[energy["meter_reading"] > 0], test_size=0.2)
print(energy_train.head())

                  timestamp  building_id  meter_reading primary_use  \
126954  2016-09-21 11:00:00           14        559.017   Education   
161189  2016-12-01 19:00:00            9        153.167      Office   
114829  2016-08-27 05:00:00            9        156.033      Office   
89104   2016-07-04 15:00:00            4       1667.490   Education   
86713   2016-06-29 15:00:00           13        424.211   Education   

        air_temperature  cloud_coverage  dew_temperature  precip_depth_1_hr  \
126954             25.0        6.000000             22.8                0.0   
161189             30.0        5.672131             19.4                0.0   
114829             26.7        7.782178             22.8                0.0   
89104              31.1        2.000000             23.9                0.0   
86713              28.9        4.000000             22.8                0.0   

        sea_level_pressure  wind_direction  wind_speed  
126954              1015.2             0.

In [8]:
# построение массива моделей линейной регрессии по зданию и часу всего 480 моделей
hours = range(0, 24)
buildings = range(0, energy_train["building_id"].max()+1)
lr_columns = ["meter_reading", "hour", "building_id", 
    "air_temperature", "dew_temperature",
    "sea_level_pressure", "wind_speed","cloud_coverage", 
    "is_holiday"]
energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns)
energy_lr = [[]]*len(buildings)
for building in buildings:
    energy_lr[building] = [[]]*len(hours)
    energy_train_b = energy_train_lr[energy_train_lr["building_id"]==building]
    for hour in hours:
        energy_train_bh = energy_train_b[energy_train_b["hour"]==hour]
        y = energy_train_bh["meter_reading"]
        x = energy_train_bh.drop(labels=["meter_reading", 
            "hour", "building_id"], axis=1)
        model = LinearRegression().fit(x, y)
        energy_lr[building][hour] = model.coef_
        energy_lr[building][hour] = np.append(energy_lr[building][hour],
                                            model.intercept_)
print(energy_lr)


ValueError: Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required.

In [9]:
# расчет качества посмотроения моделей 
def calculate_model (x):
    model = energy_lr[x.building_id][x.hour]
    lr = np.sum([x[col]*model[i] for i, col in enumerate](lr_columns[3:]))
    lr += model[len(lr_columns)-3]
    x["meter_reading_lr_q"] = (np.log(x.meter_reading + 1) - np.log(1 + lr))**2
    return x

energy_test = energy_test.apply(calculate_model, axis = 1, result_type="expand")
energy_test_lr_rmsle = np.sqrt(energy_test["meter_reading_lr_q"].sum() / len(energy_test))
print("Качество линейной регрессии, 20 здаений: ", energy_test_lr_rmsle, round(energy_test_lr_rmsle, 1))

  lr = np.sum([x[col]*model[i] for i, col in enumerate](lr_columns[3:]))
  lr = np.sum([x[col]*model[i] for i, col in enumerate](lr_columns[3:]))
  lr = np.sum([x[col]*model[i] for i, col in enumerate](lr_columns[3:]))


AttributeError: 'Series' object has no attribute 'hour'