Получите данные по энергопотреблению первых 20 зданий (building_id от 0 до 19).

Заполните отсутствующие значения по погоде интерполяционными данными.

Разделите данные на обучающие/проверочные в пропорции 80/20.

Постройте (1) первый набор моделей линейной регрессии по часам для каждого из первых 20 зданий по следующим параметрам: 
* air_temperature,
* dew_temperature, 
* cloud_coverage, 
* wind_speed, 
* sea_level_pressure.

Постройте для этих же 20 зданий (2) второй набор моделей линейной регрессии по часам по параметрам: дни недели и праздники (is_holiday). Требуется построить еще 480 моделей.

Используйте логарифм целевого показателя (meter_reading_log) для обоих наборов моделей.
Данные:
* video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* video.ittensive.com/machine-learning/ashrae/train.0.csv.gz

In [2]:
# библиотеки
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

загрузка данных

In [3]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy_ = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")

функция оптимизаци памяти

In [4]:
def reduce_mem_usage (df):
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == "float":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo("f2").min and c_max < np.finfo("f2").max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo("f4").min and c_max < np.finfo("f4").max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo("i1").min and c_max < np.iinfo("i1").max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo("i2").min and c_max < np.iinfo("i2").max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo("i4").min and c_max < np.iinfo("i4").max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo("i8").min and c_max < np.iinfo("i8").max:
                df[col] = df[col].astype(np.int64)
        elif col == "timestamp":
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != "datetime":
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    print('Потребление памяти меньше на', round(start_mem - end_mem, 2), 'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

обьединяем

In [5]:
weather = weather[weather["site_id"] == 0]# 
energy_ = energy_[energy_["building_id"]<20] #отсекаем 20 зданий
energy_ = pd.merge(left=energy_, right=buildings, how="left",
                   left_on="building_id", right_on="building_id")
del buildings
print (energy_.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175680 entries, 0 to 175679
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   building_id    175680 non-null  int64  
 1   meter          175680 non-null  int64  
 2   timestamp      175680 non-null  object 
 3   meter_reading  175680 non-null  float64
 4   site_id        175680 non-null  int64  
 5   primary_use    175680 non-null  object 
 6   square_feet    175680 non-null  int64  
 7   year_built     175680 non-null  float64
 8   floor_count    0 non-null       float64
dtypes: float64(3), int64(4), object(2)
memory usage: 13.4+ MB
None


заполняем интерполяцией "погоду"

In [6]:
interpolate_columns = ["air_temperature", "dew_temperature",
                       "cloud_coverage", "wind_speed",
                       "sea_level_pressure"]
for col in interpolate_columns:
    weather[col] = weather[col].interpolate(limit_direction='both',
                            kind='cubic')

обьединяем погоду и потреблением и оптимизируем

In [7]:
energy_ = energy_.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy_ = pd.merge(left=energy_, right=weather, how="left",
                  left_index=True, right_index=True)
energy_.reset_index(inplace=True)
energy_ = energy_.drop(columns=["meter", "site_id", "floor_count",
                              "primary_use", "year_built"], axis=1)
del weather
energy_ = reduce_mem_usage(energy_)
print (energy_.info())

Потребление памяти меньше на 9.88 Мб (минус 67.0 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           175680 non-null  datetime64[ns]
 1   building_id         175680 non-null  int8          
 2   meter_reading       175680 non-null  float16       
 3   square_feet         175680 non-null  int32         
 4   air_temperature     175680 non-null  float16       
 5   cloud_coverage      175680 non-null  float16       
 6   dew_temperature     175680 non-null  float16       
 7   precip_depth_1_hr   175660 non-null  float16       
 8   sea_level_pressure  175680 non-null  float16       
 9   wind_direction      170680 non-null  float16       
 10  wind_speed          175680 non-null  float16       
dtypes: datetime64[ns](1), float16(8), int32(1), int8(1)
memory usage: 4.9 MB
None


добавим день недели, час и праздники в данные, добавляем 7 столбцов дня недели и значение 1 в соотвествующем дню столбце, добавляем солбей праздничный день и ставим 1 где дата праздничная. 

In [8]:
energy_["hour"] = energy_["timestamp"].dt.hour.astype("int8")
energy_["weekday"] = energy_["timestamp"].dt.weekday.astype("int8")
for weekday in range(0,7):
    energy_['is_wday' + str(weekday)] = energy_['weekday'].isin([weekday]).astype("int8")
dates_range = pd.date_range(start='2015-12-31', end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(),
                                  end=dates_range.max())
energy_["date"] = pd.to_datetime(energy_["timestamp"].dt.date)
energy_['is_holiday'] = (energy_['date'].isin(us_holidays)).astype("int8")
energy_["meter_reading_log"] = np.log(energy_["meter_reading"] + 1) # логорифм энерго потребления

разделим данные

In [9]:
energy_train, energy_test = train_test_split(energy_[energy_["meter_reading"]>0], test_size=0.2)
print (energy_train.head())

                 timestamp  building_id  meter_reading  square_feet  \
144383 2016-10-27 19:00:00            3     425.000000        23685   
96761  2016-07-20 14:00:00            1     132.125000         2720   
124883 2016-09-17 04:00:00            3     510.500000        23685   
165776 2016-12-11 08:00:00           16    1322.000000        54644   
165725 2016-12-11 06:00:00            5       7.507812         8000   

        air_temperature  cloud_coverage  dew_temperature  precip_depth_1_hr  \
144383        28.296875        6.332031        14.398438                0.0   
96761         30.000000        4.000000        24.406250                0.0   
124883        25.593750        2.000000        23.906250                0.0   
165776        12.203125        0.000000        10.601562                0.0   
165725        12.796875        0.000000        10.601562                0.0   

        sea_level_pressure  wind_direction  ...  is_wday0  is_wday1  is_wday2  \
144383           

In [10]:
print (energy_.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 23 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           175680 non-null  datetime64[ns]
 1   building_id         175680 non-null  int8          
 2   meter_reading       175680 non-null  float16       
 3   square_feet         175680 non-null  int32         
 4   air_temperature     175680 non-null  float16       
 5   cloud_coverage      175680 non-null  float16       
 6   dew_temperature     175680 non-null  float16       
 7   precip_depth_1_hr   175660 non-null  float16       
 8   sea_level_pressure  175680 non-null  float16       
 9   wind_direction      170680 non-null  float16       
 10  wind_speed          175680 non-null  float16       
 11  hour                175680 non-null  int8          
 12  weekday             175680 non-null  int8          
 13  is_wday0            175680 no

массив линейной регресии по часу и зданию 

In [11]:
hours = range(0, 24)
builds = range(0, energy_train["building_id"].max() + 1)
cols = ["meter_reading_log", "hour", "building_id","air_temperature", 
        "dew_temperature", "sea_level_pressure", "wind_speed", "cloud_coverage"]
energy_learn_80 = pd.DataFrame(energy_train, columns=cols)
energy_lr = [[]]*len(builds)
for bldng in builds:
    energy_lr[bldng] = [[]]*len(hours)
    energy_train_b = energy_learn_80[energy_learn_80["building_id"]==bldng]
    for hour in hours:
        energy_train_bh = energy_train_b[energy_train_b["hour"]==hour]
        y = energy_train_bh["meter_reading_log"]
        x = energy_train_bh.drop(labels=["meter_reading_log",
            "hour", "building_id"], axis=1)
        model = LinearRegression(fit_intercept=False).fit(x, y)
        energy_lr[bldng][hour] = model.coef_
        energy_lr[bldng][hour] = np.append(energy_lr[bldng][hour], model.intercept_)
print (energy_lr)

[[array([ 0.02444564,  0.01335726,  0.00455704, -0.00762739, -0.00674671,
        0.        ]), array([ 0.02464436,  0.01092151,  0.00460251, -0.00296813, -0.00886559,
        0.        ]), array([ 0.03100818,  0.00817825,  0.00450703, -0.0031522 , -0.00388271,
        0.        ]), array([ 0.03347856,  0.00587661,  0.00454059, -0.01102236, -0.00917152,
        0.        ]), array([ 0.04968794, -0.01124984,  0.00449808, -0.01365184, -0.00136403,
        0.        ]), array([ 0.04922223, -0.01119986,  0.00452666, -0.01355359, -0.00627879,
        0.        ]), array([ 0.03633233, -0.00251091,  0.00462013, -0.00128073, -0.00195517,
        0.        ]), array([ 0.03670212, -0.00177552,  0.00464328, -0.00402096, -0.00545441,
        0.        ]), array([ 0.05164801, -0.00837163,  0.00451071, -0.01372168, -0.0179385 ,
        0.        ]), array([ 0.04295712, -0.00930951,  0.00465525, -0.0052721 , -0.01019801,
        0.        ]), array([ 0.08042799, -0.03826648,  0.00444838, -0.02113973,

используя функцию предоставленную учебным центром, расчитываем качество построенных моделей по зданию и часу

In [12]:
def calculate_model (x):
    model = energy_lr[x.building_id][x.hour]
    lr = np.sum([x[col] * model[i] for i,col in enumerate(cols[3:])])
    lr += model[len(cols)-3]
    lr = np.exp(lr)
    x["meter_reading_lr_q"] = (np.log(x.meter_reading + 1) -
                               np.log(1 + lr))**2
    return x

energy_test = energy_test.apply(calculate_model,
                                    axis=1, result_type="expand")

In [None]:
построим модели по дате

In [13]:
cols = ["meter_reading_log", "hour", "building_id", "is_holiday"]
for wday in range(0,7):
    cols.append("is_wday" + str(wday))
energy_learn_80 = pd.DataFrame(energy_train, columns=cols)
energy_lr = [[]]*len(builds)
for bldng in builds:
    energy_lr[bldng] = [[]]*len(hours)
    energy_train_b = energy_learn_80[energy_learn_80["building_id"]==bldng]
    for hour in hours:
        energy_train_bh = energy_train_b[energy_train_b["hour"]==hour]
        y = energy_train_bh["meter_reading_log"]
        x = energy_train_bh.drop(labels=["meter_reading_log",
            "hour", "building_id"], axis=1)
        model = LinearRegression(fit_intercept=False).fit(x, y)
        energy_lr[bldng][hour] = model.coef_
        energy_lr[bldng][hour] = np.append(energy_lr[bldng][hour], model.intercept_)

In [None]:
качество модели по дате

In [14]:
def calculate_model_is (x):
    model = energy_lr[x.building_id][x.hour]
    if model[0] != 0:
        lr = np.sum([x[col] * model[i] for i,col in enumerate(cols[3:])])
        lr += model[len(cols)-3]
        lr = np.exp(lr)
    if lr < 0:
        lr = 0
    x["meter_reading_is_q"] = (np.log(x.meter_reading + 1) -
                               np.log(1 + lr))**2
    return x

energy_test = energy_test.apply(calculate_model_is,
                                    axis=1, result_type="expand")

In [26]:
print (energy_test.head())

                 timestamp  building_id  meter_reading  square_feet  \
89468  2016-07-05 09:00:00            8         432.75        60809   
108669 2016-08-14 09:00:00            9         146.25        27000   
102944 2016-08-02 11:00:00            4        1812.00       116607   
152168 2016-11-13 00:00:00            8         502.25        60809   
79501  2016-06-14 15:00:00            1         134.00         2720   

        air_temperature  cloud_coverage  dew_temperature  precip_depth_1_hr  \
89468         26.093750             4.0        25.000000                0.0   
108669        25.593750             7.0        23.906250                0.0   
102944        25.000000             4.0        23.296875                0.0   
152168        21.093750             6.0         8.898438                0.0   
79501         31.703125             4.0        23.296875                0.0   

        sea_level_pressure  wind_direction  ...  is_wday2  is_wday3  is_wday4  \
89468            

In [15]:
energy_test_0_sum = energy_test[energy_test["building_id"] == 0].sum()
if energy_test_0_sum["meter_reading_is_q"] > energy_test_0_sum["meter_reading_lr_q"]:
    print ("Лучше линейная регрессия по погоде")
else:
    print ("Лучше линейная регрессия по дате")

Лучше линейная регрессия по погоде


  energy_test_0_sum = energy_test[energy_test["building_id"] == 0].sum()
