In [21]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

Построить регрессионную модель температуры воздуха в помещении в зависимости от типа охлаждения и других значимых факторов. Выбор факторов - на усмотрение исполнителя.

In [22]:
# Read the dataset from a path relative to the notebook's working directory.
# NOTE(review): judging by the file name this is an already-cleaned export — confirm.
data_path = 'data/cleared_data.csv'
df = pd.read_csv(data_path)

# Preview the first five rows.
df.head()

Unnamed: 0,year,season,climate,city,country,cooling_method,mixed_cooling_mode,heating_method,age,gender,...,air_speed,height,weight,curtains,fan_mode,window_status,door_status,heating_mode,average_monthly_outdoor_temperature,number_of_complaints
0,2011,Лето,Cубтропический океанический,Техас,США,Кондиционирование,not applicable,No mechanical heating,42,Unknown,...,0.08,,,False,False,True,True,True,32.8,0
1,2011,Лето,Cубтропический океанический,Техас,США,Кондиционирование,not applicable,No mechanical heating,27,Unknown,...,0.21,,,True,True,True,True,True,28.9,0
2,2011,Лето,Cубтропический океанический,Техас,США,Кондиционирование,not applicable,No mechanical heating,32,Unknown,...,0.1,,,True,True,True,True,True,32.8,0
3,2011,Лето,Cубтропический океанический,Техас,США,Кондиционирование,not applicable,No mechanical heating,43,Unknown,...,0.15,,,True,True,True,True,True,32.8,0
4,2011,Лето,Cубтропический океанический,Техас,США,Кондиционирование,not applicable,No mechanical heating,32,Unknown,...,0.14,,,False,False,True,True,True,32.8,0


In [23]:
# Count missing values in each column.
df.isna().sum()

year                                     0
season                                   0
climate                                  0
city                                     0
country                                  0
cooling_method                           0
mixed_cooling_mode                       0
heating_method                           0
age                                      0
gender                                   0
temperature_feeling                      0
temperature_feeling_bool                 0
preferred_temperature_change             0
air_movement_feeling_bool                0
preferred_air_movement_change            0
comfort_rating                         243
clothing_insulation                      0
indoor_air_temperature                   0
outdoor_air_temperature                335
relative_humidity                        0
air_speed                                0
height                                 280
weight                                 231
curtains   

In [24]:
# Drop every column that contains at least one missing value
# (axis=1 selects columns, so all rows are kept).
# NOTE(review): `df` is rebound here, so the frame as loaded can only be
# recovered by re-running the read_csv cell.
df = df.dropna(axis=1)

In [5]:
# (n_rows, n_columns) of the current frame.
df.shape

(372, 26)

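Всего 372 наблюдения. По эмпирическому правилу «не менее 10 наблюдений на один признак» в модели должно быть не больше 37 признаков (считая столбцы, которые появятся после one-hot кодирования).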

Основываясь на корреляции между параметрами, я выбрал признаки:
- season
- climate
- cooling_method
- heating_method
- relative_humidity
- air_speed
- fan_mode
- window_status
- door_status
- heating_mode


In [6]:
# Column names available for feature selection.
df.columns

Index(['year', 'season', 'climate', 'city', 'country', 'cooling_method',
       'mixed_cooling_mode', 'heating_method', 'age', 'gender',
       'temperature_feeling', 'temperature_feeling_bool',
       'preferred_temperature_change', 'air_movement_feeling_bool',
       'preferred_air_movement_change', 'clothing_insulation',
       'indoor_air_temperature', 'relative_humidity', 'air_speed', 'curtains',
       'fan_mode', 'window_status', 'door_status', 'heating_mode',
       'average_monthly_outdoor_temperature', 'number_of_complaints'],
      dtype='object')

In [7]:
# Predictors used by the model; the target is the indoor air temperature.
feature_columns = [
    'season',
    'climate',
    'cooling_method',
    'heating_method',
    'relative_humidity',
    'air_speed',
    'fan_mode',
    'window_status',
    'door_status',
    'heating_mode',
]
X = df[feature_columns]
y = df['indoor_air_temperature']

In [8]:
# Preview the first five rows of the selected features.
X.head()

Unnamed: 0,season,climate,cooling_method,heating_method,relative_humidity,air_speed,fan_mode,window_status,door_status,heating_mode
0,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating,34.7,0.08,False,True,True,True
1,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating,34.8,0.21,True,True,True,True
2,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating,42.2,0.1,True,True,True,True
3,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating,34.7,0.15,True,True,True,True
4,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating,38.4,0.14,False,True,True,True


In [9]:
# Number of observations in each climate category.
X['climate'].value_counts()

climate
Cубтропический океанический         231
Тропическая влажная саванна          59
Жаркий полузасушливый                47
Влажный субтропический муссонный     19
Субтропическое высокогорье           16
Name: count, dtype: int64

In [10]:
# Count missing values in each selected feature.
X.isna().sum()

season               0
climate              0
cooling_method       0
heating_method       0
relative_humidity    0
air_speed            0
fan_mode             0
window_status        0
door_status          0
heating_mode         0
dtype: int64

In [11]:
# Column dtypes of the selected features.
X.dtypes

season                object
climate               object
cooling_method        object
heating_method        object
relative_humidity    float64
air_speed            float64
fan_mode                bool
window_status           bool
door_status             bool
heating_mode            bool
dtype: object

In [12]:
# Split the feature matrix by dtype: numeric/boolean columns are used as-is,
# string-like columns are one-hot encoded. 'number' matches every numeric
# width, so e.g. int32/float32 columns are not silently left out.
not_categorical = X.select_dtypes(include=['number', 'bool'])
categorical = X.select_dtypes(include=['object', 'category'])

# drop='first' omits the first category of each feature, so the remaining
# dummies are not perfectly collinear with the model's intercept.
# NOTE(review): the encoder is fitted on all rows, i.e. before the
# train/test split performed later in the notebook.
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_cat_encoded = encoder.fit_transform(categorical)

# fit_transform returns a plain array with no row labels. Reattach the
# original index so the column-wise concat below pairs rows correctly;
# with a fresh 0..n-1 index, any non-default index on X (for example after
# row filtering) would misalign rows and introduce NaNs.
X_cat_encoded_df = pd.DataFrame(
    X_cat_encoded,
    columns=encoder.get_feature_names_out(categorical.columns),
    index=categorical.index,
)

final_X = pd.concat([not_categorical, X_cat_encoded_df], axis=1)

In [13]:
# Numeric and boolean feature columns selected by dtype.
not_categorical

Unnamed: 0,relative_humidity,air_speed,fan_mode,window_status,door_status,heating_mode
0,34.7,0.08,False,True,True,True
1,34.8,0.21,True,True,True,True
2,42.2,0.10,True,True,True,True
3,34.7,0.15,True,True,True,True
4,38.4,0.14,False,True,True,True
...,...,...,...,...,...,...
367,51.5,0.12,True,False,True,True
368,50.1,0.12,False,True,True,True
369,40.0,0.01,True,False,False,True
370,61.0,0.16,True,False,False,True


In [14]:
# String-valued feature columns that are passed to the one-hot encoder.
categorical

Unnamed: 0,season,climate,cooling_method,heating_method
0,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating
1,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating
2,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating
3,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating
4,Лето,Cубтропический океанический,Кондиционирование,No mechanical heating
...,...,...,...,...
367,Лето,Влажный субтропический муссонный,Кондиционирование,No mechanical heating
368,Зима,Тропическая влажная саванна,Кондиционирование,No mechanical heating
369,Лето,Тропическая влажная саванна,Смешанный,No mechanical heating
370,Зима,Тропическая влажная саванна,Смешанный,No mechanical heating


In [15]:
# Final feature matrix: numeric/boolean columns followed by the one-hot dummy columns.
final_X

Unnamed: 0,relative_humidity,air_speed,fan_mode,window_status,door_status,heating_mode,season_Зима,season_Лето,season_Осень,climate_Влажный субтропический муссонный,climate_Жаркий полузасушливый,climate_Субтропическое высокогорье,climate_Тропическая влажная саванна,cooling_method_Кондиционирование,cooling_method_Смешанный,heating_method_Механическое отопление
0,34.7,0.08,False,True,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,34.8,0.21,True,True,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,42.2,0.10,True,True,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,34.7,0.15,True,True,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,38.4,0.14,False,True,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,51.5,0.12,True,False,True,True,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
368,50.1,0.12,False,True,True,True,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
369,40.0,0.01,True,False,False,True,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
370,61.0,0.16,True,False,False,True,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [16]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(final_X, y, test_size=0.2, random_state=42)

In [17]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
lin_regr = LinearRegression()

# Fit on the training part only; the test part is kept for evaluation.
lin_regr.fit(X_train, y_train)

In [18]:
def metrics_model(fact, prediction):
    """Print R2, MAE, MSE and MAPE for predictions against observed values.

    Args:
        fact: observed target values.
        prediction: predicted values, in the same order as ``fact``.

    Returns:
        None. Each metric is printed on its own line as ``NAME: value``.
    """
    # Label/scorer pairs, in the order they are printed.
    # scikit-learn reports MAPE as a fraction, not a percentage.
    scorers = (
        ('R2', r2_score),
        ('MAE', mean_absolute_error),
        ('MSE', mean_squared_error),
        ('MAPE', mean_absolute_percentage_error),
    )
    for label, scorer in scorers:
        print(f'{label}: {scorer(fact, prediction)}')

In [19]:
# Predict the target for the held-out test rows.
prediction_test = lin_regr.predict(X_test)

In [20]:
# Print the evaluation metrics for the test set.
metrics_model(y_test, prediction_test)

R2: 0.05775635662962064
MAE: 0.8995613663349673
MSE: 1.4111275317088263
MAPE: 0.036427314781910955


Проанализировав результаты регрессионной модели, можно прийти к выводам:
- по заданным признакам невозможно достаточно точно предсказать температуру в помещении: R² ≈ 0,06, то есть модель объясняет лишь около 6 % дисперсии на тестовой выборке
- возможно, стоит добавить признаки или изменить обработку текущих