# Предсказание цены

**Подключение библиотек**

In [1]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): 'ignore' suppresses every warning category for the rest of the
# session, including scikit-learn diagnostics (e.g. data-conversion or
# deprecation warnings). Consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

from sklearn.metrics import r2_score as r2, mean_absolute_error as mae, mean_squared_error as mse

## Загрузка Train DataSet для обучения модели

In [2]:
# Relative path to the pre-processed training CSV that is read in the next cell.
LOAD_PREPARED_DATASET_TRAIN = 'save_data/housing_prepared_train.csv'

In [3]:
# Read the prepared training table and preview its first rows.
train = pd.read_csv(LOAD_PREPARED_DATASET_TRAIN)
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7.0,9.0,1969.0,0.08904,1,1,33,7976,5,1143.0,0,11,1,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7.0,9.0,1978.0,7e-05,1,1,46,10309,1,240.0,1,16,1,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8.0,12.0,1968.0,0.049637,1,1,34,7759,0,229.0,1,3,1,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8.0,17.0,1977.0,0.437885,1,1,23,5735,3,1084.0,0,5,1,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11.0,12.0,1976.0,0.012339,1,1,35,5776,1,2078.0,2,4,1,150226.531644


### Подготовка данных к разбиению

Целевым значением для нас будет значение `"Price"` — цена на недвижимость. Вынесем этот столбец в отдельную переменную `y`, а в переменную `X` запишем все столбцы из таблицы `train`, кроме целевого столбца `Price` и идентификатора `Id` (идентификатор не несёт информации о цене).

In [4]:
# Target: keep `Price` as a 1-D Series. Wrapping it in a DataFrame produces a
# column vector of shape (n_samples, 1); scikit-learn regressors expect a 1-D
# target and emit a DataConversionWarning for a column vector.
y = train['Price']

# Features: every column except the target and the row identifier `Id`.
X = train.drop(columns=['Price', 'Id'])

### Разбиение данных на тренировочный и тестовый DataSet

Обычно при обучении моделей машинного обучения используют две выборки: тренировочную и тестовую. Первая нужна для того, чтобы обучить модель. Вторая - для проверки качества обученной модели: мы можем сравнить предсказанную на этих данных цену с реальной, поскольку она у нас тоже имеется.

Разбиение данных на тренировочную и тестовую выборку можно выполнить с помощью функции `train_test_split` из модуля `sklearn.model_selection`.

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

## `GradientBoostingRegressor`

**Подключение библиотек**

In [7]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

### Настройка параметров

In [8]:
# Hyper-parameter grid for the search: 5 x 5 x 5 = 125 candidate combinations.
parameters = [
    dict(
        n_estimators=np.arange(50, 300, 50),  # 50, 100, 150, 200, 250
        max_features=np.arange(5, 10),        # 5 .. 9
        max_depth=np.arange(5, 10),           # 5 .. 9
    )
]

In [9]:
# Cross-validated (5 folds) grid search over `parameters`, scored by R^2.
# n_jobs=-1 asks joblib to use every available processor.
gbr = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    parameters,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

### Тренировка модели с подбором параметров

In [10]:
# Run the grid search on the training split only; the validation split stays unseen.
gbr.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_n...ne,
                         

### Оценка эффективности параметров

In [11]:
# Report the winning parameter combination and its mean cross-validated R^2.
print(f' best_params = {gbr.best_params_}\n best_score = {gbr.best_score_}')

 best_params = {'max_depth': 6, 'max_features': 6, 'n_estimators': 150}
 best_score = 0.7491836893391154


In [12]:
# Convert the raw grid-search results into a DataFrame for inspection.
cv_results = pd.DataFrame(gbr.cv_results_)

# List the available result columns (timings, parameters, per-split and aggregate scores).
cv_results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_max_features', 'param_n_estimators', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')

**Изобразим в виде DataFrame**

In [13]:
# Columns holding the searched hyper-parameter values.
param_columns = list(
    filter(lambda name: name.startswith('param_'), cv_results.columns)
)

# Aggregate cross-validation scores, also used as the sort keys.
score_columns = ['mean_test_score', 'std_test_score']

# Narrow the table to parameters + scores and rank it best-first.
selected_columns = param_columns + score_columns
cv_results = cv_results[selected_columns].sort_values(
    by=score_columns,
    ascending=False,
)

cv_results.head(10)

Unnamed: 0,param_max_depth,param_max_features,param_n_estimators,mean_test_score,std_test_score
32,6,6,150,0.749184,0.019432
46,6,9,100,0.748188,0.020621
33,6,6,200,0.748172,0.018828
31,6,6,100,0.747974,0.018629
47,6,9,150,0.74758,0.020855
41,6,8,100,0.747311,0.021096
22,5,9,150,0.747017,0.022821
56,7,6,100,0.746834,0.022722
27,6,5,150,0.746689,0.019498
17,5,8,150,0.746621,0.023779


**Просмотр r2 для лучшей модели на отложенной выборке**

In [14]:
# Predict on the hold-out split with the fitted search object and compute R^2
# against the true prices.
y_pred = gbr.predict(X_valid)
r2(y_valid, y_pred)

0.7404705990537004

### Обучение модели на лучших параметрах

In [15]:
# Hyper-parameters copied by hand from the grid-search report above
# (best_params_ printed as max_depth=6, max_features=6, n_estimators=150).
best_params = {'n_estimators': 150, 'max_depth': 6, 'max_features': 6}

# Note: from here on `gbr` is a plain GradientBoostingRegressor, no longer the
# GridSearchCV object created earlier.
gbr = GradientBoostingRegressor(random_state=42, **best_params)

# Final fit on the whole labelled table (training and validation rows together).
gbr.fit(X, y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=6,
                          max_features=6, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=150,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

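**Тренировочные данные** (модель выше обучена на всём `X`, включая `X_train`, поэтому это оценка на данных, которые модель уже видела)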

In [16]:
# In-sample R^2: `X_train` is a subset of the `X` this model was just fitted on,
# so this score reflects fit quality, not generalisation.
y_pred = gbr.predict(X_train)
r2(y_train, y_pred)

0.9000936124830651

In [17]:
# Pair each feature name with the importance reported by the fitted model.
feature_importances = pd.DataFrame({
    'feature_name': X_train.columns,
    'importance': gbr.feature_importances_,
})

# Most influential features first.
feature_importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature_name,importance
2,Square,0.316551
1,Rooms,0.15522
11,Social_1,0.092754
13,Social_3,0.069214
0,DistrictId,0.063667
12,Social_2,0.056026
3,LifeSquare,0.043736
8,Ecology_1,0.043211
4,KitchenSquare,0.038218
7,HouseYear,0.030672


# Загрузка Test DataSet для предсказания цены

In [18]:
# Relative path to the pre-processed test CSV.
LOAD_PREPARED_DATASET_TEST = 'save_data/housing_prepared_test.csv'

# Output path for the submission file. The directory name is lower-case to match
# the input paths; the previous 'Save_data/' spelling points at a different
# directory on case-sensitive file systems.
SAVE_RESULT = 'save_data/sample_submission.csv'

In [19]:
# Read the prepared test table (no `Price` column) and preview its first rows.
test = pd.read_csv(LOAD_PREPARED_DATASET_TEST)
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6.0,14.0,1972.0,0.310199,1,1,11,2748,1,1147.0,0,0,1
1,15856,74,2.0,69.263183,58.873706,1.0,6.0,7.0,1977.0,0.075779,1,1,6,1437,3,1147.0,0,2,1
2,5480,190,1.0,13.597819,15.948246,12.0,2.0,5.0,1909.0,0.0,1,1,30,7538,87,4702.0,5,5,1
3,15664,47,2.0,73.046609,51.940842,9.0,22.0,22.0,2007.0,0.101872,1,1,23,4583,3,1147.0,3,3,1
4,14275,27,1.0,47.527111,43.387569,1.0,17.0,17.0,2017.0,0.072158,1,1,2,629,1,1147.0,0,0,0


**Проверка данных**

In [20]:
# Check column dtypes and non-null counts before predicting.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             5000 non-null   int64  
 1   DistrictId     5000 non-null   int64  
 2   Rooms          5000 non-null   float64
 3   Square         5000 non-null   float64
 4   LifeSquare     5000 non-null   float64
 5   KitchenSquare  5000 non-null   float64
 6   Floor          5000 non-null   float64
 7   HouseFloor     5000 non-null   float64
 8   HouseYear      5000 non-null   float64
 9   Ecology_1      5000 non-null   float64
 10  Ecology_2      5000 non-null   int64  
 11  Ecology_3      5000 non-null   int64  
 12  Social_1       5000 non-null   int64  
 13  Social_2       5000 non-null   int64  
 14  Social_3       5000 non-null   int64  
 15  Healthcare_1   5000 non-null   float64
 16  Helthcare_2    5000 non-null   int64  
 17  Shops_1        5000 non-null   int64  
 18  Shops_2 

**Требуемые колонки для обучения модели**

In [21]:
# Feature columns passed to the model at prediction time (everything except `Id`).
# NOTE: 'Helthcare_2' is spelled exactly as it appears in the data files.
columns = [
    'DistrictId',
    'Rooms',
    'Square',
    'LifeSquare',
    'KitchenSquare',
    'Floor',
    'HouseFloor',
    'HouseYear',
    'Ecology_1',
    'Ecology_2',
    'Ecology_3',
    'Social_1',
    'Social_2',
    'Social_3',
    'Healthcare_1',
    'Helthcare_2',
    'Shops_1',
    'Shops_2',
]

### Предсказание цены с использованием самой лучшей модели

In [22]:
# Predict prices for the test rows using the listed feature columns.
# Assumes cells ran in order, so `gbr` is the model refitted on the full `X`.
test_pred = gbr.predict(test[columns])

**Создание DataFrame для загрузки на Kaggle**

In [23]:
# Wrap the predicted values in a single-column frame named `Price`.
Price = pd.DataFrame({'Price': test_pred})

In [24]:
# Attach the predictions to `test` as a new `Price` column (modifies `test` in place).
# Presumably both frames share the default 0..n-1 index so rows line up — verify
# if either frame is ever re-indexed or filtered.
test['Price'] = Price

In [25]:
# Keep only the identifier and the predicted price for the submission.
# Note: `test_pred` is rebound here from the raw prediction array to a DataFrame.
test_pred = test[['Id', 'Price']]
test_pred.head()

Unnamed: 0,Id,Price
0,725,160947.838073
1,15856,217896.212991
2,5480,133405.898924
3,15664,364746.960526
4,14275,142845.165105


### Сохранение результатов

In [26]:
# Write the Id/Price table to SAVE_RESULT as UTF-8 CSV without the index column.
test_pred.to_csv(SAVE_RESULT, index=False, encoding='utf-8')