In [77]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

In [72]:
def add_year_month(frame):
    """Return a copy of ``frame`` with ``date`` replaced by integer ``year`` and ``month``.

    Every ``date`` value is split on '-'; the first piece becomes ``year`` and
    the second piece becomes ``month`` (both appended as the last columns, in
    that order). The ``date`` column is dropped. The input frame is not modified.
    """
    date_parts = frame['date'].str.split('-')
    return frame.assign(
        year=date_parts.str[0].astype('int64'),
        month=date_parts.str[1].astype('int64'),
    ).drop(columns='date')


# Load the raw files and derive the date features through one shared helper,
# so the train and test frames are always transformed the same way.
train_df = add_year_month(pd.read_csv('data/Train.csv')).drop(columns='id')  # id is not a feature
test_df = add_year_month(pd.read_csv('data/Test.csv'))  # id stays: it is written to the submission

# Split into features and target.
X_train = train_df.drop(columns='price')
y_train = train_df['price']
X_test = test_df.drop(columns='id')

## Обработка данных

In [100]:
# Columns handled as continuous values vs. columns handled as category codes.
numeric_features = ['floor', 'area', 'metro_dist', 'n_photos', 'year']
categorical_features = (
    ['street_id', 'build_tech', 'rooms', 'balcon', 'g_lift', 'month']
    + ['kw{}'.format(index) for index in range(1, 14)]  # kw1 .. kw13
)

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then one-hot
# encode (handle_unknown='ignore' is set for categories not seen during fitting).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit the preprocessor on the training features.
preprocessor.fit(X_train)

## Обучение RandomForest

In [10]:
# Baseline: a 100-tree random forest fitted directly on the feature frame
# (the preprocessor is not used in this cell).
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

### Начинаем менять параметры моделей и подключать preprocessor

## Обучение RandomForest

In [68]:
# Random forest placed behind the shared preprocessing step.
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Search history (results as recorded by the author):
#   grid 1: n_estimators [50, 200], max_depth [1, 5, 10],
#           min_samples_split [2, 5, 10], min_samples_leaf [2, 4]
#       Best parameters: max_depth=10, min_samples_leaf=2,
#                        min_samples_split=10, n_estimators=200
#       Training R2 score: 0.6526464108891502
#       Test R2 score: 0.6022031332511567
#   grid 2 (current): narrowed around the best values found by grid 1.
param_grid = dict(
    regressor__n_estimators=[300, 400],
    regressor__max_depth=[5, 10],
    regressor__min_samples_split=[10, 15],
    regressor__min_samples_leaf=[2],
)

# Exhaustive search over the grid with 3-fold cross-validation.
rf_model_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
rf_model_grid.fit(X_train, y_train)

# Note: the score printed below is computed on the training data itself,
# not on held-out data.
print("Best parameters:", rf_model_grid.best_params_)
print("Training R2 score:", rf_model_grid.score(X_train, y_train))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END regressor__max_depth=1, regressor__min_samples_leaf=2, regressor__min_samples_split=2, regressor__n_estimators=50; total time=   1.5s
[CV] END regressor__max_depth=1, regressor__min_samples_leaf=2, regressor__min_samples_split=2, regressor__n_estimators=50; total time=   1.5s
[CV] END regressor__max_depth=1, regressor__min_samples_leaf=2, regressor__min_samples_split=2, regressor__n_estimators=50; total time=   1.7s
[CV] END regressor__max_depth=1, regressor__min_samples_leaf=2, regressor__min_samples_split=2, regressor__n_estimators=50; total time=   2.0s
[CV] END regressor__max_depth=1, regressor__min_samples_leaf=2, regressor__min_samples_split=2, regressor__n_estimators=50; total time=   1.8s
[CV] END regressor__max_depth=1, regressor__min_samples_leaf=2, regressor__min_samples_split=5, regressor__n_estimators=50; total time=   1.6s
[CV] END regressor__max_depth=1, regressor__min_samples_leaf=2, regressor__min_s

### Обучение GradientBoostingRegressor

In [121]:
# Gradient boosting placed behind the shared preprocessing step.
gb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

# Search history (results as recorded by the author):
#   grid 1: n_estimators [100, 250], max_depth [3, 5, 10], learning_rate [0.1, 0.01]
#       Best parameters: learning_rate=0.1, max_depth=10, n_estimators=250
#       Training R2 score: 0.8917769125359671
#       Test R2 score: 0.8062182587934098
#   grid 2 (current, below):
#       Best parameters: learning_rate=0.1, max_depth=10, n_estimators=400
#       Training R2 score: 0.9255214138108229
param_grid = dict(
    regressor__n_estimators=[250, 400],
    regressor__max_depth=[10],
    regressor__learning_rate=[0.1],
)

# Exhaustive search over the grid with 3-fold cross-validation.
gb_model_grid = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=4,
)
gb_model_grid.fit(X_train, y_train)

# Note: the score printed below is computed on the training data itself,
# not on held-out data.
print("Best parameters:", gb_model_grid.best_params_)
print("Training R2 score:", gb_model_grid.score(X_train, y_train))


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3] END regressor__learning_rate=0.1, regressor__max_depth=10, regressor__n_estimators=250;, score=0.797 total time= 2.3min
[CV 2/3] END regressor__learning_rate=0.1, regressor__max_depth=10, regressor__n_estimators=250;, score=0.809 total time= 2.6min
[CV 3/3] END regressor__learning_rate=0.1, regressor__max_depth=10, regressor__n_estimators=250;, score=0.751 total time= 2.6min
[CV 1/3] END regressor__learning_rate=0.1, regressor__max_depth=10, regressor__n_estimators=400;, score=0.817 total time= 3.1min
[CV 3/3] END regressor__learning_rate=0.1, regressor__max_depth=10, regressor__n_estimators=400;, score=0.776 total time= 3.2min
[CV 2/3] END regressor__learning_rate=0.1, regressor__max_depth=10, regressor__n_estimators=400;, score=0.832 total time= 3.2min
Best parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 10, 'regressor__n_estimators': 400}
Training R2 score: 0.9255214138108229


### Обучение CatBoostRegressor

In [165]:
# CatBoost gradient boosting placed behind the shared preprocessing step.
catboost_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', CatBoostRegressor(random_seed=42, verbose=0)),
])

# Search history (results as recorded by the author):
#   grid 1: learning_rate [0.03, 0.1], depth [4, 10], iterations [250, 400]
#       Best parameters: depth=10, iterations=400, learning_rate=0.1
#       Training R2 score: 0.8427896212358248
#   grid 2: learning_rate [0.1], depth [10], iterations [400, 600, 1000]
#       Best parameters: depth=10, iterations=1000, learning_rate=0.1
#       Training R2 score: 0.9010933031862454
#   grid 3 (current): vary only the tree depth.
param_grid = dict(
    regressor__learning_rate=[0.1],
    regressor__depth=[10, 12, 15],
    regressor__iterations=[1000],
)

# Exhaustive search over the grid with 3-fold cross-validation.
catboost_model_grid = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=4,
)
catboost_model_grid.fit(X_train, y_train)

# Note: the score printed below is computed on the training data itself,
# not on held-out data.
print("Best parameters:", catboost_model_grid.best_params_)
print("Training R2 score:", catboost_model_grid.score(X_train, y_train))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV 2/3] END regressor__depth=10, regressor__iterations=400, regressor__learning_rate=0.1;, score=0.819 total time= 1.4min
[CV 3/3] END regressor__depth=10, regressor__iterations=400, regressor__learning_rate=0.1;, score=0.765 total time= 1.4min
[CV 1/3] END regressor__depth=10, regressor__iterations=400, regressor__learning_rate=0.1;, score=0.813 total time= 1.6min
[CV 1/3] END regressor__depth=10, regressor__iterations=600, regressor__learning_rate=0.1;, score=0.835 total time= 2.0min
[CV 3/3] END regressor__depth=10, regressor__iterations=600, regressor__learning_rate=0.1;, score=0.788 total time= 2.0min
[CV 2/3] END regressor__depth=10, regressor__iterations=600, regressor__learning_rate=0.1;, score=0.841 total time= 2.0min
[CV 1/3] END regressor__depth=10, regressor__iterations=1000, regressor__learning_rate=0.1;, score=0.853 total time= 2.4min
[CV 2/3] END regressor__depth=10, regressor__iterations=1000, regressor__learni

### Обучение LinearRegression

In [92]:
# Linear-regression baseline on top of the shared preprocessing step.
linear_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
linear_model.fit(X_train, y_train)

# Only the training score is reported: no y_test is defined in the visible
# cells, so a held-out score cannot be computed here.
print("Training R2 score:", linear_model.score(X_train, y_train))

Training R2 score: 0.8021908349244739


In [108]:
# Gradient boosting that starts from a linear-regression fit instead of the
# default initial estimator.
#
# The preprocessing has to wrap the booster rather than live inside `init`:
# GradientBoostingRegressor validates X into a plain array before it fits
# `init`, so an `init` pipeline whose ColumnTransformer selects columns by
# name (as `linear_model` does) cannot be fitted from inside the booster.
# random_state matches the seed used by the other models in this notebook.
gbr_linear_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(
        n_estimators=250,
        learning_rate=0.1,
        max_depth=10,
        random_state=42,
        init=LinearRegression(),
    )),
])

## Предсказания

In [148]:
# Reload the raw data so the prediction section starts from fresh frames.
train_df = pd.read_csv('data/Train.csv')
test_df = pd.read_csv('data/Test.csv')

# Replace the "date" column by integer "year" / "month" columns
# (first and second '-'-separated piece of the date string).
for frame in (train_df, test_df):
    for position, column in enumerate(('year', 'month')):
        frame[column] = frame['date'].apply(
            lambda stamp, k=position: int(stamp.split('-')[k])
        )
    frame.drop(columns='date', inplace=True)

# The training "id" column is removed; the test "id" stays in test_df.
train_df.drop(columns='id', inplace=True)

# Split into features and target.
X_train = train_df.drop(columns='price')
y_train = train_df['price']
X_test = test_df.drop(columns='id')

In [189]:
def gen_submit(model, FILE=False, FIT=False, filename='submission_test.csv'):
    """Print the train MAE of ``model`` and optionally write a submission file.

    Uses the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``test_df``.

    Parameters
    ----------
    model : object with ``fit`` and ``predict`` methods
        Estimator or pipeline to evaluate.
    FILE : bool, default False
        When true, predict on ``X_test`` and save a CSV with the columns
        ``id`` (taken from ``test_df``) and ``price`` (the predictions).
    FIT : bool, default False
        When true, fit ``model`` on ``X_train`` / ``y_train`` before use.
    filename : str, default 'submission_test.csv'
        Destination of the submission CSV. Pass distinct names to keep the
        submissions of several models instead of overwriting one file.

    Returns
    -------
    None
        The train MAE is printed, not returned.
    """
    if FIT:
        model.fit(X_train, y_train)

    if FILE:
        # Test-set predictions are only needed for the submission file, so
        # they are not computed when nothing is saved.
        submission = pd.DataFrame({'id': test_df['id'], 'price': model.predict(X_test)})
        submission.to_csv(filename, index=False)

    # Quality on the training data, measured with mean absolute error.
    mae = mean_absolute_error(y_train, model.predict(X_train))
    print('Train MAE:', mae)

In [160]:
# Random forest with hand-picked settings behind the shared preprocessor.
rf_settings = dict(
    random_state=42,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=250,
)
final_rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(**rf_settings)),
])
final_rf_model.fit(X_train, y_train)

# Gradient boosting behind the shared preprocessor.
# Recorded by the author:
#   n_estimators=1000 -> Train MAE: 426651.79474762094
#   n_estimators=5000 -> Train MAE: 66043.40044810559, while the test score was 1134643.76260
gb_settings = dict(
    random_state=42,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=5000,
)
final_gb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(**gb_settings)),
])
final_gb_model.fit(X_train, y_train)

# CatBoost gradient boosting behind the shared preprocessor.
# Recorded by the author:
#   iterations=1000  -> Train MAE: 601373.6798611673
#   iterations=10000 -> Train MAE: 313205.59425318876, while the test score was 1056386.50597
catboost_settings = dict(
    random_seed=42,
    verbose=0,
    depth=10,
    iterations=10000,
    learning_rate=0.1,
)
final_catboost_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', CatBoostRegressor(**catboost_settings)),
])
final_catboost_model.fit(X_train, y_train)


In [177]:
# Train MAE of the three grid-search results (no file is written for them).
for searched in (rf_model_grid, gb_model_grid, catboost_model_grid):
    gen_submit(searched)

print("Final models:")
gen_submit(final_rf_model)
gen_submit(final_gb_model, FILE=True)  # also writes the submission CSV
gen_submit(final_catboost_model)
gen_submit(linear_model, FIT=True)  # refitted before it is scored

Train MAE: 1732804.1989514765
Train MAE: 795396.3089723869
Train MAE: 759793.9377891078
Final models:
Train MAE: 1723098.9354048106
Train MAE: 426651.79474762094
Train MAE: 601373.6798611673
Train MAE: 1018673.5445415583


In [185]:
# Same report as the previous cell: train MAE of the grid-search results first,
# then of the hand-configured final models.
# NOTE(review): presumably re-run after the final models were refitted with
# more iterations - the recorded MAE values of two models differ. Confirm.
for searched in (rf_model_grid, gb_model_grid, catboost_model_grid):
    gen_submit(searched)

print("Final models:")
gen_submit(final_rf_model)
gen_submit(final_gb_model, FILE=True)  # also writes the submission CSV
gen_submit(final_catboost_model)
gen_submit(linear_model, FIT=True)  # refitted before it is scored

Train MAE: 1732804.1989514765
Train MAE: 795396.3089723869
Train MAE: 759793.9377891078
Final models:
Train MAE: 1723098.9354048106
Train MAE: 66043.40044810559
Train MAE: 313205.59425318876
Train MAE: 1018673.5445415583


In [186]:
# Сохраняем результат лучшей модели
gen_submit(final_gb_model, FILE=True)
gen_submit(final_catboost_model, FILE=True) # Дала лучший результат на тесте


Train MAE: 66043.40044810559


### Подбор параметров вручную

In [181]:
# объединение пайплайна для обработки признаков с моделью градиентного бустинга CatBoost
final_catboost_model = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('regressor', CatBoostRegressor(
                                    random_seed=42, 
                                    verbose=0,
                                    depth=10,
                                    iterations=10000,
                                    learning_rate=0.1))])
final_catboost_model.fit(X_train, y_train)

In [188]:
# Print the train MAE of the CatBoost pipeline; FILE and FIT keep their
# defaults (False), so nothing is refitted and no submission file is written.
gen_submit(final_catboost_model)

Train MAE: 313205.59425318876


In [183]:
# объединение пайплайна для обработки признаков с моделью градиентного бустинга
final_gb_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', GradientBoostingRegressor(
                                random_state=42, 
                                learning_rate=0.1,
                                max_depth=10,
                                n_estimators=5000))])
final_gb_model.fit(X_train, y_train)
gen_submit(final_gb_model)

Train MAE: 66043.40044810559


In [184]:
# Print the train MAE of the gradient boosting pipeline again; FILE and FIT
# keep their defaults (False), so nothing is refitted and no file is written.
gen_submit(final_gb_model)

Train MAE: 66043.40044810559
