In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import numpy as np
import time

In [None]:
# Read the three CSV inputs: labelled training rows, unlabelled test
# features, and the sample submission file.
train_df = pd.read_csv('data/train.csv')
test_dfX = pd.read_csv('data/test.csv')
test_dfY = pd.read_csv('data/sample_submission.csv')

# Number of training rows, followed by train and test stacked into one frame
# with a fresh 0..N-1 index (training rows come first).
# NOTE(review): presumably n_train is meant for splitting filter_df back into
# its train/test parts later — neither name is used in the visible cells.
n_train = len(train_df)
filter_df = pd.concat([train_df, test_dfX], ignore_index=True)

In [None]:
# Exploratory check (disabled): uncomment to inspect dtypes and the number of
# missing values in each of the selected feature columns.
#train_df.info()
#train_df[['MSSubClass', 'MSZoning', 'LotArea', 'Utilities', 'Neighborhood',
#'Condition1', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Exterior1st',
#'ExterCond', 'Foundation', 'HeatingQC', 'Electrical', '1stFlrSF', 'Functional',
#'GarageQual', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal']].isnull().sum()

In [None]:
# Columns of train_df that are used as model inputs.
feature_cols = [
    'MSSubClass', 'MSZoning', 'LotArea', 'Utilities', 'Neighborhood',
    'Condition1', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Exterior1st',
    'ExterCond', 'Foundation', 'HeatingQC', 'Electrical', '1stFlrSF', 'Functional',
    'GarageQual', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',
]
# Columns whose missing entries are replaced by the literal string 'NA'.
na_fill_cols = ['Electrical', 'GarageQual', 'PoolQC', 'Fence', 'MiscFeature']

X_train = train_df[feature_cols].copy()

# Constant-value imputation: every missing cell in na_fill_cols becomes 'NA'.
imputer = SimpleImputer(strategy='constant', fill_value='NA')
X_train[na_fill_cols] = imputer.fit_transform(X_train[na_fill_cols])

# MSSubClass is cast to str so that get_dummies one-hot encodes it together
# with the other string columns; drop_first removes one level per encoded
# column.
X_train = pd.get_dummies(
    X_train.astype({'MSSubClass': str}),
    drop_first=True,
)

print("Размер таблицы X_train:", X_train.shape)
print("Пропусков:", X_train.isna().sum().sum())

In [None]:
# Build the test feature matrix with the same preprocessing as X_train and
# guarantee that it ends up with exactly the training columns, in the same
# order, so the fitted model can consume it.
X_test = test_dfX[[
    'MSSubClass', 'MSZoning', 'LotArea', 'Utilities', 'Neighborhood',
    'Condition1', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Exterior1st',
    'ExterCond', 'Foundation', 'HeatingQC', 'Electrical', '1stFlrSF', 'Functional',
    'GarageQual', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'
]].copy()

# Reuse the imputer that was fitted on the training data in the X_train cell:
# preprocessing steps are fitted on train only and merely applied to test.
X_test[[
    'Electrical', 'GarageQual', 'PoolQC', 'Fence', 'MiscFeature'
]] = imputer.transform(X_test[[
    'Electrical', 'GarageQual', 'PoolQC', 'Fence', 'MiscFeature'
]])

X_test['MSSubClass'] = X_test['MSSubClass'].astype(str)

# One-hot encode WITHOUT drop_first, then align to the training columns.
# Encoding the test frame independently with drop_first=True would drop
# whichever level happens to sort first in the test data (not necessarily the
# level dropped in train) and would create/omit columns for categories that
# occur in only one of the two files. Reindexing to X_train.columns:
#   * discards the training baseline levels and categories unseen in train,
#   * adds all-zero columns for training categories absent from test.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

assert list(X_test.columns) == list(X_train.columns), \
    "X_test columns must match X_train columns"

print("Размер таблицы X_test:", X_test.shape)
print("Пропусков:", X_test.isnull().sum().sum())

In [None]:
# Target values, kept as single-column DataFrames (shape (n, 1)).
Y_train = train_df.loc[:, ['SalePrice']].copy()
# NOTE(review): Y_test is read from sample_submission.csv — confirm that this
# file really holds true sale prices and not placeholder predictions before
# treating any metric computed against it as a test error.
Y_test = test_dfY.loc[:, ['SalePrice']].copy()

In [None]:
# Random forest regressor for the sale price.
# oob_score=True additionally stores out-of-bag predictions for the training
# rows (each row is predicted only by trees that did not see it), which gives
# an error estimate that needs no separate labelled hold-out set.
housePriceModel = RandomForestRegressor(
    n_estimators=50,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

# Y_train is an (n, 1) DataFrame; flatten it to 1-D because passing a column
# vector as y to a single-output regressor triggers a DataConversionWarning.
y_train_1d = np.ravel(Y_train)

# perf_counter is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
housePriceModel.fit(X_train, y_train_1d)
end = time.perf_counter()

print(f"\nОбучение заняло: {end - start:.3f} секунд")

predictions = housePriceModel.predict(X_test)

# NOTE(review): Y_test is loaded from sample_submission.csv. If that file
# contains placeholder predictions rather than true prices, this value only
# measures the distance to that baseline, not the real model error — confirm.
mae = mean_absolute_error(Y_test, predictions)

print(f"Средняя ошибка: {mae:,.2f} долларов")

# Out-of-bag MAE: computed against the known training targets.
oob_mae = mean_absolute_error(y_train_1d, housePriceModel.oob_prediction_)

print(f"Средняя ошибка (out-of-bag): {oob_mae:,.2f} долларов")

# Importance of each (one-hot encoded) input column as reported by the forest.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': housePriceModel.feature_importances_
})
print("\nВажность признаков:")
print(feature_importance.sort_values('importance', ascending=False).head(10))