In [1]:
############## ЗАГРУЗКА ДАННЫХ ################
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the training and test tables, using the 'Id' column as the row index
X = pd.read_csv('train_c.csv', index_col='Id')
X_test = pd.read_csv('test_c.csv', index_col='Id')

# Keep only the rows that have a target, then split the target off from the predictors
X = X.dropna(subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [2]:
############## ИЗБАВЛЯЕМСЯ ОТ ПРОПАВШИХ ДАННЫХ ###############
from sklearn.impute import SimpleImputer

def _restore_frame(values, template):
    """Wrap imputer output in a DataFrame with `template`'s column names and dtypes.

    The result gets a fresh default index; `template`'s index is not carried over.
    """
    frame = pd.DataFrame(values, columns=template.columns)
    return frame.astype(template.dtypes.to_dict())


# Replace every missing value with the most frequent value of its column.
# The imputer is fitted on the training split only and then reused unchanged
# for the validation and test data.
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_train = _restore_frame(my_imputer.fit_transform(X_train), X_train)
imputed_X_valid = _restore_frame(my_imputer.transform(X_valid), X_valid)
imputed_X_test = _restore_frame(my_imputer.transform(X_test), X_test)

imputed_X_train.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 79, dtype: object

In [3]:
# Give every imputed frame the column labels of the frame it was built from
for imputed, source in (
    (imputed_X_train, X_train),
    (imputed_X_valid, X_valid),
    (imputed_X_test, X_test),
):
    imputed.columns = source.columns

# The "final" names are aliases of the imputed frames (same objects, no copy)
final_X_train, final_X_valid, final_X_test = (
    imputed_X_train,
    imputed_X_valid,
    imputed_X_test,
)

In [4]:
############# ОБРАБАТЫВАЕМ CATEGORICAL ПЕРЕМЕННЫЕ ##############

In [5]:
#### FIRST, FIND THE PROBLEMATIC COLUMNS ##########

In [6]:
# Boolean mask over the columns: True where the column holds object (string) data
s = final_X_train.dtypes.eq('object')
# Names of the object-dtype columns, in frame order
object_cols = s.index[s.to_numpy()].tolist()

In [7]:
# All categorical (object-dtype) columns, in the order they appear in X_train
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]


def _same_categories(col):
    """Return True when train, test and validation hold exactly the same set of values in `col`."""
    train_values = set(final_X_train[col])
    return train_values == set(final_X_test[col]) and train_values == set(final_X_valid[col])


# Columns that can be safely label encoded: identical value sets in all three splits
good_label_cols = [col for col in object_cols if _same_categories(col)]

# Problematic columns that will be dropped from the dataset.
# Built by filtering object_cols rather than by a set difference, so the list
# keeps column order and prints the same on every run (set iteration order
# for strings can change between interpreter sessions).
good_label_set = set(good_label_cols)
bad_label_cols = [col for col in object_cols if col not in good_label_set]

print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be label encoded: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'BldgType', 'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'CentralAir', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageCond', 'PavedDrive', 'Fence', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['RoofStyle', 'HeatingQC', 'Exterior2nd', 'MiscFeature', 'Functional', 'Condition2', 'PoolQC', 'Exterior1st', 'Neighborhood', 'Electrical', 'SaleType', 'GarageQual', 'LandSlope', 'GarageType', 'Heating', 'Utilities', 'HouseStyle', 'BsmtCond', 'Foundation', 'Condition1', 'ExterCond', 'RoofMatl']


In [8]:
from sklearn.preprocessing import LabelEncoder

# Remove the categorical columns that will not be encoded from all three splits;
# each result is a new frame, the final_* frames are left untouched
label_X_train, label_X_valid, label_X_test = (
    frame.drop(columns=bad_label_cols)
    for frame in (final_X_train, final_X_valid, final_X_test)
)

In [9]:
### ТЕПЕРЬ ПРЕОБРАЗУЕМ УНИКАЛЬНЫЕ ОБРАЗЦЫ В ЧИСЛА #########

In [10]:
from sklearn.preprocessing import LabelEncoder
# Object-dtype columns that survived the drop, in frame order
s = label_X_train.dtypes.eq('object')
object_cols = s.index[s.to_numpy()].tolist()

# One encoder instance is reused: for each column it is re-fitted on the
# training values and then applied to all three splits
label_encoder = LabelEncoder()
for col in object_cols:
    label_encoder.fit(final_X_train[col])
    label_X_train[col] = label_encoder.transform(final_X_train[col])
    for encoded, source in ((label_X_valid, final_X_valid), (label_X_test, final_X_test)):
        encoded[col] = label_encoder.transform(source[col])

In [11]:
# Show which columns remain in the training frame after dropping and encoding
label_X_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'LotConfig', 'BldgType', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'CentralAir',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageCond',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'Fence', 'MiscVal', 'MoSold', 'YrSold',
       'SaleCondition'],
      dtype='object')

In [12]:
# Display the label-encoded validation frame
label_X_valid

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,BldgType,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleCondition
0,20,3,60.0,32668,1,0,0,3,1,0,...,0,200,0,0,0,2,0,3,2007,2
1,50,3,79.0,9490,1,0,3,3,4,0,...,0,32,0,0,0,2,0,8,2006,4
2,50,3,60.0,7015,1,0,0,0,0,0,...,0,248,0,0,0,2,0,7,2009,4
3,60,3,83.0,10005,1,0,3,3,4,0,...,117,0,0,0,0,2,0,3,2008,4
4,160,4,21.0,1680,1,0,3,3,4,3,...,0,0,0,0,0,2,0,3,2010,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,120,3,32.0,10846,1,0,0,3,1,4,...,30,0,0,0,0,2,0,5,2008,4
288,20,3,105.0,15431,1,0,3,3,4,0,...,72,0,0,170,0,2,0,4,2009,4
289,50,4,60.0,8520,1,0,3,3,4,0,...,15,0,0,0,0,1,0,8,2007,3
290,30,3,50.0,5330,1,0,3,1,4,0,...,0,0,0,0,0,2,0,12,2009,4


In [13]:
################# ЭТАП НОРМАЛИЗАЦИИ ДАННЫХ ДЛЯ МОДЕЛИ KERAS ######################

In [14]:
# Display the label-encoded test frame
label_X_test

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,BldgType,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleCondition
0,20,2,80.0,11622,1,0,3,3,4,0,...,0,0,0,120,0,2,0,6,2010,4
1,20,3,81.0,14267,1,0,0,3,0,0,...,36,0,0,0,0,2,12500,6,2010,4
2,60,3,74.0,13830,1,0,0,3,4,0,...,34,0,0,0,0,2,0,3,2010,4
3,60,3,78.0,9978,1,0,0,3,4,0,...,36,0,0,0,0,2,0,6,2010,4
4,120,3,43.0,5005,1,0,0,1,4,4,...,82,0,0,144,0,2,0,1,2010,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,4,21.0,1936,1,0,3,3,4,3,...,0,0,0,0,0,2,0,6,2006,4
1455,160,4,21.0,1894,1,0,3,3,4,4,...,24,0,0,0,0,2,0,4,2006,0
1456,20,3,160.0,20000,1,0,3,3,4,0,...,0,0,0,0,0,2,0,9,2006,0
1457,85,3,62.0,10441,1,0,3,3,4,0,...,32,0,0,0,0,2,700,7,2006,4


In [15]:
def _to_float32(table):
    """Return the values of a pandas object as a float32 NumPy array."""
    return np.asarray(table).astype(np.float32)


# Training features and targets
train_data = _to_float32(label_X_train)
train_targets = _to_float32(y_train)

# NOTE: despite the "test_" prefix, these two arrays come from the validation split
test_data = _to_float32(label_X_valid)
test_targets = _to_float32(y_valid)

# Features of the real test set (the rows to predict for the submission)
X_test_c = _to_float32(label_X_test)

In [16]:
# (rows, features) of the float32 training array
train_data.shape

(1168, 57)

In [17]:
# (rows, features) of the float32 array built from the validation split
test_data.shape

(292, 57)

In [18]:
################# MODEL DEFINITION ################

In [19]:
from sklearn.ensemble import RandomForestRegressor

# Candidate random forests; every one uses random_state=0 and differs only in
# the settings listed here.
# NOTE(review): criterion='mae' may not be accepted by newer scikit-learn
# releases (renamed to 'absolute_error') -- confirm against the installed version.
forest_settings = [
    dict(n_estimators=50),
    dict(n_estimators=100),
    dict(n_estimators=100, criterion='mae'),
    dict(n_estimators=200, min_samples_split=20),
    dict(n_estimators=100, max_depth=7),
]

models = [RandomForestRegressor(random_state=0, **settings) for settings in forest_settings]
model_1, model_2, model_3, model_4, model_5 = models

In [20]:
from sklearn.metrics import mean_absolute_error
# Function for comparing different models
def score_model(model, X_t=train_data, X_v=test_data, y_t=train_targets, y_v=test_targets, fit=False):
    """Return the mean absolute error of `model`'s predictions on a validation set.

    Parameters
    ----------
    model : estimator exposing ``predict`` (and ``fit`` when ``fit=True``).
    X_t, y_t : training features and targets; only used when ``fit=True``.
    X_v, y_v : validation features and targets the model is scored on.
    fit : bool, default False
        When True, ``model.fit(X_t, y_t)`` is called before predicting, which
        allows scoring a model that has not been trained yet. The default
        leaves the model untouched, so an already-fitted model (for example
        one trained with early stopping) is scored exactly as it is.

    Returns
    -------
    The value of ``mean_absolute_error(y_v, model.predict(X_v))``.

    Notes
    -----
    The default arrays are bound when this ``def`` is executed; re-run this
    cell after rebuilding ``train_data`` / ``test_data`` to pick up new values.
    """
    if fit:
        model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

In [23]:
from xgboost import XGBRegressor
# Gradient-boosted trees: up to 1000 rounds, stopped early once the metric on
# the evaluation set has not improved for 5 consecutive rounds.
my_model = XGBRegressor(
    max_depth=7,
    learning_rate=0.05,
    n_estimators=1000,
    n_jobs=4,
)

# The validation arrays (named test_*) act as the early-stopping evaluation set
validation_pair = (test_data, test_targets)
my_model.fit(
    train_data,
    train_targets,
    eval_set=[validation_pair],
    early_stopping_rounds=5,
)

[0]	validation_0-rmse:190341.62500
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:181580.73438
[2]	validation_0-rmse:173358.96875
[3]	validation_0-rmse:165586.28125
[4]	validation_0-rmse:158285.65625
[5]	validation_0-rmse:151411.21875
[6]	validation_0-rmse:144822.03125
[7]	validation_0-rmse:138607.57812
[8]	validation_0-rmse:132771.43750
[9]	validation_0-rmse:126932.07031
[10]	validation_0-rmse:121814.23438
[11]	validation_0-rmse:116519.91406
[12]	validation_0-rmse:111521.20312
[13]	validation_0-rmse:107173.65625
[14]	validation_0-rmse:102723.65625
[15]	validation_0-rmse:98772.10156
[16]	validation_0-rmse:94988.50781
[17]	validation_0-rmse:91559.93750
[18]	validation_0-rmse:88159.90625
[19]	validation_0-rmse:85013.76562
[20]	validation_0-rmse:81773.07812
[21]	validation_0-rmse:78970.49219
[22]	validation_0-rmse:76359.21875
[23]	validation_0-rmse:73877.36719
[24]	validation_0-rmse:71526.32812
[25]	validation_0-rmse:69363.75781
[26]	validation_0-rms

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
# Mean absolute error of the fitted XGBoost model on score_model's default validation arrays
score_model(my_model)

17172.227

In [94]:
# Predict sale prices for the real test set.
# `preds_test` was not defined by any earlier cell in this notebook, so a fresh
# Restart & Run All stopped here with a NameError; it is computed explicitly now.
# NOTE(review): my_model is the only fitted model in the notebook -- confirm it
# is the one intended for the submission.
preds_test = my_model.predict(X_test_c)

# One prediction per test row is required for the Ids to line up
assert len(preds_test) == len(X_test), "prediction count does not match test rows"

# Write the submission file: one row per test Id with its predicted SalePrice
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)