https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

In [336]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.impute import SimpleImputer

In [337]:
complete_train_data = pd.read_csv("train.csv")
final_data = pd.read_csv("test.csv")

In [338]:
complete_train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [339]:
complete_train_data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [340]:
final_data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [341]:
features = ['MSSubClass', 'MSZoning', 'LotArea', 'Street', 'Alley']
complete_train_data = complete_train_data.drop(["LotFrontage"], axis=1)
final_data = final_data.drop(["LotFrontage"], axis=1)


In [342]:
#features = complete_train_data.columns[:-1]
#features
complete_train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF'

In [343]:
for collumn in features:
    try:
        complete_train_data[collumn].fillna(complete_train_data.mean(), inplace = True)
        final_data[collumn].fillna(final_data.mean(), inplace = True)
    except:
        complete_train_data[collumn].fillna(complete_train_data.median(), inplace = True)
        final_data[collumn].fillna(final_data.median(), inplace = True)

  complete_train_data[collumn].fillna(complete_train_data.mean(), inplace = True)
  final_data[collumn].fillna(final_data.mean(), inplace = True)


In [344]:
partial_train, partial_test = train_test_split(complete_train_data, test_size=0.3)

In [345]:
y_partial_train = partial_train["SalePrice"]
X_partial_train = pd.get_dummies(partial_train[features])

In [346]:
y_partial_test = partial_test["SalePrice"]
X_partial_test = pd.get_dummies(partial_test[features])

In [347]:
y_complete_train = complete_train_data["SalePrice"]

In [348]:
X_complete_train = pd.get_dummies(complete_train_data[features])

X_final = pd.get_dummies(final_data[features])

In [349]:

param_distributions = {"n_estimators": [100, 150, 200],
                    "max_depth": [8, 10, 12, 14]}


clf = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=1)

best_clf = HalvingRandomSearchCV(clf, param_distributions)

best_clf.fit(X_partial_train.values, y_partial_train.values)

best_clf.best_params_

         nan         nan         nan         nan         nan         nan
 -0.16539553 -0.2838635  -0.22934206 -0.16539553]
         nan         nan         nan         nan         nan         nan
 -0.16539553 -0.2838635  -0.22934206 -0.16539553 -0.21008505 -0.21003549]


{'n_estimators': 200, 'max_depth': 14}

In [350]:
score = best_clf.score(X_partial_test.values, y_partial_test.values)
print(f'Coefficient of determination of the prediction.: {score}')

Coefficient of determination of the prediction.: 0.3590035895834922


In [351]:
predictions = best_clf.predict(X_final.values)
output = pd.DataFrame({"Id": final_data.Id, "SalePrice": predictions})
output.to_csv("submission.csv", index = False)