In [1]:
# IMPORT
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
#model
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor,GradientBoostingRegressor,\
                             RandomForestRegressor,  GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
# measure
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import set_config
set_config(display='diagram')

In [2]:
# READ DATA
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# TÁCH CÁC TẬP:
X = train.drop(['SalePrice'], axis=1)
y = train['SalePrice']

In [3]:
class Preprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, tr_X, y=None):
        return self
    def transform(self, X_df, y=None):
        df = X_df.copy()
        # Chuyển đổi thuộc tính dạng numberic sang Categorical.
        for col in ('MSSubClass', 'YrSold', 'MoSold'):
            df[col] = df[col].astype(str)
        # Xóa các thuộc tính không cần thiết, và thêm thuộc tính mới vào.
        df['HasPool'] = df['PoolQC'].isna().apply(lambda x : 0 if x == True else 1)
        df['SumFlrSF'] = df['1stFlrSF'] + df['2ndFlrSF']
        df['SumYrBultRd'] = df['YearBuilt'] + df['YearRemodAdd']
        df['SumBsmtFin'] = df['BsmtFinSF1'] + df['BsmtFinSF2']

        df.drop(columns=['Id','Street','Utilities','PoolQC'], inplace = True)
        
        # chuyển các thuộc categorical sang dạng nominal có thứ tự.
        col1s = ['HeatingQC','KitchenQual','ExterQual','ExterCond','FireplaceQu','BsmtQual','BsmtCond','GarageQual','GarageCond']
        for col in col1s:
            df[col].replace(to_replace=['Ex','Gd','TA','Fa','Po',np.nan], value=[5,4,3,2,1,0],inplace = True)
        df['Alley'].replace(to_replace=['Grvl','Pave',np.nan], value=[1,2,0],inplace = True)
        df['LotShape'].replace(to_replace=['Reg','IR1','IR2','IR3'],value=[4,3,2,1],inplace = True)
        df['LandContour'].replace(to_replace=['Lvl','Bnk','HLS','Low'], value=[2,3,1,0],inplace = True)
        df['LandSlope'].replace(to_replace=['Gtl','Mod','Sev'], value=[2,1,0],inplace = True)
        df['RoofMatl'].replace(to_replace=['ClyTile','CompShg','Membran','Metal',\
                                                     'Roll','Tar&Grv','WdShake','WdShngl'], value=[2,4,1,1,1,3,1,1],inplace = True)
        df['BsmtExposure'].replace(to_replace=['Gd','Av','Mn','No',np.nan], value=[4,3,2,1,0],inplace = True)
        df['BsmtFinType1'].replace(to_replace=['GLQ','ALQ','BLQ','Rec','LwQ','Unf',np.nan],\
                                                 value=[6,5,4,3,2,1,0],inplace = True)
        df['BsmtFinType2'].replace(to_replace=['GLQ','ALQ','BLQ','Rec','LwQ','Unf',np.nan]\
                                                 , value=[6,5,4,3,2,1,0],inplace = True)
        df['Functional'].replace(to_replace=['Typ','Min1','Min2','Mod','Maj1','Maj2','Sev','Sal'],\
                                             value=[5,2,2,3,4,4,0,1],inplace = True)
        df['GarageType'].replace(to_replace=['2Types','Attchd','Basment','BuiltIn','CarPort','Detchd',np.nan],\
                                             value=[1,2,2,2,2,1,0],inplace = True)
        df['GarageFinish'].replace(to_replace=['Fin','RFn','Unf',np.nan],value=[3,2,1,0],inplace = True)
        df['PavedDrive'].replace(to_replace=['Y','P','N'], value=[2,1,0],inplace = True)
        df['Fence'].replace(to_replace=['GdPrv','MnPrv','GdWo','MnWw',np.nan], value=[4,3,2,1,0],inplace = True)
        df['RoofStyle'].replace(to_replace=['Flat','Gable','Gambrel','Hip','Mansard','Shed'],\
                                           value=[1,2,1,2,1,1],inplace = True)
        # fillna các cột Categorical
        Cate_cols = df.select_dtypes(include='object').keys()
        Num_cols = df.keys().difference(Cate_cols)
        df[Cate_cols] = df[Cate_cols].fillna('None')
        # fillna các cột GarageYrBlt và các cột numberic còn lại.
        df['GarageYrBlt'].fillna(df['YearBuilt'],inplace = True)
        df[Num_cols] = df[Num_cols].fillna(0)
        return df

In [4]:
cate_cols = ['MSSubClass', 'MSZoning', 'LotConfig', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical',
       'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']
num_cols = ['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
       'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1',
       'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF',
       'EnclosedPorch', 'ExterCond', 'ExterQual', 'Fence', 'FireplaceQu',
       'Fireplaces', 'FullBath', 'Functional', 'GarageArea', 'GarageCars',
       'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt',
       'GrLivArea', 'HalfBath', 'HasPool', 'HeatingQC', 'KitchenAbvGr',
       'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotFrontage',
       'LotShape', 'LowQualFinSF', 'MasVnrArea', 'MiscVal', 'OpenPorchSF',
       'OverallCond', 'OverallQual', 'PavedDrive', 'PoolArea', 'RoofMatl',
       'RoofStyle', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF',
       'WoodDeckSF', 'YearBuilt', 'YearRemodAdd','SumYrBultRd','SumFlrSF']

preprocess = make_pipeline(
    (Preprocess()),
    (make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'),cate_cols),
        (StandardScaler(),num_cols)
    ))
)

In [5]:
def evaluate_model(model, name_model):
    scores = -1*cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
    
    print('\n----Model----: ' + name_model)
    print('Mean Score = {}'.format(scores.mean()))

In [6]:
# List model Using
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(random_state=0),
    'ElasticNet': ElasticNet(random_state=0),
    'SGDRegressor': SGDRegressor(),
    "BaggingRegressor": BaggingRegressor(random_state=0), 
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0), 
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(n_estimators=500,learning_rate=0.05, n_jobs=4),
    'SVR': SVR()
       }

# using default hyperparameter
for name_model, model in models.items():
    my_pipeline = Pipeline(steps=[('preprocessor', preprocess),
                                  ('model', model)])
    evaluate_model(my_pipeline, name_model)


----Model----: LinearRegression
Mean Score = 1995462672317645.5


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



----Model----: Lasso
Mean Score = 20983.29087080622

----Model----: ElasticNet
Mean Score = 19879.804219984093

----Model----: SGDRegressor
Mean Score = 20825.54872742248

----Model----: BaggingRegressor
Mean Score = 19301.219794520548

----Model----: AdaBoostRegressor
Mean Score = 23807.49752761676

----Model----: GradientBoostingRegressor
Mean Score = 15801.110993035958

----Model----: RandomForestRegressor
Mean Score = 17349.11113013699

----Model----: XGBRegressor
Mean Score = 15523.930623929797

----Model----: SVR
Mean Score = 55509.945899303675


In [7]:
# find best hyperparameters
'''parametersGrid = { 
    'subsample':[0.7,0.8,0.9],
    'alpha': [0.001, 0.01, 0.05,0.1],
    'learning_rate': [0.001,0.01, 0.05, 0.1, 0.5],
    'n_estimators':[20, 50, 80, 100,150,200],
                 }

kfold = KFold(n_splits=5)

GBR_grid = GridSearchCV(GradientBoostingRegressor(random_state=0), parametersGrid, scoring='neg_root_mean_squared_error', cv=kfold)
GBR_grid.fit(preprocess.fit_transform(X),y)


print(GBR_grid.best_params_)'''

"parametersGrid = { \n    'subsample':[0.7,0.8,0.9],\n    'alpha': [0.001, 0.01, 0.05,0.1],\n    'learning_rate': [0.001,0.01, 0.05, 0.1, 0.5],\n    'n_estimators':[20, 50, 80, 100,150,200],\n                 }\n\nkfold = KFold(n_splits=5)\n\nGBR_grid = GridSearchCV(GradientBoostingRegressor(random_state=0), parametersGrid, scoring='neg_root_mean_squared_error', cv=kfold)\nGBR_grid.fit(preprocess.fit_transform(X),y)\n\n\nprint(GBR_grid.best_params_)"

In [8]:
'''my_best_GBR_model = GradientBoostingRegressor(random_state=0,
                                              alpha= 0.001,learning_rate= 0.1, n_estimators= 100, subsample= 0.8)
my_model = Pipeline(steps=[('preprocessor',preprocess),
                          ('model', my_best_GBR_model)])
my_model.fit(X, y)'''

"my_best_GBR_model = GradientBoostingRegressor(random_state=0,\n                                              alpha= 0.001,learning_rate= 0.1, n_estimators= 100, subsample= 0.8)\nmy_model = Pipeline(steps=[('preprocessor',preprocess),\n                          ('model', my_best_GBR_model)])\nmy_model.fit(X, y)"

In [9]:
params = { 
    'alpha': [0.005,0.05,0.1],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators':[200, 400, 500],
    'n_jobs': [2,3,4]
                 }

kfold = KFold(n_splits=5)

XGB_grid = GridSearchCV(XGBRegressor(random_state=0), params, scoring='neg_root_mean_squared_error', cv=kfold)
XGB_grid.fit(preprocess.fit_transform(X),y)


print(XGB_grid.best_params_)

{'alpha': 0.1, 'learning_rate': 0.05, 'n_estimators': 500, 'n_jobs': 2}


In [12]:
my_best_GBR_model = XGBRegressor(random_state=0,alpha = 0.1,learning_rate=0.05, 
                                              n_estimators=500,n_jobs = 2)
my_model = Pipeline(steps=[('preprocessor',preprocess),
                          ('model', my_best_GBR_model)])
my_model.fit(X, y)
my_model.score(X,y)

0.9986536638669821

In [14]:
y_pred = my_model.predict(test)
df = pd.DataFrame({"SalePrice": y_pred},index = test.Id)

In [15]:
df

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,125711.031250
1462,156090.234375
1463,185638.171875
1464,192725.406250
1465,186541.859375
...,...
2915,77269.039062
2916,83401.648438
2917,164714.468750
2918,121556.148438


In [None]:
#df.to_csv("4thSumis.csv")