In [1]:
# IMPORT
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
#model
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor,GradientBoostingRegressor,\
                             RandomForestRegressor,  GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
# measure
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import set_config
set_config(display='diagram')

In [2]:
# READ DATA
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# TÁCH CÁC TẬP:
X = train.drop(['SalePrice'], axis=1)
y = train['SalePrice']

In [48]:
class Preprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, tr_X, y=None):
        return self
    def transform(self, X_df, y=None):
        df = X_df.copy()
        # Chuyển đổi thuộc tính dạng numberic sang Categorical.
        for col in ('MSSubClass', 'YrSold', 'MoSold'):
            df[col] = df[col].astype(str)
        # Xóa các thuộc tính không cần thiết, và thêm thuộc tính mới vào.
        df['HasPool'] = df['PoolQC'].isna().apply(lambda x : 0 if x == True else 1)
        df.drop(columns=['Id','Street','Utilities','PoolQC'], inplace = True)
        
        # chuyển các thuộc categorical sang dạng nominal có thứ tự.
        col1s = ['HeatingQC','KitchenQual','ExterQual','ExterCond','FireplaceQu','BsmtQual','BsmtCond','GarageQual','GarageCond']
        for col in col1s:
            df[col].replace(to_replace=['Ex','Gd','TA','Fa','Po',np.nan], value=[5,4,3,2,1,0],inplace = True)
        df['Alley'].replace(to_replace=['Grvl','Pave',np.nan], value=[1,2,0],inplace = True)
        df['LotShape'].replace(to_replace=['Reg','IR1','IR2','IR3'],value=[4,3,2,1],inplace = True)
        df['LandContour'].replace(to_replace=['Lvl','Bnk','HLS','Low'], value=[2,3,1,0],inplace = True)
        df['LandSlope'].replace(to_replace=['Gtl','Mod','Sev'], value=[2,1,0],inplace = True)
        df['RoofMatl'].replace(to_replace=['ClyTile','CompShg','Membran','Metal',\
                                                     'Roll','Tar&Grv','WdShake','WdShngl'], value=[2,4,1,1,1,3,1,1],inplace = True)
        df['BsmtExposure'].replace(to_replace=['Gd','Av','Mn','No',np.nan], value=[4,3,2,1,0],inplace = True)
        df['BsmtFinType1'].replace(to_replace=['GLQ','ALQ','BLQ','Rec','LwQ','Unf',np.nan],\
                                                 value=[6,5,4,3,2,1,0],inplace = True)
        df['BsmtFinType2'].replace(to_replace=['GLQ','ALQ','BLQ','Rec','LwQ','Unf',np.nan]\
                                                 , value=[6,5,4,3,2,1,0],inplace = True)
        df['Functional'].replace(to_replace=['Typ','Min1','Min2','Mod','Maj1','Maj2','Sev','Sal'],\
                                             value=[5,2,2,3,4,4,0,1],inplace = True)
        df['GarageType'].replace(to_replace=['2Types','Attchd','Basment','BuiltIn','CarPort','Detchd',np.nan],\
                                             value=[1,2,2,2,2,1,0],inplace = True)
        df['GarageFinish'].replace(to_replace=['Fin','RFn','Unf',np.nan],value=[3,2,1,0],inplace = True)
        df['PavedDrive'].replace(to_replace=['Y','P','N'], value=[2,1,0],inplace = True)
        df['Fence'].replace(to_replace=['GdPrv','MnPrv','GdWo','MnWw',np.nan], value=[4,3,2,1,0],inplace = True)
        df['RoofStyle'].replace(to_replace=['Flat','Gable','Gambrel','Hip','Mansard','Shed'],\
                                           value=[1,2,1,2,1,1],inplace = True)
        # fillna các cột Categorical
        Cate_cols = df.select_dtypes(include='object').keys()
        Num_cols = df.keys().difference(Cate_cols)
        df[Cate_cols] = df[Cate_cols].fillna('None')
        # fillna các cột GarageYrBlt và các cột numberic còn lại.
        df['GarageYrBlt'].fillna(df['YearBuilt'],inplace = True)
        df[Num_cols] = df[Num_cols].fillna(0)
        return df

In [63]:
cate_cols = ['MSSubClass', 'MSZoning', 'LotConfig', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical',
       'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']
num_cols = ['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
       'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1',
       'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF',
       'EnclosedPorch', 'ExterCond', 'ExterQual', 'Fence', 'FireplaceQu',
       'Fireplaces', 'FullBath', 'Functional', 'GarageArea', 'GarageCars',
       'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt',
       'GrLivArea', 'HalfBath', 'HasPool', 'HeatingQC', 'KitchenAbvGr',
       'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotFrontage',
       'LotShape', 'LowQualFinSF', 'MasVnrArea', 'MiscVal', 'OpenPorchSF',
       'OverallCond', 'OverallQual', 'PavedDrive', 'PoolArea', 'RoofMatl',
       'RoofStyle', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF',
       'WoodDeckSF', 'YearBuilt', 'YearRemodAdd']

preprocess = make_pipeline(
    (Preprocess()),
    (make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'),cate_cols),
        (StandardScaler(),num_cols)
    ))
)

In [61]:
a = Preprocess().fit_transform(X)

In [62]:
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 77 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   object 
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Alley          1460 non-null   int64  
 5   LotShape       1460 non-null   int64  
 6   LandContour    1460 non-null   int64  
 7   LotConfig      1460 non-null   object 
 8   LandSlope      1460 non-null   int64  
 9   Neighborhood   1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Condition2     1460 non-null   object 
 12  BldgType       1460 non-null   object 
 13  HouseStyle     1460 non-null   object 
 14  OverallQual    1460 non-null   int64  
 15  OverallCond    1460 non-null   int64  
 16  YearBuilt      1460 non-null   int64  
 17  YearRemodAdd   1460 non-null   int64  
 18  RoofStyl

In [71]:
def evaluate_model(model, name_model):
    scores = -1*cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
    
    print('\n----Model----: ' + name_model)
    print('Mean Score = {}'.format(scores.mean()))

In [80]:
# List model Using
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(random_state=0),
    'ElasticNet': ElasticNet(random_state=0),
    'SGDRegressor': SGDRegressor(),
    "BaggingRegressor": BaggingRegressor(random_state=0), 
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0), 
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(n_estimators=500,learning_rate=0.05, n_jobs=4),
    'SVR': SVR()
       }

# using default hyperparameter
for name_model, model in models.items():
    my_pipeline = Pipeline(steps=[('preprocessor', preprocess),
                                  ('model', model)])
    evaluate_model(my_pipeline, name_model)


----Model----: LinearRegression
Mean Score = 2826239638743176.5


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



----Model----: Lasso
Mean Score = 20983.329854552598

----Model----: ElasticNet
Mean Score = 20093.80577481215

----Model----: SGDRegressor
Mean Score = 20626.28526100053

----Model----: BaggingRegressor
Mean Score = 19282.80719178082

----Model----: AdaBoostRegressor
Mean Score = 24204.13038567981

----Model----: GradientBoostingRegressor
Mean Score = 15933.948083874862

----Model----: RandomForestRegressor
Mean Score = 17470.960760273974

----Model----: XGBRegressor
Mean Score = 15632.714905286814

----Model----: SVR
Mean Score = 55512.87644692935


In [74]:
# find best hyperparameters
parametersGrid = { 
    'subsample':[0.7,0.8,0.9],
    'alpha': [0.001, 0.01, 0.05,0.1],
    'learning_rate': [0.001,0.01, 0.05, 0.1, 0.5],
    'n_estimators':[20, 50, 80, 100,150,200],
                 }

kfold = KFold(n_splits=5)

GBR_grid = GridSearchCV(GradientBoostingRegressor(random_state=0), parametersGrid, scoring='neg_root_mean_squared_error', cv=kfold)
GBR_grid.fit(preprocess.fit_transform(X),y)


print(GBR_grid.best_params_)

{'alpha': 0.001, 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 0.8}


In [77]:
my_best_GBR_model = GradientBoostingRegressor(random_state=0,
                                              alpha= 0.001,learning_rate= 0.1, n_estimators= 100, subsample= 0.8)
my_model = Pipeline(steps=[('preprocessor',preprocess),
                          ('model', my_best_GBR_model)])
my_model.fit(X, y)

0.9674227507024442

In [81]:
y_preds = my_model.predict(test)

In [82]:
y_preds

array([124711.86916994, 159280.42899285, 173226.7884282 , ...,
       152988.64071997, 118854.31026496, 217836.40796485])

In [83]:
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [86]:
df = pd.DataFrame({"SalePrice": y_preds},index = test.Id)

In [87]:
df

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,124711.869170
1462,159280.428993
1463,173226.788428
1464,178924.393522
1465,198350.158068
...,...
2915,85954.356476
2916,87845.921344
2917,152988.640720
2918,118854.310265


In [90]:
params = { 
    'alpha': [0.001,0.005,0.01,0.05,0.1]
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'n_estimators':[200, 400, 500],
    'n_jobs': [2,3,4]
                 }

kfold = KFold(n_splits=5)

XGB_grid = GridSearchCV(XGBRegressor(random_state=0), params, scoring='neg_root_mean_squared_error', cv=kfold)
XGB_grid.fit(preprocess.fit_transform(X),y)


print(XGB_grid.best_params_)

{'learning_rate': 0.05, 'n_estimators': 400, 'n_jobs': 2}


In [92]:
my_best_GBR_model = XGBRegressor(random_state=0,learning_rate=0.05, 
                                              n_estimators=400,n_jobs = 2)
my_model = Pipeline(steps=[('preprocessor',preprocess),
                          ('model', my_best_GBR_model)])
my_model.fit(X, y)
my_model.predict(test)

array([123920.08, 157255.52, 187757.89, ..., 163704.31, 119265.84,
       211715.33], dtype=float32)

In [93]:
my_model.score(X,y)

0.9980161305314297

In [94]:
y_preds = my_model.predict(test)

In [95]:
df = pd.DataFrame({"SalePrice": y_preds},index = test.Id)

In [96]:
df

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,123920.078125
1462,157255.515625
1463,187757.890625
1464,191589.187500
1465,191331.296875
...,...
2915,79726.937500
2916,84892.429688
2917,163704.312500
2918,119265.843750


In [97]:
df.to_csv("2ndSumis.csv")

In [98]:
pd.read_csv('2ndSumis.csv')

Unnamed: 0,Id,SalePrice
0,1461,123920.08
1,1462,157255.52
2,1463,187757.89
3,1464,191589.19
4,1465,191331.30
...,...,...
1454,2915,79726.94
1455,2916,84892.43
1456,2917,163704.31
1457,2918,119265.84
