In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [40]:
X_full = pd.read_csv(r"D:\Housing price\train.csv")
X_test_full = pd.read_csv(r"D:\Housing price\test.csv")

In [41]:
y = X_full.SalePrice

In [42]:
features = ['LotArea','LotShape','CentralAir','OverallQual','OverallCond','LotFrontage', 'MSSubClass','BsmtFinSF1','BsmtUnfSF', 'TotalBsmtSF', 'GrLivArea', 'GarageArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
#features = ['LotArea','Street','Utilities','CentralAir','LandSlope','PavedDrive','LotShape','LandContour','ExterQual','KitchenQual','MSZoning','LotConfig','BldgType','ExterCond','HeatingQC','Condition2','RoofStyle','Foundation','Heating','Functional','SaleCondition','RoofMatl','HouseStyle','Condition1','SaleType','OverallQual','OverallCond', 'MSSubClass','BsmtFinSF1','BsmtUnfSF', 'TotalBsmtSF', 'GrLivArea', 'GarageArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_full[features].copy()
X_test = X_test_full[features].copy()

In [43]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)

In [44]:
#Select categorical columns with low cardinality
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]

In [45]:
# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [46]:
#Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [47]:
X_train.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
618,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,0,108,0,0,260,0,0,7,2007,314813
870,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,...,0,0,0,0,0,0,0,8,2009,109500
92,RL,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,...,0,0,44,0,0,0,0,8,2009,163500
817,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,...,150,59,0,0,0,0,0,7,2008,271000
302,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,468,81,0,0,0,0,0,1,2006,205000


In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [49]:
# Preprocessing imputation for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

In [50]:
# Preprocessing OneHot Encoding for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [51]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [52]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 100, random_state=0)

In [53]:
from sklearn.metrics import mean_absolute_error
#Bundle preprocessing and modeling code in a pipeline 
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

#Preprocessing of training data
my_pipeline.fit(X_train, y_train)

#Preprocessing of validation data
preds = my_pipeline.predict(X_valid)

#Evaluate the model
score = mean_absolute_error(y_valid, preds)

print('MAE', score)

MAE 922.2361643835619
