In [1]:
import pandas as pd
import numpy as np  
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.ensemble import RandomForestRegressor



In [2]:
# Read the training and test splits in one pass (train first, then test).
# The bare `df` on the last line renders the training frame as the cell output.
df, df_test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
# Columns pulled from the raw frames to build X_train / X_test.
# The order here is the column order of those frames.
feature_cols = [
    # integer identifier column
    'Id',
    # object-dtype (categorical) columns
    'LotConfig', 'LandSlope', 'Neighborhood', 'Heating',
    'SaleType', 'RoofStyle', 'HouseStyle', 'LandContour',
    # numeric columns
    'MSSubClass', 'LotFrontage', 'LotArea',
    # object-dtype (categorical) column
    'LotShape',
    # numeric columns
    '1stFlrSF', '2ndFlrSF',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr',
    'GarageArea', 'PoolArea',
]

In [4]:
# Same feature subset from both splits; the target exists only in the training frame.
X_train, X_test = (frame[feature_cols] for frame in (df, df_test))
y_train = df['SalePrice']

In [5]:
# Show the dtype of every selected feature column, in X_train's column order.
print(df.dtypes.loc[X_train.columns])


Id                int64
LotConfig        object
LandSlope        object
Neighborhood     object
Heating          object
SaleType         object
RoofStyle        object
HouseStyle       object
LandContour      object
MSSubClass        int64
LotFrontage     float64
LotArea           int64
LotShape         object
1stFlrSF          int64
2ndFlrSF          int64
BsmtFullBath      int64
BsmtHalfBath      int64
FullBath          int64
HalfBath          int64
BedroomAbvGr      int64
KitchenAbvGr      int64
GarageArea        int64
PoolArea          int64
dtype: object


In [None]:
# Preprocessing applied to the selected feature columns before modelling.
# Each entry is (name, transformer, columns). Columns not listed in any entry
# are discarded by remainder='drop'; with the current feature_cols that means
# 'Id' and 'LotShape' never reach the model.
FillValues = ColumnTransformer([
    # Replace missing LotFrontage values with the mean learned during fit.
    ('SimpleImputer_LotFrontage', SimpleImputer(strategy='mean'), ['LotFrontage']),
    # One indicator column per category; a category not seen during fit is
    # encoded as all zeros instead of raising (handle_unknown='ignore').
    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['LotConfig', 'LandSlope', 'Neighborhood', 'Heating', 'SaleType', 'RoofStyle', 'HouseStyle']),
    # Integer-code LandContour. Known categories keep the codes the default
    # encoder would assign; a category not seen during fit becomes -1 rather
    # than raising at transform/predict time, matching the tolerant behaviour
    # of the one-hot encoder above.
    ('OrdinalEncoder_LandContour', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['LandContour']),
    # Standardise the numeric columns to zero mean / unit variance.
    # NOTE(review): no imputation is applied to these columns, so any NaN in
    # them is passed through to the model unchanged — confirm that is intended.
    ('StandardScaler', StandardScaler(), ['MSSubClass', 'LotArea', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'GarageArea', 'PoolArea'])
], remainder='drop')

In [7]:
# Fit the preprocessing on the training features and keep the transformed
# matrix so it can be inspected.
X_transformed = FillValues.fit_transform(X_train)


In [8]:
# Preview the first five transformed rows.
print(X_transformed[:5])


[[ 6.50000000e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000

In [9]:
# End-to-end estimator: column preprocessing followed by a random forest.
# random_state is fixed so repeated fits give the same forest.
pipe = Pipeline(
    steps=[
        ('FillValues', FillValues),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

In [10]:
# Fit the preprocessing step and the random forest on the training data.
# As the last expression in the cell, the fitted pipeline is also displayed.
pipe.fit(X_train, y_train)

In [11]:
# Run the test features through the fitted pipeline to get predictions,
# then print them.
y_pred = pipe.predict(X_test)
print(y_pred)

[136769.58 153715.5  183303.85 ... 159699.   107562.   237509.42]


In [None]:
# Pair each test-set Id with its predicted SalePrice (two columns, in that order).
submission = df_test[['Id']].assign(SalePrice=y_pred)

# index=False keeps the DataFrame's row index out of the written file.
submission.to_csv("submission.csv", index=False)

print("Submission file saved successfully!")


Submission file saved successfully!
