# House price prediction [🔗](https://www.kaggle.com/competitions/home-data-for-ml-course/data)
by Kaggle

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# Load the training and test sets, using the house Id as the row index
X = pd.read_csv('./data/train.csv', index_col='Id')
X_test = pd.read_csv('./data/test.csv', index_col='Id')

# Keep only rows that have a target, then split the target off from the predictors
X = X.dropna(subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# To keep things simple, discard every column with a missing value in the
# training data; the same columns are removed from the test set so both
# frames keep the same features
cols_with_missing = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

# Hold out 20% of the training rows as a validation set (fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [2]:
# Preview the first rows of the training split
X_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,RL,11694,Pave,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,...,108,0,0,260,0,0,7,2007,New,Partial
871,20,RL,6600,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,0,0,0,0,0,8,2009,WD,Normal
93,30,RL,13360,Pave,IR1,HLS,AllPub,Inside,Gtl,Crawfor,...,0,44,0,0,0,0,8,2009,WD,Normal
818,20,RL,13265,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Mitchel,...,59,0,0,0,0,0,7,2008,WD,Normal
303,20,RL,13704,Pave,IR1,Lvl,AllPub,Corner,Gtl,CollgCr,...,81,0,0,0,0,0,1,2006,WD,Normal


In [3]:
# Helper for comparing different preprocessing approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation mean absolute error of a random forest.

    A 100-tree ``RandomForestRegressor`` with ``random_state=0`` is fitted on
    ``X_train`` / ``y_train``; its predictions for ``X_valid`` are then scored
    against ``y_valid`` with ``mean_absolute_error``.
    """
    forest = RandomForestRegressor(random_state=0, n_estimators=100).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

## Training Model on Full dataset
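1. Applying Ordinal Encoding to both the `X` and `X_test` datasets, as it performed better than both of the alternative approaches that were tried (that comparison is not included in this notebook)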
2. Using `RandomForestRegressor` to train the model
3. Submitting the predictions in the **House price prediction competition**

### Applying `Ordinal Encoding` to the categorical columns
1. Dropping categorical columns whose test-set values do not all appear in the training dataset
2. then applying ordinal encoding to rest of the categorical columns

In [4]:
# Categorical (object-dtype) columns in the training data
object_cols = [col for col in X.columns if X[col].dtype == "object"]

# A column can be safely ordinal encoded only when every value found in the
# test set also occurs in the training set; otherwise the encoder fitted on X
# would be asked to transform a category it has never seen.
good_label_cols = [col for col in object_cols
                   if set(X_test[col]).issubset(set(X[col]))]

# Problematic columns that will be dropped from the dataset.
# Built by filtering object_cols (not by a set difference) so the result
# follows the column order of X and is the same on every run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'PavedDrive', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['SaleType', 'MSZoning', 'Utilities', 'KitchenQual', 'Exterior2nd', 'Functional', 'Exterior1st']


In [5]:
# Remove the categorical columns that will not be encoded from both the
# training predictors and the test predictors
X = X.drop(columns=bad_label_cols)
X_test = X_test.drop(columns=bad_label_cols)

In [6]:
# Preview the test set after the unencodable categorical columns were dropped
X_test.head()

Unnamed: 0_level_0,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,11622,Pave,Reg,Lvl,Inside,Gtl,NAmes,Feedr,Norm,...,140,0,0,0,120,0,0,6,2010,Normal
1462,20,14267,Pave,IR1,Lvl,Corner,Gtl,NAmes,Norm,Norm,...,393,36,0,0,0,0,12500,6,2010,Normal
1463,60,13830,Pave,IR1,Lvl,Inside,Gtl,Gilbert,Norm,Norm,...,212,34,0,0,0,0,0,3,2010,Normal
1464,60,9978,Pave,IR1,Lvl,Inside,Gtl,Gilbert,Norm,Norm,...,360,36,0,0,0,0,0,6,2010,Normal
1465,120,5005,Pave,IR1,HLS,Inside,Gtl,StoneBr,Norm,Norm,...,0,82,0,0,144,0,0,1,2010,Normal


In [7]:
# Learn the category -> number mapping from the training data only, then apply
# that same mapping to the training columns and to the test columns
ordinal_encoder = OrdinalEncoder().fit(X[good_label_cols])
X[good_label_cols] = ordinal_encoder.transform(X[good_label_cols])
X_test[good_label_cols] = ordinal_encoder.transform(X_test[good_label_cols])

In [8]:
# Preview the test set again: the categorical columns now hold numeric codes
X_test.head()

Unnamed: 0_level_0,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,11622,1.0,3.0,3.0,4.0,0.0,12.0,1.0,2.0,...,140,0,0,0,120,0,0,6,2010,4.0
1462,20,14267,1.0,0.0,3.0,0.0,0.0,12.0,2.0,2.0,...,393,36,0,0,0,0,12500,6,2010,4.0
1463,60,13830,1.0,0.0,3.0,4.0,0.0,8.0,2.0,2.0,...,212,34,0,0,0,0,0,3,2010,4.0
1464,60,9978,1.0,0.0,3.0,4.0,0.0,8.0,2.0,2.0,...,360,36,0,0,0,0,0,6,2010,4.0
1465,120,5005,1.0,0.0,1.0,4.0,0.0,22.0,2.0,2.0,...,0,82,0,0,144,0,0,1,2010,4.0


In [9]:
# Count the missing values in each column of the test set
X_test.isna().sum()

MSSubClass       0
LotArea          0
Street           0
LotShape         0
LandContour      0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
ExterQual        0
ExterCond        0
Foundation       0
BsmtFinSF1       1
BsmtFinSF2       1
BsmtUnfSF        1
TotalBsmtSF      1
Heating          0
HeatingQC        0
CentralAir       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     2
BsmtHalfBath     2
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageCars       1
GarageArea       1
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SaleCondition    0
dtype: int64

#### `X_test` contains some `NaN` values; imputing them with `SimpleImputer`

In [10]:
# The imputer returns bare NumPy arrays, so remember the labels of both frames
# before transforming them.
X_cols = X.columns
X_index = X.index
X_test_cols = X_test.columns
X_test_index = X_test.index

# Default SimpleImputer: fill each missing value with the column mean learned
# from the training data.
my_imputer = SimpleImputer()

# Rebuild both frames with their original column names AND row index, so that
# X stays aligned with y by Id and X_test keeps the Ids needed for submission.
X = pd.DataFrame(my_imputer.fit_transform(X), columns=X_cols, index=X_index)
X_test = pd.DataFrame(my_imputer.transform(X_test), columns=X_test_cols, index=X_test_index)

### Creating a `RandomForestRegressor` model and training it on full data

In [11]:
# Fit a 100-tree random forest (fixed seed) on the full training data, then
# predict the sale prices of the test set
model = RandomForestRegressor(random_state=0, n_estimators=100).fit(X, y)
price_prediction = model.predict(X_test)
price_prediction

array([129790.5 , 154219.  , 172914.19, ..., 156724.17, 114353.5 ,
       225485.16])

In [12]:
# Save the predictions as a submission file. The price column must be spelled
# "SalePrice", exactly like the target column of the training data.
output = pd.DataFrame(data={"Id": X_test.index, "SalePrice": price_prediction})
output.to_csv("./data/submission_4.csv", index=False)