# House Prices Modeling : Nisarg 

In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import SimpleImputer

In [84]:
data = pd.read_csv('C:/Users/nisar/dsp-nisarg-parmar/data/house-prices-advanced-regression-techniques/train.csv')

In [85]:
data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [86]:
data.columns
data.dropna()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


## Selecting the continous and categorical features 

In [87]:
continuous_features = ['LotArea', 'GrLivArea']
categorical_features = ['Neighborhood', 'Exterior1st']

## Spliting data into train and test sets

In [88]:
X_train, X_test, y_train, y_test = train_test_split(data[continuous_features+categorical_features], 
                                                    data['SalePrice'], 
                                                    test_size=0.2, 
                                                    random_state=42)

## Feature processing

In [89]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)
    ])

## Model training

In [90]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['LotArea', 'GrLivArea']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Neighborhood',
                                                   'Exterior1st'])])),
                ('regressor', LinearRegression())])

# Predict on test set and compute precision

In [91]:
predictions = model.predict(X_test)
predictions = np.clip(predictions, None, 20)
predicted_prices = np.exp(predictions)
precision = model.score(X_test, y_test)
print(f"Model precision: {precision:.2f}")

Model precision: 0.77


# Print predicted values for test set

In [92]:
prediction = model.predict(X_test)
pd.DataFrame(prediction, columns = ["Predicted Price"]).head(20)

Unnamed: 0,Predicted Price
0,122659.836281
1,327025.245922
2,98298.522776
3,190514.570592
4,223749.812554
5,63898.507743
6,228820.333768
7,158536.554607
8,64151.268462
9,133216.865329
