<a href="https://colab.research.google.com/github/SINDHUSITA/Boston-House-Prices-Prediction/blob/master/House_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing basic packages**

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

**Importing Data**

In [249]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.dtypes.unique()
train.shape

(1460, 81)

**Feature Selection**

In [250]:
corr = train.corr()
insignif_cats = []
count = 0 
c=0
print('Shape of numeric features before feature selection:',train.shape)
for i in corr['SalePrice']:
  if(i<0.5):
    insignif_cats.append(corr.columns[count])
    c+=1    
  count+=1
train.drop(columns=insignif_cats,inplace=True)
print('Shape of numeric features after feature selection:',train.shape)
test.drop(columns=insignif_cats,inplace=True)

Shape of numeric features before feature selection: (1460, 81)
Shape of numeric features after feature selection: (1460, 54)


**Preprocessing Numeric features**

In [251]:
#obtaining numeric features (int64 and float64)
train_float = train.loc[:,train.dtypes == np.float64]
train_int=(train.loc[:,train.dtypes == np.int64])
train_num=pd.concat([train_float,train_int],axis=1,ignore_index=False)

#obtaining numeric features (int64 and float64) for test data
test_float = test.loc[:,test.dtypes == np.float64]
test_int=(test.loc[:,test.dtypes == np.int64])
test_num=pd.concat([test_float,test_int],axis=1,ignore_index=False)

#handling missing data in numeric columns 
print(train_num.shape)
nulls = pd.DataFrame(train_num.isnull().sum().sort_values(ascending=False).head())
print("Null count in each column initially: ",nulls)
train_num = train_num.interpolate(method='linear', limit_direction='forward', axis=0)
nulls = pd.DataFrame(train_num.isnull().sum().sort_values(ascending=False).head())
print("Null count in each column after handling missing data: ",nulls)

#handling missing data in numeric columns for test data
nulls = pd.DataFrame(test_num.isnull().sum().sort_values(ascending=False).head())
print("Null count for each column in test data initially: ",nulls)
test_num.dropna(inplace=True)
nulls = pd.DataFrame(test_num.isnull().sum().sort_values(ascending=False).head())
print("Null count for each column in test data later: ",nulls)

(1460, 11)
Null count in each column initially:                0
SalePrice     0
GarageArea    0
GarageCars    0
TotRmsAbvGrd  0
FullBath      0
Null count in each column after handling missing data:                0
SalePrice     0
GarageArea    0
GarageCars    0
TotRmsAbvGrd  0
FullBath      0
Null count for each column in test data initially:                0
GarageArea    1
GarageCars    1
TotalBsmtSF   1
TotRmsAbvGrd  0
FullBath      0
Null count for each column in test data later:                0
TotRmsAbvGrd  0
FullBath      0
GrLivArea     0
1stFlrSF      0
YearRemodAdd  0


###**Preprocessing Categorical Features**

**Handling missing data in categorical data**

In [252]:
#obtaining categorical features
train_cat=train.select_dtypes(exclude=[np.int64,np.float64])
train_cat=train_cat.astype('str')

#obtaining categorical features in test data
test_cat=test.select_dtypes(exclude=[np.int64,np.float64])
test_cat=test_cat.astype('str')

#handling missing data in categorical features in train data
nulls = pd.DataFrame(train_cat.isnull().sum().sort_values(ascending=False).head())
print("Null count in each column initially:",nulls) #clearly there are no null values
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy = 'most_frequent')
imp.fit(train_cat)
len(np.unique(train_cat.values))

#handling missing data in categorical features in test data
nulls = pd.DataFrame(test_cat.isnull().sum().sort_values(ascending=False).head())
print("Null count in each column initially in test data:",nulls) #clearly there are no null values

Null count in each column initially:                0
SaleCondition  0
Condition2     0
ExterQual      0
MasVnrType     0
Exterior2nd    0
Null count in each column initially in test data:                0
SaleCondition  0
Condition2     0
ExterQual      0
MasVnrType     0
Exterior2nd    0


**Encoding Categorical data**

In [253]:
print('Shape of categorical features in train and test data before encoding',train_cat.shape,test_cat.shape)
cat_cols = train_cat.columns
temp = pd.get_dummies(pd.concat([train_cat,test_cat],keys=[0,1]), columns=cat_cols)
# Selecting data from multi index and assigning them
train_cat,test_cat = temp.xs(0),temp.xs(1)

print('Shape of categorical features in train and test data after encoding',train_cat.shape, test_cat.shape)

Shape of categorical features in train and test data before encoding (1460, 43) (1459, 43)
Shape of categorical features in train and test data after encoding (1460, 275) (1459, 275)


**Splitting Train and Test Data**

In [0]:
#concatenating numerical and categorical features
train_num.drop(columns='SalePrice',inplace=True)
train_total = pd.concat([train_num,train_cat],axis=1,ignore_index=False)

#concatenating numerical and categorical features in test data
test_total = pd.concat([test_num,test_cat],axis=1,ignore_index=False)

In [255]:
#Splitting as X, y
X = train_total
y = train['SalePrice']

#Splitting train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=0)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(1095, 285) (365, 285) (1095,) (365,)


**Feature Scaling**

In [0]:
y_train= y_train.values.reshape(-1,1)
y_test= y_test.values.reshape(-1,1)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train_fs = sc_X.fit_transform(X_train)
X_test_fs = sc_X.fit_transform(X_test)
y_train_fs = sc_X.fit_transform(y_train)
y_test_fs = sc_y.fit_transform(y_test)


##**Training the model using various algortims**

**1. Training using Multi Linear Regression**

In [257]:
from sklearn.linear_model import LinearRegression
regressor1 = LinearRegression()
regressor1.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

**2. Training using Polynomial Regression**

In [0]:
# from sklearn.preprocessing import PolynomialFeatures
# plr = PolynomialFeatures(degree=2)
# X_poly = plr.fit_transform(X_train)
# regressor2 = LinearRegression()
# regressor2.fit(X_poly,y_train)

**3. Training using Decision Tree Regressor**

In [259]:
from sklearn.tree import DecisionTreeRegressor
regressor3 = DecisionTreeRegressor()
regressor3.fit(X_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

**4. Training using Random Forest Regressor**

In [260]:
from sklearn.ensemble import RandomForestRegressor
regressor4 = RandomForestRegressor(n_estimators=10,random_state=1)
regressor4.fit(X_train,y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False)

**5. Training using Support Vector Regressor**

In [261]:
from sklearn.svm import SVR
regressor5 = SVR(kernel='rbf')
regressor5.fit(X_train_fs,y_train_fs)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

##**Testing above models**

**1. Testing using Multi Linear Regression**

In [0]:
y_pred1 = regressor1.predict(X_test)

**2. Testing using Polynomial Regression**

In [0]:
# y_pred2 = regressor2.predict(plr.fit_transform(X_test))

**3. Testing using Decision Tree Regression**

In [0]:
y_pred3 = regressor3.predict(X_test)

**4. Testing using Random Forest Regression**

In [0]:
y_pred4 = regressor4.predict(X_test)

**5. Testing using Support Vector Regression**

In [0]:
y_pred5 = regressor5.predict(X_test_fs)

##**Evaluating each model using R-squared test**

In [0]:
from sklearn.metrics import r2_score

**1. Evaluating the Multi Linear Regression**

In [268]:
r2_score(y_test, y_pred1)

0.5815222543692613

**2. Evaluating the Polynomial Regression**

In [0]:
# r2_score(y_test, y_pred2)

**3. Evaluating the Decision Tree Regression**

In [270]:
r2_score(y_test, y_pred3)

0.7725634618646722

**4. Evaluating the Random Forest Regression**

In [271]:
r2_score(y_test, y_pred4)

0.8346358445140396

**5. Evaluating the Support Vector Regression**

In [272]:
r2_score(y_test_fs, y_pred5)

0.6778851733205491

##**Predicting the prices using the Best Model**

In [273]:
#we are using Random Forest Regressor as it gave better results

nulls = pd.DataFrame(test_total.isnull().sum().sort_values(ascending=False).head())
test_total.dropna(inplace=True)
y_pred = regressor4.predict(test_total)
print(y_pred[:10])


[326895.4 328925.3 319030.  343388.1 314232.6 304630.  304180.  315630.
 317495.4 326895.4]
