<a href="https://colab.research.google.com/github/PreethikaShankar/INSAID_Term2_Files/blob/master/HousePrice_Term2_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf


import tensorflow_datasets as tfds

In [0]:
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout

# Switch plotly into offline notebook mode so figures render inside the notebook.
plotly.offline.init_notebook_mode(connected=True)
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn;
# consider a narrower filter.
warnings.filterwarnings("ignore")

In [0]:
# Load the house-price dataset directly from the course repository (requires network access).
hp_data = pd.read_csv("https://raw.githubusercontent.com/insaid2018/Term-2/master/Projects/house_data.csv")
# Show (number of rows, number of columns) of the loaded frame.
hp_data.shape

In [0]:
# Number of missing values in each column.
hp_data.isnull().sum()

In [0]:
# Summary statistics of the SalePrice column.
hp_data['SalePrice'].describe()

In [0]:
# Distribution of the raw sale price, followed by its skewness.
# The explicit figure/axes interface is used so the plot carries a title and
# axis labels and can be read on its own.
fig, ax = plt.subplots()
ax.hist(hp_data['SalePrice'])
ax.set(title='Distribution of SalePrice', xlabel='SalePrice', ylabel='Count')
plt.show()
print ("Skew of SalePrice:", hp_data['SalePrice'].skew())

In [0]:
# Log-transform the sale price to reduce its skew.
# np.log1p(x) computes log(1 + x) directly, which is the idiomatic (and
# numerically more accurate) form of np.log(x + 1).
hp_data['LT_SalePrice'] = np.log1p(hp_data['SalePrice'])

# Distribution of the transformed price, labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.hist(hp_data['LT_SalePrice'], color='red')
ax.set(title='Distribution of log(SalePrice + 1)', xlabel='LT_SalePrice', ylabel='Count')
plt.show()
print ("Skew of Log Transformed SalePrice:", hp_data['LT_SalePrice'].skew())

In [0]:
# Names of all numeric columns (this includes LT_SalePrice once the cell that adds it has run).
hp_data.select_dtypes(include=[np.number]).columns.values

In [0]:
# Numeric predictor columns.
# 'LT_SalePrice' is deliberately NOT listed: it is log(SalePrice + 1), i.e. the
# target itself, and using it as a feature would leak the answer into the model.
# This also keeps the list identical to the one re-declared before scaling.
zero_feat=[  'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold']
# Raw (untransformed) target column.
out_feat=['SalePrice']

In [0]:
def correlation_heatmap(hp_data):
    """Draw a heatmap of the pairwise correlations between numeric columns.

    Parameters
    ----------
    hp_data : pandas.DataFrame
        Frame to analyse. Non-numeric columns are ignored.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Restrict to numeric columns explicitly: calling corr() on a frame that
    # still contains string columns raises on pandas >= 2.0, while older
    # versions dropped them silently. Selecting first behaves the same on both.
    correlations = hp_data.select_dtypes(include=[np.number]).corr()

    fig, ax = plt.subplots(figsize=(10,10))
    # Draw on the axes created above instead of relying on the "current axes".
    # `fmt` is omitted because it only applies when annotations are enabled.
    sns.heatmap(correlations, vmax=1.0, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .70}, ax=ax)
    ax.set_title('Correlation between numeric features')
    plt.show()

correlation_heatmap(hp_data)

In [0]:
# Ten columns with the most missing values (summary only; nothing is removed here).
null_values = (
    hp_data.isnull()
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .to_frame('Number of Null Values')
    .rename_axis('Feature')
)
null_values

In [0]:
# Rows that contain at least one missing value.
# np.where() on a 2-D mask returns (row_positions, column_positions); feeding
# that tuple to .iloc selects the cross product of the two position arrays
# rather than the affected rows, so a row-wise boolean mask is used instead.
nan_rows = hp_data[hp_data.isnull().any(axis=1)]
print("Rows with at least one missing value:", len(nan_rows), "of", len(hp_data))
nan_rows.head()

In [0]:
# Names of all non-numeric columns.
hp_data.select_dtypes(exclude=[np.number]).columns.values

In [0]:
# Categorical (non-numeric) columns, written out explicitly.
cat_feat = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
    'SaleCondition',
]

In [0]:
# All column labels of the loaded frame.
hp_data.columns

In [0]:
# Numeric columns only; gaps are filled with interpolate() at its default
# settings, and any row that still has a missing value afterwards is dropped.
data = hp_data.select_dtypes(include=[np.number]).interpolate().dropna()

In [0]:
from sklearn.ensemble import IsolationForest

# Flag rows with an isolation forest: predict() labels kept rows 1 and
# rejected rows -1, which is what the 'Top' column stores.
clf = IsolationForest(max_samples = 100, random_state = 42)
clf.fit(data)
y_noano = pd.DataFrame(clf.predict(data), columns = ['Top'])

# Positions (0..n-1) of the rows labelled 1; used positionally with iloc below.
inlier_positions = y_noano.index[y_noano['Top'] == 1].values
outlier_count = len(y_noano[y_noano['Top'] == -1])

# Keep only the rows labelled 1 and renumber the index from zero.
data = data.iloc[inlier_positions].reset_index(drop = True)
print("Number of Outliers:", outlier_count)
print("Number of rows without outliers:", data.shape[0])

In [0]:
# First five rows of the filtered numeric frame.
data.head(5)

In [0]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler

# Column names with and without the target.
col_train = list(data.columns)
col_train_bis = list(data.columns)

col_train_bis.remove('SalePrice')

# Plain ndarrays are used instead of np.matrix: np.matrix is deprecated in
# NumPy and rejected by current scikit-learn estimators.
mat_train = data.to_numpy()
mat_new = data.drop('SalePrice',axis = 1).to_numpy()
# reshape(-1, 1) derives the row count from the data. The number of rows left
# after outlier removal is not fixed, so a hard-coded 1325 breaks whenever the
# filter keeps a different number of rows.
mat_y = data['SalePrice'].to_numpy().reshape(-1, 1)

# Scaler for the target column alone.
prepro_y = MinMaxScaler()
prepro_y.fit(mat_y)

# Scaler for every numeric column, target included.
prepro = MinMaxScaler()
prepro.fit(mat_train)

# Scaler for the predictor columns only (everything except SalePrice).
prepro_test = MinMaxScaler()
prepro_test.fit(mat_new)

train = pd.DataFrame(prepro.transform(mat_train),columns = col_train)

train.head()

In [0]:
# Predictor frame: every column except the raw and the log-transformed target.
X = hp_data.drop(columns=["SalePrice","LT_SalePrice"])

In [0]:
# Confirm the container type and the (rows, columns) shape of the predictors.
print(type(X))
print(X.shape)

In [0]:
# Regression target: the log-transformed sale price column.
y = hp_data["LT_SalePrice"]
# Show the first few target values.
y.head()

In [0]:
from sklearn.model_selection import train_test_split

def split(X, y, test_size=0.20, random_state=1):
    """Split predictors and target into train and test partitions.

    Parameters
    ----------
    X : predictors, passed unchanged to ``train_test_split``.
    y : target, passed unchanged to ``train_test_split``.
    test_size : float, default 0.20
        Share of the rows placed in the test partition.
    random_state : int, default 1
        Seed of the shuffling, so the split is reproducible.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    # The defaults reproduce the previously hard-coded 80/20 split with seed 1.
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [0]:
# Perform the split and report the shape of each partition.
X_train, X_test, y_train, y_test = split(X, y)

print('Train cases as below')
for label, part in (('X_train shape: ', X_train), ('y_train shape: ', y_train)):
    print(label, part.shape)

print('\nTest cases as below')
for label, part in (('X_test shape: ', X_test), ('y_test shape: ', y_test)):
    print(label, part.shape)

In [0]:
# Replace every missing value in the training predictors with 0.
# NOTE(review): this also writes the integer 0 into text columns; a later cell
# turns those zeros into empty strings for the categorical features.
X_train=X_train.fillna(0)
# Verify nothing is left missing. A row-wise mask is used because
# .iloc[np.where(mask)] indexes the cross product of row and column positions
# instead of selecting the affected rows.
nan_rows=X_train[X_train.isnull().any(axis=1)]
print("Training rows still containing missing values:", len(nan_rows))

In [0]:
# Replace every missing value in the test predictors with 0 (same rule as for
# the training predictors).
X_test=X_test.fillna(0)
# Verify nothing is left missing. A row-wise mask is used because
# .iloc[np.where(mask)] indexes the cross product of row and column positions
# instead of selecting the affected rows.
nan_rows=X_test[X_test.isnull().any(axis=1)]
print("Test rows still containing missing values:", len(nan_rows))

In [0]:
# Numeric predictor columns, in the order in which they are standardized below.
zero_feat = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'MoSold', 'YrSold',
]

In [0]:
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

def numerical_vectorizations(m,o):
    """Standardize one numeric column with statistics learned on the training split.

    Parameters
    ----------
    m : training column; must expose ``.values`` (the notebook passes a
        pandas Series).
    o : test column; must expose ``.values``.

    Returns
    -------
    tuple
        ``(standardized_train, standardized_test)``, each a 2-D array with a
        single column.

    The fitted mean and standard deviation are printed as a side effect.
    """
    train_column = m.values.reshape(-1, 1)

    scalars = StandardScaler()
    # Learn mean/variance from the training values only and scale them in one go.
    standardized_train = scalars.fit_transform(train_column)
    print(f"Mean : {scalars.mean_[0]}, Standard deviation : {np.sqrt(scalars.var_[0])}")

    # The test values are scaled with the training statistics.
    standardized_test = scalars.transform(o.values.reshape(-1, 1))
    return standardized_train, standardized_test

In [0]:
# Standardize each numeric predictor separately: the scaler is fitted on the
# training column and applied to both splits. Results are stored as
# std_train_<k> / std_test_<k> with k = 1..36, following the order of the
# numeric feature list.
# NOTE(review): the numbered names are consumed one by one when the final
# matrices are stacked, so any refactor into a loop has to change both places.
std_train_1 ,std_test_1 = numerical_vectorizations(X_train['MSSubClass'],X_test['MSSubClass'])
std_train_2 ,std_test_2 = numerical_vectorizations(X_train['LotFrontage'],X_test['LotFrontage'])
std_train_3 ,std_test_3 = numerical_vectorizations(X_train['LotArea'],X_test['LotArea'])
std_train_4 ,std_test_4 = numerical_vectorizations(X_train['OverallQual'],X_test['OverallQual'])
std_train_5 ,std_test_5 = numerical_vectorizations(X_train['OverallCond'],X_test['OverallCond'])
std_train_6 ,std_test_6 = numerical_vectorizations(X_train['YearBuilt'],X_test['YearBuilt'])
std_train_7 ,std_test_7 = numerical_vectorizations(X_train['YearRemodAdd'],X_test['YearRemodAdd'])
std_train_8 ,std_test_8 = numerical_vectorizations(X_train['MasVnrArea'],X_test['MasVnrArea'])
std_train_9 ,std_test_9 = numerical_vectorizations(X_train['BsmtFinSF1'],X_test['BsmtFinSF1'])
std_train_10 ,std_test_10 = numerical_vectorizations(X_train['BsmtFinSF2'],X_test['BsmtFinSF2'])
std_train_11 ,std_test_11 = numerical_vectorizations(X_train['BsmtUnfSF'],X_test['BsmtUnfSF'])
std_train_12 ,std_test_12 = numerical_vectorizations(X_train['TotalBsmtSF'],X_test['TotalBsmtSF'])
std_train_13 ,std_test_13 = numerical_vectorizations(X_train['1stFlrSF'],X_test['1stFlrSF'])
std_train_14 ,std_test_14 = numerical_vectorizations(X_train['2ndFlrSF'],X_test['2ndFlrSF'])
std_train_15 ,std_test_15 = numerical_vectorizations(X_train['LowQualFinSF'],X_test['LowQualFinSF'])
std_train_16 ,std_test_16 = numerical_vectorizations(X_train['GrLivArea'],X_test['GrLivArea'])
std_train_17 ,std_test_17 = numerical_vectorizations(X_train['BsmtFullBath'],X_test['BsmtFullBath'])
std_train_18 ,std_test_18 = numerical_vectorizations(X_train['BsmtHalfBath'],X_test['BsmtHalfBath'])
std_train_19 ,std_test_19 = numerical_vectorizations(X_train['FullBath'],X_test['FullBath'])
std_train_20 ,std_test_20 = numerical_vectorizations(X_train['HalfBath'],X_test['HalfBath'])
std_train_21 ,std_test_21 = numerical_vectorizations(X_train['BedroomAbvGr'],X_test['BedroomAbvGr'])
std_train_22 ,std_test_22 = numerical_vectorizations(X_train['KitchenAbvGr'],X_test['KitchenAbvGr'])
std_train_23 ,std_test_23 = numerical_vectorizations(X_train['TotRmsAbvGrd'],X_test['TotRmsAbvGrd'])
std_train_24 ,std_test_24 = numerical_vectorizations(X_train['Fireplaces'],X_test['Fireplaces'])
std_train_25 ,std_test_25 = numerical_vectorizations(X_train['GarageYrBlt'],X_test['GarageYrBlt'])
std_train_26 ,std_test_26 = numerical_vectorizations(X_train['GarageCars'],X_test['GarageCars'])
std_train_27 ,std_test_27 = numerical_vectorizations(X_train['GarageArea'],X_test['GarageArea'])
std_train_28 ,std_test_28 = numerical_vectorizations(X_train['WoodDeckSF'],X_test['WoodDeckSF'])
std_train_29 ,std_test_29 = numerical_vectorizations(X_train['OpenPorchSF'],X_test['OpenPorchSF'])
std_train_30 ,std_test_30 = numerical_vectorizations(X_train['EnclosedPorch'],X_test['EnclosedPorch'])
std_train_31 ,std_test_31 = numerical_vectorizations(X_train['3SsnPorch'],X_test['3SsnPorch'])
std_train_32 ,std_test_32 = numerical_vectorizations(X_train['ScreenPorch'],X_test['ScreenPorch'])
std_train_33 ,std_test_33 = numerical_vectorizations(X_train['PoolArea'],X_test['PoolArea'])
std_train_34 ,std_test_34 = numerical_vectorizations(X_train['MiscVal'],X_test['MiscVal'])
std_train_35 ,std_test_35 = numerical_vectorizations(X_train['MoSold'],X_test['MoSold'])
std_train_36 ,std_test_36 = numerical_vectorizations(X_train['YrSold'],X_test['YrSold'])

In [0]:
# Shapes of the last standardized column for the train and test splits.
std_train_36.shape ,std_test_36.shape

In [0]:
# Categorical predictor columns, in the order in which they are encoded below.
cat_feat = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
    'SaleCondition',
]

In [0]:
# Categorical cells equal to 0 (the placeholder written by fillna(0)) become
# empty strings, done for all categorical columns in a single replace.
X_train[cat_feat] = X_train[cat_feat].replace(0, '')

In [0]:
# Categorical cells equal to 0 (the placeholder written by fillna(0)) become
# empty strings, done for all categorical columns in a single replace.
X_test[cat_feat] = X_test[cat_feat].replace(0, '')

In [0]:
# Rename the zoning label 'C (all)' to the single word 'all'.
# One vectorised replace per split is enough; looping over the rows repeated
# the same whole-column replace for every matching row. The test split is
# normalised too, so both splits carry identical category labels.
X_train['MSZoning'] = X_train['MSZoning'].replace('C (all)', 'all')
X_test['MSZoning'] = X_test['MSZoning'].replace('C (all)', 'all')

In [0]:
from collections import Counter
def categorical_vectorization(m,o):
    """Encode one categorical column as binary indicator features.

    The vocabulary is learned from the training column only and then applied
    to both splits; tokens that appear only in the test column are ignored.

    Parameters
    ----------
    m : training column; must expose ``.values`` (the notebook passes a
        pandas Series).
    o : test column; must expose ``.values``.

    Returns
    -------
    tuple
        ``(one_hot_train, one_hot_test)`` as produced by
        ``CountVectorizer.transform``, with one column per vocabulary token,
        ordered from the least to the most frequent token in the training data.

    The vocabulary is printed as a side effect.
    """
    # One pattern is used BOTH to build the vocabulary and to tokenize inside
    # CountVectorizer, so the two can never disagree. A token is a run of word
    # characters, optionally joined by single punctuation marks
    # ('1.5Fin', 'Tar&Grv'), and may be a single character ('Y', 'N').
    # CountVectorizer's default pattern drops one-character tokens and splits
    # on punctuation, which left such categories encoded as all-zero columns.
    token_pattern = r"(?u)\b\w+(?:[^\w\s]\w+)*\b"
    find_tokens = re.compile(token_pattern).findall

    # Cast to str so stray non-string cells cannot break tokenization.
    train_values = [str(value) for value in m.values]
    test_values = [str(value) for value in o.values]

    my_counter = Counter()
    for value in train_values:
        my_counter.update(find_tokens(value))
    # Ascending frequency; ties keep first-seen order (sorted() is stable).
    vocabulary = [token for token, _ in sorted(my_counter.items(), key=lambda kv: kv[1])]

    ## we use count vectorizer to convert the values into one hot encoded features
    vectorizer = CountVectorizer(vocabulary=vocabulary, lowercase=False, binary=True,
                                 token_pattern=token_pattern)
    vectorizer.fit(train_values)
    # With a fixed vocabulary the feature names are exactly that list, in order.
    # Printing it directly avoids get_feature_names(), which newer scikit-learn
    # releases no longer provide.
    print(vocabulary)

    one_hot_train = vectorizer.transform(train_values)
    one_hot_test = vectorizer.transform(test_values)
    return one_hot_train, one_hot_test

In [0]:
# Encode each categorical predictor separately: the vocabulary comes from the
# training column and is applied to both splits. Results continue the numbering
# of the numeric features: std_train_<k> / std_test_<k> with k = 37..79,
# following the order of the categorical feature list.
# NOTE(review): the numbered names are consumed one by one when the final
# matrices are stacked, so any refactor into a loop has to change both places.
std_train_37 ,std_test_37 = categorical_vectorization(X_train['MSZoning'],X_test['MSZoning'])
std_train_38 ,std_test_38 = categorical_vectorization(X_train['Street'],X_test['Street'])
std_train_39 ,std_test_39 = categorical_vectorization(X_train['Alley'],X_test['Alley'])
std_train_40 ,std_test_40 = categorical_vectorization(X_train['LotShape'],X_test['LotShape'])
std_train_41 ,std_test_41 = categorical_vectorization(X_train['LandContour'],X_test['LandContour'])
std_train_42 ,std_test_42 = categorical_vectorization(X_train['Utilities'],X_test['Utilities'])
std_train_43 ,std_test_43 = categorical_vectorization(X_train['LotConfig'],X_test['LotConfig'])
std_train_44 ,std_test_44 = categorical_vectorization(X_train['LandSlope'],X_test['LandSlope'])
std_train_45 ,std_test_45 = categorical_vectorization(X_train['Neighborhood'],X_test['Neighborhood'])
std_train_46 ,std_test_46 = categorical_vectorization(X_train['Condition1'],X_test['Condition1'])
std_train_47 ,std_test_47 = categorical_vectorization(X_train['Condition2'],X_test['Condition2'])
std_train_48 ,std_test_48 = categorical_vectorization(X_train['BldgType'],X_test['BldgType'])
std_train_49 ,std_test_49 = categorical_vectorization(X_train['HouseStyle'],X_test['HouseStyle'])
std_train_50 ,std_test_50 = categorical_vectorization(X_train['RoofStyle'],X_test['RoofStyle'])
std_train_51 ,std_test_51 = categorical_vectorization(X_train['RoofMatl'],X_test['RoofMatl'])
std_train_52 ,std_test_52 = categorical_vectorization(X_train['Exterior1st'],X_test['Exterior1st'])
std_train_53 ,std_test_53 = categorical_vectorization(X_train['Exterior2nd'],X_test['Exterior2nd'])
std_train_54 ,std_test_54 = categorical_vectorization(X_train['MasVnrType'],X_test['MasVnrType'])
std_train_55 ,std_test_55 = categorical_vectorization(X_train['ExterQual'],X_test['ExterQual'])
std_train_56 ,std_test_56 = categorical_vectorization(X_train['ExterCond'],X_test['ExterCond'])
std_train_57 ,std_test_57 = categorical_vectorization(X_train['Foundation'],X_test['Foundation'])
std_train_58 ,std_test_58 = categorical_vectorization(X_train['BsmtQual'],X_test['BsmtQual'])
std_train_59 ,std_test_59 = categorical_vectorization(X_train['BsmtCond'],X_test['BsmtCond'])
std_train_60 ,std_test_60 = categorical_vectorization(X_train['BsmtExposure'],X_test['BsmtExposure'])
std_train_61 ,std_test_61 = categorical_vectorization(X_train['BsmtFinType1'],X_test['BsmtFinType1'])
std_train_62 ,std_test_62 = categorical_vectorization(X_train['BsmtFinType2'],X_test['BsmtFinType2'])
std_train_63 ,std_test_63 = categorical_vectorization(X_train['Heating'],X_test['Heating'])
std_train_64 ,std_test_64 = categorical_vectorization(X_train['HeatingQC'],X_test['HeatingQC'])
std_train_65 ,std_test_65 = categorical_vectorization(X_train['CentralAir'],X_test['CentralAir'])
std_train_66 ,std_test_66 = categorical_vectorization(X_train['Electrical'],X_test['Electrical'])
std_train_67 ,std_test_67 = categorical_vectorization(X_train['KitchenQual'],X_test['KitchenQual'])
std_train_68 ,std_test_68 = categorical_vectorization(X_train['Functional'],X_test['Functional'])
std_train_69 ,std_test_69 = categorical_vectorization(X_train['FireplaceQu'],X_test['FireplaceQu'])
std_train_70 ,std_test_70 = categorical_vectorization(X_train['GarageType'],X_test['GarageType'])
std_train_71 ,std_test_71 = categorical_vectorization(X_train['GarageFinish'],X_test['GarageFinish'])
std_train_72 ,std_test_72 = categorical_vectorization(X_train['GarageQual'],X_test['GarageQual'])
std_train_73 ,std_test_73 = categorical_vectorization(X_train['GarageCond'],X_test['GarageCond'])
std_train_74 ,std_test_74 = categorical_vectorization(X_train['PavedDrive'],X_test['PavedDrive'])
std_train_75 ,std_test_75 = categorical_vectorization(X_train['PoolQC'],X_test['PoolQC'])
std_train_76 ,std_test_76 = categorical_vectorization(X_train['Fence'],X_test['Fence'])
std_train_77 ,std_test_77 = categorical_vectorization(X_train['MiscFeature'],X_test['MiscFeature'])
std_train_78 ,std_test_78 = categorical_vectorization(X_train['SaleType'],X_test['SaleType'])
std_train_79 ,std_test_79 = categorical_vectorization(X_train['SaleCondition'],X_test['SaleCondition'])
# Shapes of the last encoded column for the train and test splits.
std_train_79.shape,std_test_79.shape

In [0]:
# merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039
# Column-wise concatenation of all 79 per-feature blocks (1-36 standardized
# numeric columns, 37-79 encoded categorical columns) into one CSR matrix per
# split. Train and test list the blocks in the same order, so the columns of
# X_tr and X_te line up.
from scipy.sparse import hstack

X_tr = hstack((std_train_1,std_train_2,std_train_3,std_train_4,std_train_5,std_train_6,std_train_7,std_train_8,std_train_9,std_train_10,std_train_11,std_train_12,std_train_13,std_train_14,std_train_15,std_train_16,std_train_17,std_train_18,std_train_19,std_train_20,std_train_21,std_train_22,std_train_23,std_train_24,std_train_25,std_train_26,std_train_27,std_train_28,std_train_29,std_train_30,std_train_31,std_train_32,std_train_33,std_train_34,std_train_35,std_train_36,std_train_37,std_train_38,std_train_39,std_train_40,std_train_41,std_train_42,std_train_43,std_train_44,std_train_45,std_train_46,std_train_47,std_train_48,std_train_49,std_train_50,std_train_51,std_train_52,std_train_53,std_train_54,std_train_55,std_train_56,std_train_57,std_train_58,std_train_59,std_train_60,std_train_61,std_train_62,std_train_63,std_train_64,std_train_65,std_train_66,std_train_67,std_train_68,std_train_69,std_train_70,std_train_71,std_train_72,std_train_73,std_train_74,std_train_75,std_train_76,std_train_77,std_train_78,std_train_79)).tocsr()
X_te = hstack((std_test_1,std_test_2,std_test_3,std_test_4,std_test_5,std_test_6,std_test_7,std_test_8,std_test_9,std_test_10,std_test_11,std_test_12,std_test_13,std_test_14,std_test_15,std_test_16,std_test_17,std_test_18,std_test_19,std_test_20,std_test_21,std_test_22,std_test_23,std_test_24,std_test_25,std_test_26,std_test_27,std_test_28,std_test_29,std_test_30,std_test_31,std_test_32,std_test_33,std_test_34,std_test_35,std_test_36,std_test_37,std_test_38,std_test_39,std_test_40,std_test_41,std_test_42,std_test_43,std_test_44,std_test_45,std_test_46,std_test_47,std_test_48,std_test_49,std_test_50,std_test_51,std_test_52,std_test_53,std_test_54,std_test_55,std_test_56,std_test_57,std_test_58,std_test_59,std_test_60,std_test_61,std_test_62,std_test_63,std_test_64,std_test_65,std_test_66,std_test_67,std_test_68,std_test_69,std_test_70,std_test_71,std_test_72,std_test_73,std_test_74,std_test_75,std_test_76,std_test_77,std_test_78,std_test_79)).tocsr()

In [0]:
# Sanity check: shapes of the stacked training matrix, its target, and the
# stacked test matrix.
print("Final Data matrix")
print(X_tr.shape, y_train.shape)
print(X_te.shape)

In [0]:
from sklearn import linear_model
# NOTE(review): `ensemble` is not used anywhere in the visible notebook.
from sklearn import ensemble
# Linear regression fitted on the stacked training matrix. y_train comes from
# the log-transformed price, so the model predicts on that log scale.
lr = linear_model.LinearRegression()
# `model` holds the value returned by fit() and is used for scoring and
# prediction in the following cells.
model = lr.fit(X_tr, y_train)

In [0]:
# Score the fitted model on the held-out split (reported here as R^2).
print ("R^2 is: \n", model.score(X_te, y_test))
# Predictions for the held-out split, on the same log scale as y_test.
predictions = model.predict(X_te)

In [0]:
from sklearn.metrics import mean_squared_error
# mean_squared_error() returns the MEAN SQUARED error; the value printed under
# the "RMSE" label must be its square root. Both y_test and predictions are on
# the log-price scale, so this is the RMSE of log(SalePrice + 1).
print ('RMSE is: \n', np.sqrt(mean_squared_error(y_test, predictions)))

In [0]:
# Predicted vs. actual target values on the held-out split.
# Both series are log(SalePrice + 1), not raw prices, so the axes say so.
actual_values = y_test
fig, ax = plt.subplots()
ax.scatter(predictions, actual_values, alpha=.75,
           color='g') #alpha helps to show overlapping data
ax.set(xlabel='Predicted log(SalePrice + 1)',
       ylabel='Actual log(SalePrice + 1)',
       title='LinearRegression Model')
# Render the figure instead of echoing the repr of the last matplotlib call.
plt.show()

In [0]:
from prettytable import PrettyTable
from sklearn.metrics import mean_squared_error

# Summary table of the fitted model.
# The model trained above is a LINEAR regression, and the loss (mean squared
# error on the log-price scale) is computed from the actual predictions rather
# than pasted in by hand, so the table always matches the current run.
ptable = PrettyTable()
ptable.field_names=["Model Name","Loss"]
ptable.add_row(["Linear Regression", f"{mean_squared_error(y_test, predictions):.5f}"])
print(ptable)
print()