### House Price Predictor
This notebook builds a model that predicts the market price of a house.

In [None]:
# Standard imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [None]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Serves as a silent replacement for ``warnings.warn``.
    """
    return None


# Rebind the module attribute so that later lookups of ``warnings.warn``
# resolve to the no-op defined above.
setattr(warnings, "warn", warn)

#### 1. Fetching Dataset

Downloading dataset

In [None]:
# Remote location of the CSV file used throughout the notebook.
file_name = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/FinalModule_Coursera/data/kc_house_data_NaN.csv'

# read_csv accepts a URL, so the data is fetched over the network each time
# this cell runs.
df = pd.read_csv(filepath_or_buffer=file_name)

In [None]:
# Preview the first rows to check that the CSV was parsed as expected.
df.head()

#### 2. Data Wrangling

In [None]:
# Drop the 'Unnamed: 0' column (presumably a stray index column stored in the
# CSV -- confirm against the preview above).
# Rebinding `df` with errors='ignore' instead of mutating in place keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# List the column labels currently present in the frame.
df.columns

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

Let's also drop the 'id' column

In [None]:
# Drop the 'id' column, which is not used by any later cell.
# Rebinding `df` with errors='ignore' instead of mutating in place keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Print per-column dtypes and non-null counts; a column whose non-null count
# is below the number of rows contains missing values.
df.info()

We can see that we have missing values for columns 'bedrooms' and 'bathrooms'

In [None]:
# Count the missing entries of both columns in one pass, then report each.
missing_counts = df[['bedrooms', 'bathrooms']].isnull().sum()
print("number of NaN values for the column bedrooms :",
      missing_counts['bedrooms'])
print("number of NaN values for the column bathrooms :",
      missing_counts['bathrooms'])

Since only a few values are missing, we can replace them with the mean of their respective columns.

In [None]:
# Replace missing values in each listed column with that column's mean.
#
# The result is assigned back to the column rather than calling an in-place
# method on `df[val]`: `df[val]` is a temporary Series, so an in-place update
# on it is not guaranteed to reach `df` (it does not under pandas
# Copy-on-Write and triggers a warning in pandas 2.x).
ls = ['bedrooms', 'bathrooms']
for val in ls:
    mean = df[val].mean()  # NaN entries are skipped when computing the mean
    df[val] = df[val].fillna(mean)
    # Report the column actually being processed (should now be 0).
    print(f"number of NaN values for the column {val} :",
          df[val].isnull().sum())

#### 3. Exploratory Data Analysis

In [None]:
# Count the rows for each distinct 'floors' value and show the result as a table.
df['floors'].value_counts().to_frame()

In [None]:
# Count the rows for each distinct 'waterfront' value and show the result as a table.
df['waterfront'].value_counts().to_frame()

Let's use boxplot() in the seaborn library to determine whether houses with a waterfront view or without a waterfront view have more price outliers.

In [None]:
# Price distribution per waterfront category; columns are referenced by name
# and resolved against `data`.
sns.boxplot(x='waterfront', y='price', data=df)

We'll also use regplot() in the seaborn library to determine if the feature sqft_above is negatively or positively correlated with price.

In [None]:
# Scatter of sqft_above against price with a fitted regression line drawn in
# red; columns are referenced by name and resolved against `data`.
sns.regplot(
    x='sqft_above',
    y='price',
    data=df,
    line_kws={'color': 'red'},
)

We can use the Pandas method corr() to find the feature other than price that is most correlated with price.

In [None]:
# Correlation of every numeric column with 'price', shown in ascending order
# (the last entry is 'price' with itself).
price_corr = df.corr(numeric_only=True)['price']
price_corr.sort_values()

#### 4. Model Development

Let's first do some quick exploratory work and check the $R^2$ scores of a few simple models.

In [None]:
# Single-feature baseline on 'long': fit a linear regression and report its
# score on the same data it was fitted on.
y = df['price']
X = df[['long']]
lm = LinearRegression().fit(X, y)  # fit() returns the estimator itself
lm.score(X, y)

In [None]:
# Single-feature baseline on 'sqft_living': fit a linear regression and
# report its score on the same data it was fitted on.
y = df['price']
X = df[['sqft_living']]
lm = LinearRegression().fit(X, y)  # fit() returns the estimator itself
lm.score(X, y)

In [None]:
# Multi-feature linear regression, scored on the data it was fitted on.
features = [
    "floors", "waterfront", "lat", "bedrooms", "sqft_basement", "view",
    "bathrooms", "sqft_living15", "sqft_above", "grade", "sqft_living",
]
y = df['price']
X = df[features]
lm = LinearRegression().fit(X, y)  # fit() returns the estimator itself
lm.score(X, y)

In [None]:
# Pipeline: standardise the features, expand them into polynomial terms
# (without the constant column), then fit a linear regression.
# Scored on the same X, y it was fitted on.
model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(include_bias=False),
    LinearRegression(),
).fit(X, y)  # fit() returns the pipeline itself
model.score(X, y)

In [None]:
# Display the fitted pipeline so its steps can be inspected.
model

#### 5. Model Evaluation and Refinement

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

Splitting the data into training and testing sets

In [None]:
# Feature matrix and target used for the hold-out evaluation.
features = [
    "floors", "waterfront", "lat", "bedrooms", "sqft_basement", "view",
    "bathrooms", "sqft_living15", "sqft_above", "grade", "sqft_living",
]
y = df['price']
X = df[features]

# Hold out 15% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=1,
)

print("number of test samples:", len(x_test))
print("number of training samples:", len(x_train))

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression (alpha=0.1) fitted on the training split and scored on
# the held-out test split.
ridge_model = Ridge(alpha=0.1).fit(x_train, y_train)  # fit() returns the estimator
ridge_model.score(x_test, y_test)

In [None]:
# Transformer that expands the input features into polynomial terms up to degree 2.
pr = PolynomialFeatures(degree=2)

In [None]:
# Fit the polynomial transformer on the training split only, then reuse the
# fitted transformer on the test split. Preprocessing must never be re-fitted
# on held-out data, so the test set goes through transform(), not
# fit_transform().
x_train_pr = pr.fit_transform(x_train)
x_test_pr = pr.transform(x_test)

In [None]:
# Compare shapes after and before the expansion to see how many columns the
# polynomial transform produced.
x_train_pr.shape, x_train.shape

In [None]:
# Ridge regression (alpha=0.1) on the polynomial-expanded training features,
# scored on the expanded test features.
ridge_model_2 = Ridge(alpha=0.1).fit(x_train_pr, y_train)  # fit() returns the estimator
ridge_model_2.score(x_test_pr, y_test)