In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the house sales CSV into the working DataFrame.
house_sales_csv = "../input/housesalesprediction/kc_house_data.csv"
df = pd.read_csv(house_sales_csv)

# Feature Columns
* **id** - Unique ID for each home sold
* **date** - Date of the home sale
* **price** - Price of each home sold
* **bedrooms** - Number of bedrooms
* **bathrooms** - Number of bathrooms, where .5 accounts for a room with a toilet but no shower
* **sqft_living** - Square footage of the apartment's interior living space
* **sqft_lot** - Square footage of the land space
* **floors** - Number of floors
* **waterfront** - A dummy variable for whether the apartment was overlooking the waterfront or not
* **view** - An index from 0 to 4 of how good the view of the property was
* **condition** - An index from 1 to 5 on the condition of the apartment
* **grade** - An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design.
* **sqft_above** - The square footage of the interior housing space that is above ground level
* **sqft_basement** - The square footage of the interior housing space that is below ground level
* **yr_built** - The year the house was initially built
* **yr_renovated** - The year of the house’s last renovation
* **zipcode** - What zipcode area the house is in
* **lat** - Latitude
* **long** - Longitude
* **sqft_living15** - The square footage of interior housing living space for the nearest 15 neighbors
* **sqft_lot15** - The square footage of the land lots of the nearest 15 neighbors

In [None]:
# Preview the first 10 rows of the freshly loaded data.
df.head(10)

In [None]:
# Explore Data

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T

In [None]:
# Distribution of sale prices.
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation draws the equivalent histogram-plus-curve figure.
plt.figure(figsize=(20,8))
sns.histplot(df["price"], kde=True, stat="density")

In [None]:
# Number of sales per bedroom count.
# The column is passed by keyword: recent seaborn versions no longer treat a
# positional Series as `x`, and the other plots in this notebook already use
# the x=/data= form.
sns.countplot(x="bedrooms", data=df)

In [None]:
# price between 0 - 2M and bedrooms 2 - 5

In [None]:
# Correlation of every numeric column with price, strongest first.
# At this point df still contains the unparsed string `date` column, which
# makes a plain df.corr() fail on pandas versions that no longer drop
# non-numeric columns silently, so restrict to numeric columns explicitly.
df.select_dtypes(include="number").corr()["price"].sort_values(ascending=False)

In [None]:
# Scatter of living area against sale price on a wide figure.
fig, ax = plt.subplots(figsize=(20, 8))
sns.scatterplot(data=df, x="price", y="sqft_living", ax=ax)

In [None]:
# Plot each sale at its longitude/latitude, coloured by price.
fig, ax = plt.subplots(figsize=(20, 8))
sns.scatterplot(
    data=df,
    x="long",
    y="lat",
    hue="price",
    palette="RdYlGn",
    alpha=0.2,
    edgecolor=None,
    ax=ax,
)

In [None]:
# Keep everything except the most expensive 1% of sales.
# The cut-off is computed from the frame length instead of being hard-coded,
# so it stays at 1% if the number of rows changes.
top_1_percent_count = int(len(df) * 0.01)
bottom_99 = df.sort_values("price", ascending=False).iloc[top_1_percent_count:]

In [None]:
# Same longitude/latitude price map, restricted to the bottom_99 subset.
fig, ax = plt.subplots(figsize=(20, 8))
sns.scatterplot(
    data=bottom_99,
    x="long",
    y="lat",
    hue="price",
    palette="RdYlGn",
    alpha=0.2,
    edgecolor=None,
    ax=ax,
)

In [None]:
# From the coordinate plots, waterfront houses look more expensive, as one would expect.
# Compare the price distribution of waterfront and non-waterfront houses directly.
sns.boxplot(x="waterfront",y="price",data=bottom_99)

In [None]:
# So far we have done a correlation analysis, inspected the lat/long - price relationship, and excluded the most expensive 1% of the houses.

In [None]:
# Look at the first 10 rows again before deciding which columns to drop or transform.
df.head(10)

In [None]:
# what we do is to drop unusable columns and transform remaining ones as necessary. 

The id column is not needed for a prediction model. We also need to transform the date column into year and month; separating them should help us capture seasonal movements.

In [None]:
# Remove the id column from the working frame.
df = df.drop(columns=["id"])

In [None]:
# Inspect the date column before it is parsed.
df["date"]

In [None]:
# Parse the date column with pandas so that year and month can be extracted from it.
df["date"] = pd.to_datetime(df["date"])

In [None]:
# Inspect the date column again after parsing.
df["date"]

In [None]:
# Split the parsed sale date into separate year and month features, then drop
# the original column. The vectorised `.dt` accessor replaces a per-row
# apply(lambda ...) and yields the same values.
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df = df.drop(columns=["date"])

In [None]:
# Price distribution for each sale month.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxenplot(data=df, x="month", y="price", ax=ax)

In [None]:
# Average sale price per month.
# Selecting "price" before aggregating avoids computing the mean of every
# other column only to throw it away; the plotted series is the same.
plt.figure(figsize=(20,8))
df.groupby("month")["price"].mean().plot()

From the figures above, month does not play a vital role, but some months still have a higher average price, especially in the spring-summer season.

In [None]:
# Average sale price per year.
# Selecting "price" before aggregating avoids computing the mean of every
# other column only to throw it away; the plotted series is the same.
plt.figure(figsize=(20,8))
df.groupby("year")["price"].mean().plot()

As we expected, year and price have a linear relationship.

In [None]:
#Also, zipcode requires domain experience about the area itself. Normally we could treat it as a categorical variable.
#However, that would create 70 categories, which is too many to handle.
#But if you are familiar with the area you can group them and reduce the number of categories.
#I prefer to drop that column too. (I accept the resulting bias in my model.)

In [None]:
# Remove the zipcode column from the working frame.
df = df.drop(columns=["zipcode"])

In [None]:
# Preview the first 10 rows after the id, date and zipcode columns have been handled.
df.head(10)

In [None]:
# How often each yr_renovated value occurs.
df["yr_renovated"].value_counts()

In [None]:
#Here we can say most of the data is equal to "0", so it is not a good variable in terms of data quality.
#It is possible to interpret "0" as meaning that the building has never been renovated.
#For example, the basement sqft value also has many "0" entries, which you can read as "there is no basement".

In [None]:
# Start modelling: every column except price becomes a feature, price is the target.
feature_frame = df.drop(columns=["price"])
X = feature_frame.to_numpy()
y = df["price"].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

Now that we have split our data into train/test groups, we need to scale them. I chose MinMaxScaler from sklearn.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the scaler on the training rows only, then rescale those rows with it.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)

I did not fit the scaler on the test set in order to prevent data leakage.

In [None]:
# Rescale the test rows with the scaler that was fitted on the training rows (no re-fitting).
X_test = scaler.transform(X_test)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Regression network: four hidden ReLU layers of 19 units each, followed by a
# single output unit for the predicted price.
hidden_layers = [Dense(19, activation="relu") for _ in range(4)]
model = Sequential(hidden_layers + [Dense(1)])

The training set contains 19 variables, so we start with 19 neurons per layer: 4 hidden layers plus one output layer with a single neuron, because a single value is expected.

In [None]:
# Configure training with the "adam" optimizer and mean-squared-error ("mse") loss.
model.compile(optimizer="adam",loss="mse")

In [None]:
# Train for 250 epochs with a batch size of 128.
# NOTE(review): the test split is also passed as validation_data, so the
# validation loss is computed on the same rows that are later used for the
# final evaluation; consider a separate validation split.
model.fit(x=X_train,y=y_train,validation_data=(X_test,y_test),batch_size=128,epochs=250)

In [None]:
# Put the training history recorded on the model into a DataFrame so it can be plotted.
loss_df = pd.DataFrame(model.history.history)

In [None]:
# Plot the recorded losses to check whether we overfit (validation here is the
# test split) and to see roughly how many epochs are needed.
loss_df.plot()

In [None]:
# Predict prices for the scaled test rows.
preds = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,explained_variance_score

In [None]:
# Mean absolute error of the predictions against the true test prices.
mean_absolute_error(y_test,preds)

In [None]:
#This does not mean anything unless we compare it with the actual data: the average price is 540K and the error is 104K.
#We are about 20% off.

In [None]:
# Explained variance score of the predictions against the true test prices.
explained_variance_score(y_test,preds)

In [None]:
# Predicted vs. actual test prices. The red line plots y_test against itself,
# i.e. where the points would fall if every prediction were exact.
plt.scatter(y_test,preds)
plt.plot(y_test,y_test,"r")

Our model works better for lower-priced houses, and the errors are considerably larger at higher prices.

In [None]:
# Repeat the whole pipeline on the data without the most expensive 1% of sales.
# bottom_99 is rebuilt from the transformed df so it carries the same columns
# that the first model was trained on. The cut-off is computed from the frame
# length instead of being hard-coded, so it stays at 1% if the row count changes.
top_1_percent_count = int(len(df) * 0.01)
bottom_99 = df.sort_values("price", ascending=False).iloc[top_1_percent_count:]
X1 = bottom_99.drop("price", axis=1).values
y1 = bottom_99["price"].values

In [None]:
# Same 70/30 split and seed as for the full data set, applied to the bottom_99 arrays.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Fit a fresh scaler on the bottom_99 training rows only, then rescale both splits with it.
scaler1 = MinMaxScaler().fit(X1_train)
X1_train, X1_test = scaler1.transform(X1_train), scaler1.transform(X1_test)

In [None]:
# Second network, for the bottom_99 data: four hidden ReLU layers of 19 units
# and a single output unit, compiled with the same optimizer and loss as the
# first model.
# Every layer must be added to model1. Adding one to `model` instead would
# leave model1 one hidden layer short and append a stray layer to the
# already-trained first model.
model1 = Sequential()
for _ in range(4):
    model1.add(Dense(19, activation="relu"))

model1.add(Dense(1))

model1.compile(optimizer="adam", loss="mse")

In [None]:
# Train the second model for 500 epochs with a batch size of 64.
# NOTE(review): the test split is also passed as validation_data, so the
# validation loss is computed on the same rows that are later used for the
# final evaluation; consider a separate validation split.
model1.fit(x=X1_train,y=y1_train,validation_data=(X1_test,y1_test),batch_size=64,epochs=500)

In [None]:
# Put the training history recorded on model1 into a DataFrame so it can be plotted.
loss_bott99 = pd.DataFrame(model1.history.history)

In [None]:
# Plot the recorded losses for the bottom_99 model.
loss_bott99.plot()

In [None]:
# Predict prices for the scaled bottom_99 test rows.
preds1 = model1.predict(X1_test)

In [None]:
# Mean absolute error of the bottom_99 predictions against the true test prices.
mean_absolute_error(y1_test,preds1)

In [None]:
# Explained variance score of the bottom_99 predictions against the true test prices.
explained_variance_score(y1_test,preds1)

In [None]:
# Predicted vs. actual prices for the bottom_99 test rows. The red line plots
# y1_test against itself, i.e. where exact predictions would fall.
plt.scatter(y1_test,preds1)
plt.plot(y1_test,y1_test,"r")

In [None]:
#So the result got worse, around 30%. We need to optimize our model: try different epochs and batch sizes, but we may also need to update our data set and do more cleaning.
#We also need to find the optimum neuron structure. To be continued.