In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Kaggle house sales prediction dataset. The path is relative, so the CSV must
# be present in the kernel's working directory for this cell to succeed.
df = pd.read_csv('kc_house_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'kc_house_data.csv'

In [None]:
# Summary statistics, transposed so that each column of df becomes one row
df.describe().T

In [None]:
# displot is a figure-level function: it builds its own figure and ignores a
# preceding plt.figure(), which would only leave an empty figure behind.
# The intended 10x6 size is therefore passed through height/aspect.
sns.displot(df['price'], height=6, aspect=10/6)

In [None]:
# Number of houses per bedroom count. The column is passed by keyword because a
# positional Series is interpreted differently across seaborn versions.
sns.countplot(x='bedrooms', data=df)

In [None]:
# Correlation of each numeric column with the price; we can analyze this further.
# Only numeric columns are used: 'date' is still a string at this point and
# would make corr() raise on recent pandas versions.
df.select_dtypes(include='number').corr()['price'].sort_values()

In [None]:
# price vs. sqft_living
sns.scatterplot(data=df, x='price', y='sqft_living')

In [None]:
# Price distribution for each bedroom count
sns.boxplot(data=df, x='bedrooms', y='price')

In [None]:
# price vs. longitude on a large canvas
plt.figure(figsize=(20, 20))
sns.scatterplot(data=df, x='price', y='long')

In [None]:
# price vs. latitude on a large canvas.
# At a certain combination of lat/long there is an expensive housing area.
plt.figure(figsize=(20, 20))
sns.scatterplot(data=df, x='price', y='lat')

In [None]:
# Houses placed by longitude/latitude and coloured by price, so the result
# can be matched against a map.
plt.figure(figsize=(20, 20))
sns.scatterplot(data=df, x='long', y='lat', hue='price')

In [None]:
# Top 20 most expensive houses
df.sort_values(by='price', ascending=False).head(n=20)

In [None]:
# Leave out the top 1% of the houses by price to clear up the map.
# The number of rows to skip is derived from the frame's length instead of
# being hard-coded, so it stays correct if the dataset changes.
TOP_FRACTION = 0.01
n_top = round(len(df) * TOP_FRACTION)
non_top_1_percent = df.sort_values('price', ascending=False).iloc[n_top:]
plt.figure(figsize=(12,8))
sns.scatterplot(x='long',y='lat',data=non_top_1_percent,alpha=.45,palette='RdYlGn',hue='price')

In [None]:
# Price distribution split by the waterfront flag
sns.boxplot(data=df, x='waterfront', y='price')

In [None]:
# 'id' is not a relevant attribute, so remove the column
df = df.drop(columns='id')

In [None]:
# Convert the date strings into datetime values, then display the converted column
df = df.assign(date=pd.to_datetime(df['date']))
df['date']

In [None]:
# Derive year and month features from the parsed date using the vectorised
# .dt accessor instead of a row-by-row apply.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

In [None]:
# Show the first row to check the newly added columns
df.iloc[:1]

In [None]:
# Price distribution for each month of sale
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='month', y='price')

In [None]:
# Mean price per month. 'price' is selected before aggregating so that only
# this column is averaged, not every column of the frame.
df.groupby('month')['price'].mean().plot()

In [None]:
# Mean price per year. 'price' is selected before aggregating so that only
# this column is averaged, not every column of the frame.
df.groupby('year')['price'].mean().plot()

In [None]:
# 'date' has been expanded into 'year' and 'month', so the raw datetime column
# is dropped here. It used to be dropped only in an earlier kernel session
# (the drop was commented out), which left it in the frame on a fresh run.
# 'zipcode' could also be mapped by price, but that requires domain experience.
df = df.drop(['date', 'zipcode'], axis=1)

In [None]:
# Frequency of each value in 'yr_renovated'.
# 0 is an indicator of no renovation; we may categorize on this attribute.
df['yr_renovated'].value_counts()

In [None]:
# Frequency of each value in 'sqft_basement'.
# 0 is an indicator that there is no basement.
df['sqft_basement'].value_counts()

**Data preprocessing**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix and target vector. These assignments were commented out, so
# the train/test split below failed with a NameError on a fresh kernel.
# select_dtypes keeps only numeric columns, so a leftover non-numeric column
# (e.g. the datetime 'date') cannot reach the scaler.
X = df.drop('price', axis=1).select_dtypes(include='number').values
y = df['price'].values

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# The scaler is fitted on the training data only
scaler = MinMaxScaler()
scaler.fit(X=X_train)

In [None]:
# Scale the training features with the fitted scaler.
# NOTE: this overwrites X_train, so the cell should run only once per fit.
X_train = scaler.transform(X=X_train)

In [None]:
# Apply the same scaler (fitted on the training data) to the test features.
# NOTE: this overwrites X_test, so the cell should run only once per fit.
X_test = scaler.transform(X=X_test)

**TensorFlow**

In [None]:
from tensorflow.keras.models import Sequential

In [None]:
from tensorflow.keras.layers import Dense

In [None]:
# Four hidden ReLU layers of 19 units each, followed by a single output unit
# for the predicted price. Trained with Adam on mean squared error.
model = Sequential([
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(1),
])

model.compile(loss='mse', optimizer='adam')

In [None]:
# Train for 400 epochs and track the loss on the test split after every epoch.
# Author's note: batch sizes should be powers of 2; small batch sizes result
# in less overfitting.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    epochs=400,
)

**Model performance evaluation**

In [None]:
# Per-epoch training and validation loss recorded by the last fit call
losses = pd.DataFrame(data=model.history.history)

In [None]:
# Loss curves; an increasing validation loss would indicate overfitting
losses.plot.line()

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,explained_variance_score

In [None]:
# Predicted prices for the held-out test features
predictions = model.predict(x=X_test)

In [None]:
# Mean absolute error on the test set
mean_absolute_error(y_true=y_test, y_pred=predictions)

In [None]:
# Mean squared error on the test set
mean_squared_error(y_true=y_test, y_pred=predictions)

In [None]:
# Root mean squared error on the test set
np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))


In [None]:
# Explained variance score on the test set
explained_variance_score(y_true=y_test, y_pred=predictions)

In [None]:
# Predicted vs. actual prices; the red line is y = x, i.e. a perfect prediction
plt.scatter(x=y_test, y=predictions)
plt.plot(y_test, y_test, 'r')