<a href="https://colab.research.google.com/github/PremGorecki/NeuralNetwork/blob/main/05_regression/01_housing_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import bibliotek

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling are repeatable on "Restart & Run All".
tf.keras.utils.set_random_seed(0)

# Display options: fixed-point (non-scientific) output for arrays and frames.
np.set_printoptions(precision=12, suppress=True, linewidth=150)
pd.options.display.float_format = '{:.6f}'.format
tf.__version__

'2.8.2'

# 2. Załadowanie danych i wstępna eksploracja

In [None]:
# Location of the housing CSV used throughout this notebook.
HOUSING_CSV_URL = 'https://storage.googleapis.com/esmartdata-courses-files/ann-course/housing.csv'

# Load the untouched raw frame and print its columns, dtypes and non-null counts.
raw_dataset = pd.read_csv(HOUSING_CSV_URL)
raw_dataset.info()

In [None]:
# Good practice: work on a copy so the raw frame stays untouched.
dataset = raw_dataset.copy()
dataset.head()

In [None]:
# Fraction of missing values per column (the mean of a boolean mask is the share of True).
dataset.isnull().mean()

In [None]:
# Less than 1% of the values are missing, so the affected rows can simply be dropped.
# Re-bind the name instead of using `inplace=True`: the result is the same, but the
# cell no longer mutates a frame that earlier cells have already displayed.
dataset = dataset.dropna()

# Confirm that no missing values remain (every fraction should be 0).
dataset.isnull().sum() / len(dataset)

In [None]:
# Summary statistics; by default this covers the numeric columns only.

dataset.describe()

In [None]:
# Summary statistics can also be produced for the non-numeric (object) columns.
dataset.describe(include=['object'])

In [9]:
# Number of rows in each `ocean_proximity` category.
dataset.ocean_proximity.value_counts()

<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: ocean_proximity, dtype: int64

In [None]:
# Distribution of the variable we want to predict.
px.histogram(dataset, x='median_house_value')

In [None]:
# The value 500001 shows up very often; inspect the exact counts.
dataset.median_house_value.value_counts()

In [None]:
# Remove every row whose target equals 500001 (the over-represented value),
# then re-plot the target distribution.
VALUE_TO_DROP = 500001
has_dropped_value = dataset.median_house_value == VALUE_TO_DROP
dataset = dataset.loc[~has_dropped_value]
px.histogram(dataset, x='median_house_value')

In [None]:
# One-hot encode the object column into 0/1 indicator columns. With
# `drop_first=True` the first category is dropped, so k categories become
# k - 1 columns.
#
# `dtype=float` is deliberate: pandas >= 2.0 returns boolean dummies by default,
# and `DataFrame.describe()` leaves boolean columns out of its numeric summary.
# The standardisation step would then have no mean/std for those columns and
# would turn them into NaN.
dataset_dummies = pd.get_dummies(dataset, drop_first=True, dtype=float)
dataset_dummies.head()

# 3. Podział na zbiór treningowy oraz testowy

In [None]:
# 80% of the rows (sampled with a fixed seed) form the training set;
# every row that was not sampled goes to the test set.
train_dataset = dataset_dummies.sample(frac=0.8, random_state=0)
test_dataset = dataset_dummies.drop(index=train_dataset.index)

for split_name, split_frame in (('train_dataset', train_dataset), ('test_dataset', test_dataset)):
    print(f'{split_name} length: {len(split_frame)}')

In [None]:
# Peek at the first rows of the training split.
train_dataset.head()

In [None]:
# Peek at the first rows of the test split.
test_dataset.head()

In [None]:
# Explore the data further: pairwise scatter plots of the target and three
# features, coloured by the target value.
pair_plot_columns = ['median_house_value', 'housing_median_age', 'median_income', 'total_rooms']
px.scatter_matrix(
    train_dataset,
    dimensions=pair_plot_columns,
    color='median_house_value',
    height=700,
)

In [None]:
# Summary statistics of the training features, one row per feature.
# The price column is left out because it is the value we predict, not an input.
train_stats = (
    train_dataset
    .describe()
    .drop(columns='median_house_value')
    .transpose()
)
train_stats

In [None]:
# Separate the prices (labels) from the features. `pop` removes the column from
# the frame in place and returns it, so running this cell a second time without
# re-creating the splits raises a KeyError.
train_labels = train_dataset.pop('median_house_value')
test_labels = test_dataset.pop('median_house_value')

# 4. Standaryzacja danych

In [21]:
def norm(x, stats=None):
    """Standardise the columns of ``x`` to zero mean and unit variance.

    Args:
        x: DataFrame whose columns match the index of ``stats``.
        stats: Frame indexed by feature name with ``'mean'`` and ``'std'``
            columns. Defaults to the module-level ``train_stats``, so existing
            ``norm(x)`` calls behave as before.

    Returns:
        DataFrame of the same shape as ``x`` holding ``(x - mean) / std``.
    """
    if stats is None:
        stats = train_stats
    # A constant feature has std == 0; dividing by it would give NaN/inf.
    # Using 1 instead maps such a column to (x - mean), i.e. zeros on the data
    # the statistics were computed from.
    safe_std = stats['std'].replace(0, 1)
    return (x - stats['mean']) / safe_std

In [22]:
# Note: the statistics come from the training data only; the same transform is
# then applied to the test data as well.
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [None]:
# Sanity check: count NaN values per column after standardisation.
normed_train_data.isnull().sum()

In [25]:
# Convert the standardised frames to plain NumPy arrays for Keras.
# `np.asarray` returns the same data as `.values` for a DataFrame, but it also
# accepts an ndarray unchanged, so re-running this cell no longer fails with
# "'numpy.ndarray' object has no attribute 'values'".
normed_test_data = np.asarray(normed_test_data)
normed_train_data = np.asarray(normed_train_data)

# 5. Budowa modelu

In [26]:
def build_model(input_dim=None):
    """Build and compile the fully connected regression network.

    Architecture: Dense(1024, relu, L2 kernel regulariser) -> Dense(512, relu)
    -> Dense(128, relu) -> Dense(1). The model is compiled with the Adam
    optimiser and MSE loss, and reports MAE and MSE as metrics.

    Args:
        input_dim: Number of input features. Defaults to the number of columns
            of the module-level ``train_dataset``, so ``build_model()`` keeps
            its original behaviour.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        # Fall back to the notebook's global feature frame.
        input_dim = len(train_dataset.keys())

    model = Sequential()
    model.add(Dense(1024, kernel_regularizer='l2', activation='relu', input_shape=[input_dim]))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1))

    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mae', 'mse'])
    return model

In [None]:
# Instantiate the network and print its layers and parameter counts.
model = build_model()
model.summary()

# 6. Trenowanie sieci

In [28]:
# Stop once the validation loss has not improved for 10 consecutive epochs and
# restore the best weights seen; `epochs=150` then acts only as an upper bound.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 20% of the training data is held out as the validation set.
history = model.fit(
    normed_train_data,
    train_labels.values,
    epochs=150,
    validation_split=0.2,
    verbose=1,
    batch_size=32,
    callbacks=[early_stopping],
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [None]:
def plot_hist(history):
    """Plot the training and validation curves of a Keras ``History`` object.

    Two log-scale figures are shown, in this order: MAE vs. validation MAE, and
    RMSE vs. validation RMSE (RMSE is the square root of the recorded MSE).

    Args:
        history: Object exposing ``history`` (dict of per-epoch metric lists
            containing 'mae', 'val_mae', 'mse' and 'val_mse') and ``epoch``.
    """
    metrics_frame = pd.DataFrame(history.history)
    metrics_frame['epoch'] = history.epoch
    for source, derived in (('mse', 'rmse'), ('val_mse', 'val_rmse')):
        metrics_frame[derived] = np.sqrt(metrics_frame[source])

    # (metric column, figure title, y-axis label) for each figure.
    panels = (
        ('mae', 'MAE vs. VAL_MAE', 'Mean Absolute Error'),
        ('rmse', 'RMSE vs. VAL_RMSE', 'Root Mean Squared Error'),
    )
    for metric, title, y_label in panels:
        fig = go.Figure()
        # Training curve first, then its validation counterpart.
        for column in (metric, f'val_{metric}'):
            fig.add_trace(go.Scatter(x=metrics_frame['epoch'], y=metrics_frame[column], name=column, mode='markers+lines'))
        fig.update_layout(width=1000, height=500, title=title, xaxis_title='Epoki', yaxis_title=y_label, yaxis_type='log')
        fig.show()

plot_hist(history)

In [None]:
# Evaluate on the held-out test set and print each metric next to its name.
test_scores = model.evaluate(normed_test_data, test_labels.values)
for name, value in zip(model.metrics_names, test_scores):
    print(f'{name:8}{value:.4f}')

In [None]:
# Predict on the test features; `flatten()` turns the model output into a 1-D array.
test_predictions = model.predict(normed_test_data).flatten()
test_predictions

In [None]:
# Put the true prices and the model's predictions side by side.
pred = test_labels.to_frame().assign(predictions=test_predictions)
pred.head()

In [None]:
# Predicted vs. actual price; the straight line marks y = x, where a perfect
# prediction would fall.
diagonal_ends = [0, 500000]
fig = px.scatter(pred, x='median_house_value', y='predictions')
fig.add_trace(go.Scatter(x=diagonal_ends, y=diagonal_ends, mode='lines'))
fig.show()

In [None]:
# Signed error: actual price minus predicted price (positive means under-prediction).
pred['error'] = pred['median_house_value'] - pred['predictions']
pred.head()

In [None]:
# Distribution of the prediction errors, with a rug plot in the margin.
px.histogram(pred, 'error', marginal='rug', width=1000)