<a target="_blank" href="https://colab.research.google.com/github/PrzemekSekula/DeepLearningClasses1/blob/master/BiasVarianceDemo/Example_2.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>

In [None]:
# Download data.csv if running in Google Colab.
# IN_COLAB is True when the "google.colab" module is already loaded in this interpreter.
import sys
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # Shell escape: fetch the dataset from the course repository.
    !wget https://raw.githubusercontent.com/PrzemekSekula/DeepLearningClasses1/master/BiasVarianceDemo/data.csv

In [None]:
import pandas as pd
import numpy as np

### Data loading

In [None]:
# Read the dataset from the CSV file, report its dimensions,
# and preview the first rows as the cell output.
df = pd.read_csv('./data.csv')
print(df.shape)
df.head()

### Data split
Let's define the features and labels first.

In [None]:
# Features: every column of the dataset except the target column 'price'.
X = df.drop(columns='price')
X.head()

In [None]:
# Labels: the 'price' column.
y = df['price']
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the size of each partition.
print('X train shape:', X_train.shape)
print('y train shape:', y_train.shape)
print('X test shape:', X_test.shape)
print('y test shape:', y_test.shape)

### Model creation, training and testing
We will use $MAPE$ to test the model.

$$MAPE = 100\% \cdot \frac{1}{n} \sum_{i=1}^{n} \left\lvert \frac{ypred_i - y_i}{y_i} \right\rvert$$

Where:
- $y$ - real price
- $ypred$ - estimated price (model's output)
- $n$ - number of examples

In [None]:
def get_mape(model, X, y):
    """Return the mean absolute percentage error (MAPE) of a model, in percent.

    Args:
        model: Object exposing ``predict(X)``; its output is compared
            element-wise against ``y``.
        X: Features passed unchanged to ``model.predict``.
        y: Real target values; each one is used as the divisor of its
            own error term.

    Returns:
        The mean of ``100 * |(y - prediction) / y|`` over all examples.
    """
    predicted = model.predict(X)
    relative_error = (y - predicted) / y
    return np.mean(100 * np.abs(relative_error))

Let's use a random forest.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# min_samples_leaf=1 allows leaves that hold a single training sample.
# random_state fixes the forest's internal randomness so the MAPE values
# reported for this model are the same on every run (the train/test split
# is seeded with 42 as well).
model = RandomForestRegressor(min_samples_leaf=1, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Score the fitted model on both splits, then report the two errors.
train_mape = get_mape(model, X_train, y_train)
test_mape = get_mape(model, X_test, y_test)

print('Train MAPE: {:.3f}%'.format(train_mape))
print('Test MAPE: {:.3f}%'.format(test_mape))


### Question: Overfitting, Underfitting or OK?

### Improving a model

In [None]:
# Require at least 20 training samples in every leaf.
# random_state fixes the forest's internal randomness so the train/test MAPE
# values below are repeatable from run to run.
model = RandomForestRegressor(min_samples_leaf=20, random_state=42)
model.fit(X_train, y_train)

# Error on the data the model was fitted on.
train_mape = get_mape(model, X_train, y_train)
print('Train MAPE: {:.3f}%'.format(train_mape))
# Error on the held-out data.
test_mape = get_mape(model, X_test, y_test)
print('Test MAPE: {:.3f}%'.format(test_mape))