In [None]:
import pandas as pd
import numpy as np

In [None]:
# Uncomment this if you are using Google Colab
#!wget https://raw.githubusercontent.com/PrzemekSekula/DeepLearningClasses1/master/BiasVarianceDemo/data.csv
#!wget https://raw.githubusercontent.com/PrzemekSekula/DeepLearningClasses1/master/BiasVarianceDemo/helper.py

### Data loading

In [None]:
# Read the dataset from the CSV file that sits next to this notebook.
df = pd.read_csv('./data.csv')
# Show (rows, columns) first, then let the cell display a preview of the top rows.
print(df.shape)
df.head()

### Data analysis
Usually we start an ML project with data analysis. In this notebook we assume that the price of an apartment depends only on its area, so let's check the relationship visually.

In [None]:
# Plot price against area to eyeball how strongly the two are related.
df.plot.scatter(x='area', y='price')

### Data preparation
We want to create a model that estimates the price of an apartment given its area. This can be written as:
$$price = f(area)$$
or 
$$y = f(X)$$
To train the model we need to prepare our features ($X$) and labels ($y$)

In [None]:
# Double brackets keep X two-dimensional (a one-column DataFrame);
# single brackets make y a one-dimensional Series.
X, y = df[['area']], df['price']
print('X shape:', X.shape)
print('y shape:', y.shape)

### Model creation and training
Let's start with the simplest model possible.
**Question: Is this a classification or a regression task?**


In [None]:
# Ordinary least-squares linear regression: the simplest baseline for predicting a continuous value.
from sklearn.linear_model import LinearRegression
# Create an untrained estimator; fitting happens in the next cell.
model = LinearRegression()

In [None]:
# Train the model on features X and labels y; fit() returns the estimator, which the cell displays.
model.fit(X, y)

### Let's see how the model works

In [None]:
# Query the model with a one-row DataFrame that carries the training column name.
# A bare [[50]] makes scikit-learn warn that X lacks the feature names the model was fitted with.
model.predict(pd.DataFrame({'area': [50]}))

In [None]:
def get_price(model, area):
    """Print the price that ``model`` predicts for an apartment of a given area.

    Args:
        model: Fitted estimator exposing ``predict``; it is queried with a
            single sample that has one feature (the area).
        area: Apartment area, reported in the message as m^2.

    Returns:
        None. The result is only printed.
    """
    # Estimators fitted on a DataFrame remember the column names in
    # ``feature_names_in_``. Reusing them avoids scikit-learn's
    # "X does not have valid feature names" warning; estimators fitted on
    # plain arrays keep receiving a plain nested list.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        sample = [[area]]
    else:
        sample = pd.DataFrame([[area]], columns=list(feature_names))

    price = model.predict(sample)
    print('Price for {}m^2 apartment is: {:.2f} thous. USD'.format(area, price[0]))

# Print the predicted price for a small, a medium and a large apartment.
for area in (50, 100, 150):
    get_price(model, area)

In [None]:
# helper.py is a separate module shipped with this notebook (Colab users fetch it in the download cell above).
import helper
# NOTE(review): plot_model presumably draws the model's predictions against the data in df;
# helper.py is not visible here, so confirm against its source.
helper.plot_model(model, df)


### How good is the model
To estimate the quality of the model we can use the Mean Absolute Percentage Error (MAPE):
$$MAPE = 100\% * \frac{1}{n} \sum_{i=1}^n\displaystyle\left\lvert \frac{ypred_i-y_i}{y_i}\right\rvert$$

Where:
- $y$ - real price
- $ypred$ - estimated price (model's output)
- $n$ - number of examples

In [None]:
def print_mape(model, X, y):
    """Print the Mean Absolute Percentage Error of ``model`` on ``(X, y)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Features passed unchanged to ``model.predict``.
        y: True target values; each one is used as a divisor.

    Returns:
        None. The error is only printed, as a percentage with one decimal.
    """
    predictions = model.predict(X)
    # Relative size of each miss, measured against the true value.
    relative_errors = np.abs((y - predictions) / y)
    mape = np.mean(100 * relative_errors)
    message = "MAPE error: {:.1f}%".format(mape)
    print(message)

In [None]:
# Score the model on the same X and y it was fitted on, i.e. this is a training error.
print_mape(model, X, y)

### More complicated model
Linear regression is a relatively simple model that produces only a line. Let's try something more complicated.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# min_samples_leaf=25 requires every leaf to hold at least 25 training samples, which smooths the fit.
# random_state pins the forest's bootstrap sampling so the MAPE below is the same on every
# Restart & Run All instead of drifting from run to run.
model = RandomForestRegressor(min_samples_leaf=25, random_state=42)
model.fit(X, y)
print_mape(model, X, y)

In [None]:
# Visualize the current model. NOTE(review): what plot_model draws is defined in helper.py, not visible here.
helper.plot_model(model, df)

## More inputs
To increase the accuracy, we may also consider other features, such as crime rate or proximity to the metro

In [None]:
# Use every column except the label as a feature, then preview the new feature table.
X = df.drop(columns='price')
X.head()

In [None]:
# min_samples_leaf=10 requires every leaf to hold at least 10 training samples.
# random_state pins the forest's bootstrap sampling and feature sampling so the reported MAPE
# is reproducible across runs.
model = RandomForestRegressor(min_samples_leaf=10, random_state=42)
model.fit(X, y)
print_mape(model, X, y)