In [243]:
import pandas as pd
import numpy as np

In [244]:
# Download the housing CSV; `curl -O` saves it under its remote file name
# (housing.csv) in the current working directory, where the next cell reads it.
!curl -O https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1390k  100 1390k    0     0  3573k      0 --:--:-- --:--:-- --:--:-- 3582k


In [245]:
# Load the raw housing table downloaded by the previous cell.
df = pd.read_csv('housing.csv')
# Normalise the header to lower-case snake_case. The names must come from `df`
# itself: nothing called `data` is defined before this cell, so referencing it
# raises NameError on a fresh kernel.
df.columns = df.columns.str.lower().str.replace(' ', '_')

### Question 1

In [246]:
# Number of missing entries in each column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

### Question 2

In [247]:
# Median of the population column.
np.median(df["population"])

1166.0

### Question 3

In [248]:
# Sizes of the 60/20/20 split; the training part absorbs the rounding remainder.
n = len(df)
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

# Seeded shuffle of the row positions so the split is reproducible.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

In [249]:
# Cut the shuffled positions at the train/val and val/test boundaries,
# then pick the corresponding rows for each split.
idx_train, idx_val, idx_test = np.split(idx, [n_train, n_train + n_val])
df_train = df.iloc[idx_train]
df_val = df.iloc[idx_val]
df_test = df.iloc[idx_test]

In [252]:
# Log-transformed targets (log1p). They are looked up in the untouched `df` by
# each split's row labels, so re-running this cell still finds the target
# column even after it has been removed from the split frames.
y_train = np.log1p(df.loc[df_train.index, "median_house_value"].values)
y_val = np.log1p(df.loc[df_val.index, "median_house_value"].values)
y_test = np.log1p(df.loc[df_test.index, "median_house_value"].values)

# Rebind each split to a target-free frame instead of deleting in place;
# errors="ignore" turns a second run into a no-op rather than a failure.
df_train = df_train.drop(columns="median_house_value", errors="ignore")
df_val = df_val.drop(columns="median_house_value", errors="ignore")
df_test = df_test.drop(columns="median_house_value", errors="ignore")

#### Functions 

In [253]:
def prepare_0(X):
    """Return the numeric feature matrix of ``X`` with missing values set to 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the numeric and boolean columns of ``X``,
        with every NaN replaced by 0.
    """
    # Select by "is numeric" rather than "is not object": string, category and
    # datetime dtypes are not ``object`` either, yet cannot enter the regression.
    numeric = X.select_dtypes(include=["number", "bool"])
    # fillna returns a new frame, so the caller's data stays untouched.
    return numeric.fillna(0)

def prepare_mean(X, fill_values=None):
    """Return the numeric feature matrix of ``X`` with NaNs replaced by column means.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified.
    fill_values : mapping or pandas.Series, optional
        Per-column replacement values keyed by column name. Pass the means of
        the training split here when preparing validation or test data, so no
        statistics of the evaluated split are used. When omitted, the means of
        ``X`` itself are used.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the numeric and boolean columns of ``X`` with
        missing values filled.
    """
    # Same column filter as the zero-fill variant: keep only what the
    # regression can consume.
    numeric = X.select_dtypes(include=["number", "bool"])
    if fill_values is None:
        # DataFrame.mean skips NaNs, giving one fill value per column.
        fill_values = numeric.mean()
    # A single vectorised fillna replaces the per-column loop and avoids
    # assigning into a sliced frame.
    return numeric.fillna(fill_values)

def train_regression(X, y, r):
    """Fit a ridge-regularised linear regression through the normal equations.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (a DataFrame or ndarray). It is not modified.
    y : array-like of shape (n_samples,)
        Target values.
    r : float
        Regularisation strength added to the diagonal of ``X^T X``. It is
        applied to every diagonal entry, including the intercept's.

    Returns
    -------
    tuple
        ``(w0, w)``: the intercept and the array of feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised Gram matrix is singular.
    """
    features = np.asarray(X, dtype=float)
    # Prepend a column of ones so the first weight acts as the intercept.
    design = np.column_stack([np.ones(features.shape[0]), features])
    gram = design.T @ design
    gram += r * np.eye(gram.shape[0])
    # Solve (X^T X + rI) w = X^T y directly; this avoids forming the explicit
    # inverse and the intermediate (n_features + 1, n_samples) product.
    w = np.linalg.solve(gram, design.T @ np.asarray(y, dtype=float))
    return w[0], w[1:]

def RMSE(y_pred, y_tg):
    """Root mean squared error between predictions and targets.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_tg : array-like
        Target values, same length as ``y_pred``.

    Returns
    -------
    float
        ``sqrt(mean((y_pred - y_tg) ** 2))``, averaged over the first axis.

    Raises
    ------
    ValueError
        If the inputs are empty, for which the error is undefined.
    """
    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_tg, dtype=float)
    if errors.size == 0:
        raise ValueError("RMSE is undefined for empty input")
    # numpy's reduction replaces the builtin sum(), which walks the array
    # element by element in Python.
    return np.sqrt(np.mean(errors ** 2, axis=0))

#### Fill NA with zeros

In [254]:
# Baseline: missing values replaced by 0, regularisation r = 0.01.
X_train = prepare_0(df_train)
X_val = prepare_0(df_val)
w0, w = train_regression(X_train, y_train, 0.01)
y_pred = np.dot(X_val, w) + w0
score = RMSE(y_pred, y_val)
round(score, 2)

0.33

#### Fill NA with mean value

In [255]:
# Impute with means learned from the training split only: filling validation
# rows with the validation split's own mean leaks information from the data
# the model is being scored on.
train_means = df_train.mean(numeric_only=True)
w0, w = train_regression(prepare_mean(df_train), y_train, 0.01)
# Validation NaNs are filled with the training means first, so prepare_mean
# is left with only the non-numeric columns to drop.
y_pred = w0 + np.dot(prepare_mean(df_val.fillna(train_means)), w)
score = RMSE(y_pred, y_val)
round(score, 2)

0.33

### Question 4

In [259]:
# Sweep the regularisation strength and record [validation RMSE, r] per value.
r_values = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

# The prepared matrices do not depend on r, so build them once.
X_train = prepare_0(df_train)
X_val = prepare_0(df_val)

scores = []
for r in r_values:
    w0, w = train_regression(X_train, y_train, r)
    y_pred = w0 + np.dot(X_val, w)
    rmse = round(RMSE(y_pred, y_val), 5)
    scores.append([rmse, r])
scores

[[0.32953, 0],
 [0.32953, 1e-06],
 [0.32953, 0.0001],
 [0.32953, 0.001],
 [0.32953, 0.01],
 [0.32969, 0.1],
 [0.33379, 1],
 [0.33925, 5],
 [0.34061, 10]]

### Question 5

In [260]:
# Repeat the split for several seeds and measure how much the validation RMSE
# varies with the choice of seed.
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
scores = []
n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - n_val - n_test

for seed in seeds:
    # Seed, then shuffle a fresh index, so each split depends only on `seed`
    # (the unseeded shuffle that used to precede the loop was always discarded).
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    # Only train and validation are needed to score a seed; the test rows
    # (positions from n_train + n_val onward) are left out.
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train + n_val]]

    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)

    df_train = df_train.drop(columns="median_house_value")
    df_val = df_val.drop(columns="median_house_value")

    w0, w = train_regression(prepare_0(df_train), y_train, 0)
    y_pred = w0 + np.dot(prepare_0(df_val), w)
    score = RMSE(y_pred, y_val)
    scores.append(score)

# Computed once, after the loop, and placed last so the cell displays it.
np.std(scores)

### Question 6

In [261]:
# Final model: reshuffle with seed 9, fit on train + validation, score on test.
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

idx_train, idx_val, idx_test = np.split(idx, [n_train, n_train + n_val])
df_train = df.iloc[idx_train]
df_val = df.iloc[idx_val]
df_test = df.iloc[idx_test]

# pop() hands back the target column and removes it from the frame in one step.
y_train = np.log1p(df_train.pop("median_house_value").values)
y_val = np.log1p(df_val.pop("median_house_value").values)
y_test = np.log1p(df_test.pop("median_house_value").values)

# Merge train and validation (features and targets in the same order).
df_full_train = pd.concat([df_train, df_val])
y_full_train = np.concatenate([y_train, y_val])

X_full_train = prepare_0(df_full_train)
X_test = prepare_0(df_test)
w0, w = train_regression(X_full_train, y_full_train, 0.001)
y_pred = w0 + np.dot(X_test, w)
RMSE(y_pred, y_test)

0.3453168914384796