In [11]:
import pandas as pd
import numpy as np
import seaborn as sns

In [12]:
df = pd.read_csv('housing.csv')

In [13]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [14]:
df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

In [15]:
del df['ocean_proximity']

In [17]:
# question 1
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [18]:
# question 2
print(df['population'].median())

1195.0


In [19]:
mean = df['population'].mean() # set for use later in scenarios, easier to set now than run within prep function

In [470]:
# Build function for different question scenarios

In [33]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

def train_linear_regression_reg(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [34]:
def prepare_data(seed, fill):
    df = pd.read_csv('housing.csv')
    df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]
    del df['ocean_proximity']

    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test

    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    
    df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
    df_test = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)

    df_train.total_bedrooms.fillna(fill, inplace=True)
    df_val.total_bedrooms.fillna(fill, inplace=True)
    df_test.total_bedrooms.fillna(fill, inplace=True)
    
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)
    
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    return df_train, df_val, df_test, y_train, y_val, y_test

In [35]:
# question 3
for i in [0, mean]:
    df_train, df_val, df_test, y_train, y_val, y_test = prepare_data(42, i)
    w0, w = train_linear_regression(df_train, y_train)
    y_pred = w0 + df_val.dot(w)
    rmse_score = rmse(y_val, y_pred)
    print(rmse_score)

0.34084790341590543
0.34086287089079714


In [39]:
# question 4
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    df_train, df_val, df_test, y_train, y_val, y_test = prepare_data(42, 0)
    w0, w = train_linear_regression_reg(df_train, y_train, r)
    y_pred = w0 + df_val.dot(w)
    rmse_score = rmse(y_val, y_pred)
    print(r, rmse_score, rmse_score.round(2))

0 0.34084790341590543 0.34
1e-06 0.34084790618218563 0.34
0.0001 0.3408481800557182 0.34
0.001 0.34085069218459585 0.34
0.01 0.34087793005233935 0.34
0.1 0.3412862042033381 0.34
1 0.3448958327639019 0.34
5 0.34773980704846963 0.35
10 0.348314983351884 0.35


In [37]:
# question 5
rmse_scores = [];

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] :
    df_train, df_val, df_test, y_train, y_val, y_test = prepare_data(seed, 0)
    w0, w = train_linear_regression(df_train, y_train)
    y_pred = w0 + df_val.dot(w)
    rmse_score = rmse(y_val, y_pred)
    rmse_scores.append(rmse_score)

print(rmse_scores)
print(np.std(rmse_scores).round(3))

[0.3377387160067942, 0.33779993536635067, 0.3384287006757561, 0.3320049468302922, 0.33944518625573017, 0.34338197052508884, 0.3385330211773327, 0.34687476972982134, 0.35127368659591834, 0.33415582665259735]
0.005


In [38]:
# question 6
df_train, df_val, df_test, y_train, y_val, y_test = prepare_data(9, 0)
w0, w = train_linear_regression_reg(df_train, y_train, 0.001)
y_pred = w0 + df_val.dot(w)
rmse_score = rmse(y_val, y_pred)
print(rmse_score)

0.33415301881222803
