In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [47]:
df=pd.read_csv(r" https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv")

In [48]:
df=df[df['ocean_proximity'].isin(['<1H OCEAN','INLAND'])]

In [50]:
df=df[["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", 
       "population", "households", "median_income","median_house_value"]].reset_index(drop=True)

Find a feature with missing values. How many missing values does it have?

In [51]:
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

What's the median (50% percentile) for variable 'population'?

In [52]:
median = np.percentile(df.population, 50)
print(median)

1195.0


In [53]:

n = df.shape[0]
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [54]:
y_train_or = df_train.median_house_value.values
y_val_or = df_val.median_house_value.values
y_test_or = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

y_train = np.log1p(y_train_or)
y_val = np.log1p(y_val_or)
y_test = np.log1p(y_test_or)

In [55]:
df_train.isna().sum()

longitude              0
latitude               0
housing_median_age     0
total_rooms            0
total_bedrooms        94
population             0
households             0
median_income          0
dtype: int64

In [56]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [57]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9413 entries, 0 to 9412
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           9413 non-null   float64
 1   latitude            9413 non-null   float64
 2   housing_median_age  9413 non-null   float64
 3   total_rooms         9413 non-null   float64
 4   total_bedrooms      9319 non-null   float64
 5   population          9413 non-null   float64
 6   households          9413 non-null   float64
 7   median_income       9413 non-null   float64
dtypes: float64(8)
memory usage: 588.4 KB


In [58]:
df_train_mean=df_train.copy()
df_val_mean=df_val.copy()

In [59]:
mean = df_train.total_bedrooms.mean()
df_train_mean['total_bedrooms'].fillna(mean,inplace=True)
w_0_mean, w_mean = train_linear_regression(df_train_mean, y_train)

Question 3: Missing values

In [60]:
df_val_mean['total_bedrooms']=df_val_mean['total_bedrooms'].fillna(mean)
y_mean_pred_val = w_0_mean +df_val_mean.dot(w_mean)
np.round(rmse(y_val, y_mean_pred_val),2)

0.34

In [61]:
df_train_zero=df_train.copy()
df_val_zero=df_val.copy()
df_train_zero['total_bedrooms'].fillna(0,inplace=True)
w_0_mean, w_mean = train_linear_regression(df_train_zero, y_train)
df_val_zero['total_bedrooms']=df_val_zero['total_bedrooms'].fillna(0)
y_mean_pred_val = w_0_mean +df_val_zero.dot(w_mean)
np.round(rmse(y_val, y_mean_pred_val),2)

0.34

Both are equally good

Question 4: Regularization

In [62]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [63]:
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [64]:
df_train_zero.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-119.14,36.23,22.0,2935.0,523.0,1927.0,530.0,2.5875
1,-117.79,34.12,16.0,2426.0,426.0,1319.0,446.0,4.8125
2,-117.97,33.68,26.0,3653.0,568.0,1930.0,585.0,5.7301
3,-118.03,34.1,32.0,2668.0,609.0,1512.0,541.0,2.9422
4,-121.87,37.34,39.0,2479.0,541.0,1990.0,506.0,2.4306


In [65]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(df_train_zero, y_train, r=r)
    y_zero_reg_va = w_0 + df_val_zero.dot(w)
    rmse_val = np.round(rmse(y_val, y_zero_reg_va),2)
    print(r, w_0, rmse_val)

0 -9.763249477825624 0.34
1e-06 -9.763228830975619 0.34
0.0001 -9.761185235427776 0.34
0.001 -9.742646250116293 0.34
0.01 -9.561056193231325 0.34
0.1 -8.058889769553687 0.34
1 -3.1331542783986586 0.34
5 -0.841086797552381 0.35
10 -0.4381172315561054 0.35


The answer its r=0


Question 5:  Shuffling


In [69]:
rmse_list = []

for r in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:

    idx = np.arange(n)
    np.random.seed(r)
    np.random.shuffle(idx)
    df=df[["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", 
       "population", "households", "median_income","median_house_value"]]
    base=["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", 
       "population", "households", "median_income"]

    df_shuffled = df.iloc[idx]
    
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    
    y_train_orig = df_train.median_house_value.values
    y_val_orig = df_val.median_house_value.values
    y_test_orig = df_test.median_house_value.values

    y_train = np.log1p(y_train_orig)
    y_val = np.log1p(y_val_orig)
    y_test = np.log1p(y_test_orig)
    
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    X_null_train = prepare_X(df_train)
    w_0, w = train_linear_regression(X_null_train, y_train)
    
    X_null_val = prepare_X(df_val)
    y_null_reg_val = w_0 + X_null_val.dot(w)
    rmse_val = np.round(rmse(y_val, y_null_reg_val),2)
    
    rmse_list.append(rmse_val)
    
    print(r, w_0, rmse_val)
    print("std : ", np.round(np.std(rmse_list),3))

0 -9.87006897147586 0.34
std :  0.0
1 -9.114011209299143 0.34
std :  0.0
2 -9.911560201173312 0.34
std :  0.0
3 -10.233183681569443 0.33
std :  0.004
4 -9.174763450440548 0.34
std :  0.004
5 -10.00664830321511 0.34
std :  0.004
6 -10.039903099546635 0.34
std :  0.003
7 -10.353545444732113 0.35
std :  0.005
8 -9.899416167072257 0.35
std :  0.006
9 -9.964622125894088 0.33
std :  0.006


## The answer for the question 5 its 0.005

Question 6: RMSE on test

In [70]:
seed = 9
idx = np.arange(n)
np.random.seed(seed)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]
    
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

df_train_val = pd.concat([df_train, df_val])

df_train_val = df_train_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train_val_orig = df_train_val.median_house_value.values
y_test_orig = df_test.median_house_value.values

y_train_val = np.log1p(y_train_val_orig)
y_test = np.log1p(y_test_orig)

del df_train_val['median_house_value']
del df_test['median_house_value']

In [71]:
X_zero_train_val = prepare_X(df_train_val)
w_0_train_val, w_train_val = train_linear_regression_reg(X_zero_train_val, y_train_val, r=0.001)

X_zero_test = prepare_X(df_test)
y_zero_pred_test = w_0_train_val + X_zero_test.dot(w_train_val)

np.round(rmse(y_test, y_zero_pred_test),2)

0.33