URL: https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/course-zoomcamp/cohorts/2022/02-regression/homework.md

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the housing dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Datasets/housing.csv')

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Numeric columns kept for the regression; the last entry is the target.
columns = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

In [5]:
# Restrict the frame to the columns listed in `columns` (this drops `ocean_proximity`).
df = df[columns]

In [6]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


# Question 1

In [7]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

Answer: 207

# Question 2

In [8]:
df.population.median()

1166.0

# Question 3

Answer: 1166 (the median of `population` computed in the cell above)

In [9]:
# Seed NumPy's global RNG so the shuffle below produces the same order on every run.
np.random.seed(42)
# Shuffle a list copy of the row labels; the DataFrame itself is not reordered.
index = list(df.index)
np.random.shuffle(index)

In [10]:
n = len(df)

# 20% of the rows go to validation, 20% to test, and the remainder to training.
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

In [11]:
# Slice the shuffled index into consecutive train / validation / test segments.
df_train, df_val, df_test = (
    df.iloc[index[:n_train]],
    df.iloc[index[n_train:n_train + n_val]],
    df.iloc[index[n_train + n_val:]],
)

In [13]:
# Targets are log(1 + median_house_value) for each split.
y_train, y_val, y_test = (
    np.log1p(part.median_house_value.values)
    for part in (df_train, df_val, df_test)
)

In [14]:
# Remove the target column from each split in place so only feature columns remain.
# NOTE: this mutates the frames; re-running the cell without re-creating the splits raises KeyError.
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [16]:
df_train.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
20046,36.06,-119.01,25.0,1505.0,,1392.0,359.0,1.6812
3024,35.14,-119.46,30.0,2943.0,,1565.0,584.0,2.5313
15663,37.8,-122.44,52.0,3830.0,,1310.0,963.0,3.4801
20484,34.28,-118.72,17.0,3051.0,,1705.0,495.0,5.7376
9814,36.62,-121.93,34.0,2351.0,,1063.0,428.0,3.725


In [17]:
df_train.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
dtype: int64

# Linear Regression Functions without regularization

In [19]:
def linear_regression(X):
    """Return the dot product of ``X`` with the module-level weight vector ``w``.

    No bias term is added here; ``w`` must already exist as a global when
    this function is called.
    """
    predictions = X.dot(w)
    return predictions

In [20]:
def train_linear_regression(X, y):
    """Fit linear regression with the normal equation (no regularization).

    A column of ones is prepended to ``X`` so the first coefficient acts as
    the bias.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one row per observation.
    y : numpy.ndarray
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the bias and ``w`` the per-feature weights.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = np.dot(design.T, design)  # Gram matrix X^T X
    gram_inv = np.linalg.inv(gram)

    # Same evaluation order as (X^T X)^-1 X^T, then times y.
    coeffs = np.dot(np.dot(gram_inv, design.T), y)
    return coeffs[0], coeffs[1:]

# RMSE

In [30]:
def rmse(y, y_pred):
    """Root mean squared error between targets ``y`` and predictions ``y_pred``."""
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

# Training with missing values filled with 0

In [46]:
def prepare_X(df):
    """Build the feature matrix from ``df``, filling missing values with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        A data split holding the feature columns named in the module-level
        ``columns`` list.

    Returns
    -------
    numpy.ndarray
        2-D array with one column per feature, in the order of ``columns``.
    """
    # `columns` also names the target, which is deleted from the splits before
    # this is called; selecting it would raise KeyError (and would leak the
    # target into the features if it were still present), so exclude it.
    features = [col for col in columns if col != 'median_house_value']
    return df[features].fillna(0).values

In [48]:
# Fit on the training split and predict on the validation split.
# (The feature matrix is built once; the original built it twice.)
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)

In [49]:
rmse(y_val, y_pred)

0.32953303652313454

# Training with missing values filled with mean

In [50]:
# Re-create the three splits from `df`; the earlier copies had the target column deleted.
df_train, df_val, df_test = (
    df.iloc[index[:n_train]],
    df.iloc[index[n_train:n_train + n_val]],
    df.iloc[index[n_train + n_val:]],
)

In [41]:
# Targets are log(1 + median_house_value) for each split.
y_train, y_val, y_test = (
    np.log1p(part.median_house_value.values)
    for part in (df_train, df_val, df_test)
)

In [42]:
# Remove the target column from each split in place so only feature columns remain.
# NOTE: this mutates the frames; re-running the cell without re-creating the splits raises KeyError.
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [43]:
# Mean of `total_bedrooms` on the training split only (NaNs are skipped by `.mean()`).
total_bedrooms_mean = df_train['total_bedrooms'].mean()
total_bedrooms_mean

533.4803317730147

In [66]:
def prepare_X(df):
    """Build the feature matrix, filling missing ``total_bedrooms`` with the training mean.

    Parameters
    ----------
    df : pandas.DataFrame
        A data split holding the feature columns named in the module-level
        ``columns`` list.

    Returns
    -------
    numpy.ndarray
        2-D array with one column per feature, in the order of ``columns``.

    Notes
    -----
    Reads the module-level ``total_bedrooms_mean``; only ``total_bedrooms`` is
    filled, NaNs in any other column are left as they are.
    """
    # `columns` also names the target, which is deleted from the splits before
    # this is called; selecting it would raise KeyError, so exclude it.
    features = [col for col in columns if col != 'median_house_value']

    # A per-column fillna returns a new frame, avoiding attribute assignment on
    # a column-subset frame (the SettingWithCopy pattern).
    df_features = df[features].fillna({'total_bedrooms': total_bedrooms_mean})
    return df_features.values

In [67]:
# Fit on the training split and predict on the validation split.
# (The feature matrix is built once; the original built it twice.)
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)

In [70]:
rmse(y_val, y_pred)

0.32901954390040344

# Question 4

Answer: Both are equally good (the validation RMSE rounds to 0.33 with either fill strategy)

# Linear Regression Functions with regularization

In [72]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit linear regression with the normal equation plus a diagonal penalty.

    A column of ones is prepended to ``X`` for the bias, and ``r`` is added to
    every diagonal entry of the Gram matrix (including the bias entry) before
    it is inverted.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one row per observation.
    y : numpy.ndarray
        Target values, one per row of ``X``.
    r : float, optional
        Regularization strength added to the diagonal (default 0.001).

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the bias and ``w`` the per-feature weights.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = np.dot(design.T, design)  # Gram matrix X^T X
    penalty = r * np.eye(gram.shape[0])
    gram = gram + penalty

    gram_inv = np.linalg.inv(gram)

    # Same evaluation order as (X^T X + rI)^-1 X^T, then times y.
    coeffs = np.dot(np.dot(gram_inv, design.T), y)
    return coeffs[0], coeffs[1:]

# Training with missing values filled with 0

In [84]:
def prepare_X(df):
    """Build the feature matrix from ``df``, filling missing values with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        A data split holding the feature columns named in the module-level
        ``columns`` list. It is not modified.

    Returns
    -------
    numpy.ndarray
        2-D array with one column per feature, in the order of ``columns``.
    """
    # `columns` also names the target, which is deleted from the splits before
    # this is called; selecting it would raise KeyError, so exclude it.
    features = [col for col in columns if col != 'median_house_value']
    # Column selection and fillna both return new objects, so no explicit
    # copy of `df` is needed to protect the caller's frame.
    return df[features].fillna(0).values

In [85]:
# The feature matrices do not depend on r, so build them once before the loop.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Report the validation RMSE for each regularization strength.
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val.dot(w)

    print(r, ' ', rmse(y_val, y_pred))

0   0.32953303652313454
1e-06   0.32953303616477314
0.0001   0.32953300097040716
0.001   0.3295327038672535
0.01   0.329531936598947
0.1   0.32969472053996707
1   0.33378872200041393
5   0.33924853455042614
10   0.34060638078084016


# Training with missing values filled with mean

In [78]:
# Re-create the three splits from `df`; the earlier copies had the target column deleted.
df_train, df_val, df_test = (
    df.iloc[index[:n_train]],
    df.iloc[index[n_train:n_train + n_val]],
    df.iloc[index[n_train + n_val:]],
)

In [79]:
# Targets are log(1 + median_house_value) for each split.
y_train, y_val, y_test = (
    np.log1p(part.median_house_value.values)
    for part in (df_train, df_val, df_test)
)

In [80]:
# Remove the target column from each split in place so only feature columns remain.
# NOTE: this mutates the frames; re-running the cell without re-creating the splits raises KeyError.
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [81]:
# Mean of `total_bedrooms` on the training split only (NaNs are skipped by `.mean()`).
total_bedrooms_mean = df_train['total_bedrooms'].mean()
total_bedrooms_mean

533.4803317730147

In [82]:
def prepare_X(df):
    """Build the feature matrix, filling missing ``total_bedrooms`` with the training mean.

    Parameters
    ----------
    df : pandas.DataFrame
        A data split holding the feature columns named in the module-level
        ``columns`` list. It is not modified.

    Returns
    -------
    numpy.ndarray
        2-D array with one column per feature, in the order of ``columns``.

    Notes
    -----
    Reads the module-level ``total_bedrooms_mean``; only ``total_bedrooms`` is
    filled, NaNs in any other column are left as they are.
    """
    # `columns` also names the target, which is deleted from the splits before
    # this is called; selecting it would raise KeyError, so exclude it.
    features = [col for col in columns if col != 'median_house_value']

    # A per-column fillna returns a new frame, so neither an explicit copy nor
    # attribute assignment on a column-subset frame is needed.
    df_features = df[features].fillna({'total_bedrooms': total_bedrooms_mean})
    return df_features.values

In [83]:
# Fit the regularized model (default r) on train and score it on validation.
# (The feature matrix is built once; the original built it twice.)
X_train = prepare_X(df_train)
w0, w = train_linear_regression_reg(X_train, y_train)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)

rmse(y_val, y_pred)

0.32901939146802806

Answer: 0.0001

# Question 5

In [93]:
def prepare_X(df):
    """Build the feature matrix from ``df``, filling missing values with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        A data split holding the feature columns named in the module-level
        ``columns`` list. It is not modified.

    Returns
    -------
    numpy.ndarray
        2-D array with one column per feature, in the order of ``columns``.
    """
    # `columns` also names the target, which is deleted from the splits before
    # this is called; selecting it would raise KeyError, so exclude it.
    features = [col for col in columns if col != 'median_house_value']
    # Column selection and fillna both return new objects, so no explicit
    # copy of `df` is needed to protect the caller's frame.
    return df[features].fillna(0).values

In [95]:
# Validation RMSE of the unregularized model for ten different shuffle seeds.
rmse_values = []

# Split sizes depend only on len(df), so compute them once.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

for seed in range(10):
    # Reshuffle the row labels with this seed.
    np.random.seed(seed)
    index = list(df.index)
    np.random.shuffle(index)

    df_train = df.iloc[index[:n_train]]
    df_val = df.iloc[index[n_train:n_train+n_val]]
    df_test = df.iloc[index[n_train+n_val:]]

    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']

    # Build each feature matrix once (the original built X_train twice).
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)

    rmse_values.append(rmse(y_val, y_pred))

In [96]:
std = np.std(rmse_values)

In [97]:
round(std, 3)

0.004

In [98]:
rmse_values

[0.33884304805295895,
 0.33623872559558743,
 0.3320912318844025,
 0.3405153609035518,
 0.33890240665726906,
 0.3434866725719516,
 0.3451980953098988,
 0.33959899274043814,
 0.3466230873192812,
 0.3365926124192119]

# Question 6

In [101]:
# Final evaluation: split with seed 9, train on train+validation, score on test.
np.random.seed(9)
index = list(df.index)
np.random.shuffle(index)

n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

df_train = df.iloc[index[:n_train]]
df_val = df.iloc[index[n_train:n_train+n_val]]
df_test = df.iloc[index[n_train+n_val:]]

y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

# drop=True discards the shuffled labels instead of adding them back as a
# stray `index` column in the combined frame.
df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)
y_full_train = np.concatenate([y_train, y_val])

# Build the feature matrix once (the original built it twice).
X_full_train = prepare_X(df_full_train)
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)

rmse(y_test, y_pred)

0.3453168914389518