In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Read the housing CSV straight from GitHub into a DataFrame (needs network access).
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
start_df = pd.read_csv(url)

In [None]:
# Distribution of median_house_value (later used as the regression target), 50 bins.
sns.histplot(start_df.median_house_value, bins=50)

# **Preparing the dataset**

In [None]:
# Step 1: keep only the records located '<1H OCEAN' or 'INLAND'.
allowed_proximity = ['<1H OCEAN', 'INLAND']
is_allowed = start_df['ocean_proximity'].isin(allowed_proximity)
filtered_df = start_df.loc[is_allowed]

# Step 2: restrict to the numeric feature columns plus the target column.
selected_columns = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]
df = filtered_df[selected_columns]

# Question 1

There's one feature with missing values. What is it?

In [None]:
# Number of missing entries in each column of the filtered data.
missing_values = filtered_df.isna().sum()

# Report the first column that has at least one missing entry.
feature_with_missing_values = missing_values.loc[missing_values.gt(0)].index[0]

print("Answer for Question 1:", feature_with_missing_values)

# Question 2
What's the median (50th percentile) of the variable 'population'?

In [None]:
# The median of 'population' is its 50th percentile.
population_column = filtered_df.loc[:, 'population']
median_population = population_column.median()

print("Answer for Question 2:", median_population)

# **Prepare and split the dataset**

In [None]:
def train_linear_regression(X, y, r=0.001):
    """Fit a linear regression with the regularized normal equation.

    A column of ones is prepended to ``X`` for the bias term, ``r`` is added
    to every diagonal entry of ``X^T X`` (the bias entry included), and the
    weights are obtained through an explicit matrix inverse. This is the same
    computation as ``train_linear_regression_reg``; only the default ``r``
    differs.

    Args:
        X: 2-D feature matrix, one row per sample.
        y: target vector with one entry per row of ``X``.
        r: value added to the diagonal of ``X^T X``. Defaults to 0.001.

    Returns:
        Tuple ``(w0, w)``: the bias term and the array of feature weights.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = np.dot(design.T, design)
    gram = gram + r * np.eye(gram.shape[0])

    gram_inv = np.linalg.inv(gram)
    weights = np.dot(np.dot(gram_inv, design.T), y)

    bias, coefficients = weights[0], weights[1:]
    return bias, coefficients

In [None]:
# Split sizes: 20% validation, 20% test, the remainder for training.
n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

# Shuffle the row positions reproducibly (seed 42).
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

# Carve the shuffled frame into train / validation / test with fresh indexes.
df_train = df_shuffled.iloc[:n_train].copy().reset_index(drop=True)
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy().reset_index(drop=True)
df_test = df_shuffled.iloc[n_train + n_val:].copy().reset_index(drop=True)

# Raw targets for each split.
y_train_orig = df_train['median_house_value'].values
y_val_orig = df_val['median_house_value'].values
y_test_orig = df_test['median_house_value'].values

# Targets on the log(1 + y) scale, which the models are trained on.
y_train, y_val, y_test = (
    np.log1p(y_train_orig),
    np.log1p(y_val_orig),
    np.log1p(y_test_orig),
)

In [None]:
# Remove the target column in place so it cannot be used as a feature.
for split_frame in (df_train, df_val, df_test):
    del split_frame['median_house_value']

In [None]:
# Feature columns, in the order they appear in the design matrix.
check = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]


def prepare_X(df, fillna_value):
    """Build the feature matrix from the columns listed in ``check``.

    Args:
        df: DataFrame containing at least the columns in ``check``.
        fillna_value: value substituted for every missing entry.

    Returns:
        2-D NumPy array with one column per entry of ``check``.
    """
    return df[check].fillna(fillna_value).values

In [None]:
def rmse(y, y_pred):
    """Root mean squared error between targets and predictions.

    Args:
        y: array-like of true values (ndarray, list, tuple, ...).
        y_pred: array-like of predictions, broadcastable against ``y``.

    Returns:
        Square root of the mean of the squared element-wise differences.
    """
    # Convert first so plain Python sequences work too; ``list - list`` would
    # otherwise raise TypeError. Values are compared positionally.
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)

    squared_error = (y - y_pred) ** 2
    return np.sqrt(squared_error.mean())

In [None]:
def train_linear_regression_reg(X, y, r=0.0):
    """Solve the normal equation with ``r`` added to the diagonal of ``X^T X``.

    A leading column of ones is added to ``X`` so the first weight acts as the
    bias; the diagonal shift is applied to that bias entry as well.

    Args:
        X: 2-D feature matrix, one row per sample.
        y: target vector with one entry per row of ``X``.
        r: regularization amount added to the diagonal. Defaults to 0.0.

    Returns:
        Tuple ``(w0, w)``: the bias term and the array of feature weights.
    """
    n_rows = X.shape[0]
    X = np.column_stack([np.ones(n_rows), X])

    XTX = np.dot(X.T, X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    w = np.dot(np.dot(np.linalg.inv(XTX), X.T), y)

    return w[0], w[1:]

# **Question 3**

In [None]:
def train_and_evaluate(X_train, X_val, y_train, y_val):
    """Fit on the training split and score on the validation split.

    The model is trained with ``train_linear_regression`` using its default
    ``r``.

    Returns:
        Validation RMSE rounded to two decimals.
    """
    bias, weights = train_linear_regression(X_train, y_train)
    val_predictions = X_val.dot(weights) + bias
    return np.round(rmse(y_val, val_predictions), 2)

# Option A: fill missing values with the training-set mean of total_bedrooms.
# The mean comes from the training split only and is reused for validation.
mean_fill_value = df_train['total_bedrooms'].mean()
X_mean_train, X_mean_val = (
    prepare_X(split, fillna_value=mean_fill_value) for split in (df_train, df_val)
)
rmse_mean_fill = train_and_evaluate(X_mean_train, X_mean_val, y_train, y_val)

# Option B: fill missing values with 0.
X_null_train, X_null_val = (
    prepare_X(split, fillna_value=0) for split in (df_train, df_val)
)
rmse_null_fill = train_and_evaluate(X_null_train, X_null_val, y_train, y_val)

print("RMSE with Mean Fill:", rmse_mean_fill)
print("RMSE with Null Fill:", rmse_null_fill)

Answer for Question 3: Both are equally good

# **Question 4**

In [None]:
# Sweep the regularization strength on the zero-filled features and print
# the strength, the fitted bias and the validation RMSE (2 decimals).
r_candidates = [0, 1e-6, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10]

for r in r_candidates:
    w_0, w = train_linear_regression_reg(X_null_train, y_train, r=r)
    y_null_reg_val = X_null_val.dot(w) + w_0
    rmse_val = np.round(rmse(y_val, y_null_reg_val), decimals=2)
    print(r, w_0, rmse_val)

Answer for Question 4: 0

# **Question 5**

In [None]:
rmse_list = []

# Repeat the split / train / validate cycle for seeds 0..9 (here r is the seed).
for r in range(10):

    # Reproducible shuffle of the row positions for this seed.
    np.random.seed(r)
    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy().reset_index(drop=True)
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy().reset_index(drop=True)
    df_test = df_shuffled.iloc[n_train + n_val:].copy().reset_index(drop=True)

    # pop() returns the target column and removes it from the frame in one step.
    y_train_orig = df_train.pop('median_house_value').values
    y_val_orig = df_val.pop('median_house_value').values
    y_test_orig = df_test.pop('median_house_value').values

    y_train = np.log1p(y_train_orig)
    y_val = np.log1p(y_val_orig)
    y_test = np.log1p(y_test_orig)

    # Zero-fill missing values, fit on train, score on validation.
    X_null_train = prepare_X(df_train, fillna_value=0)
    X_null_val = prepare_X(df_val, fillna_value=0)
    w_0, w = train_linear_regression(X_null_train, y_train)

    y_null_reg_val = X_null_val.dot(w) + w_0
    rmse_val = np.round(rmse(y_val, y_null_reg_val), decimals=2)

    rmse_list.append(rmse_val)

    print(r, w_0, rmse_val)

In [None]:
# Spread of the validation RMSE across the seeds, to three decimals.
np.round(np.std(np.asarray(rmse_list)), decimals=3)

Answer for Question 5: 0.005

# **Question 6**

In [None]:
# Seed used for the final split.
r = 9

# Reproducible shuffle of the row positions.
np.random.seed(r)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy().reset_index(drop=True)

# Train and validation rows are merged into a single training frame.
frames = [df_train, df_val]
df_train_val = pd.concat(frames).reset_index(drop=True)

# pop() returns the target column and removes it from the frame in one step.
y_train_val_orig = df_train_val.pop('median_house_value').values
y_test_orig = df_test.pop('median_house_value').values

y_train_val = np.log1p(y_train_val_orig)
y_test = np.log1p(y_test_orig)

In [None]:
# Zero-filled feature matrices for the combined training data and the test split.
X_null_train_val = prepare_X(df_train_val, fillna_value=0)
X_null_test = prepare_X(df_test, fillna_value=0)

# Fit with r=0.001 on train+validation, then predict the held-out test rows.
w_0_train_val, w_train_val = train_linear_regression_reg(
    X_null_train_val, y_train_val, r=0.001
)
y_null_pred_test = X_null_test.dot(w_train_val) + w_0_train_val

# Test RMSE on the log1p scale, rounded to two decimals.
np.round(rmse(y_test, y_null_pred_test), decimals=2)

Answer for Question 6: 0.33