In [69]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')
df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]
df = df[[
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value'
]]

# Question 1

In [70]:
df.columns[df.isna().sum()>0]

Index(['total_bedrooms'], dtype='object')

# Question 2

In [71]:
df['population'].median()

1195.0

# Question 3

In [72]:
import sklearn.model_selection
from sklearn.linear_model import LinearRegression
from sklearn.utils import shuffle
import numpy as np
seed=42
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

df['median_house_value'] = np.log1p(df['median_house_value'])

df_train, df_test = sklearn.model_selection.train_test_split(df, test_size = 0.2, random_state=seed)
df_train, df_val = sklearn.model_selection.train_test_split(df_train, test_size = 0.25, random_state=seed)

In [74]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_zero = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)

y_column = 'median_house_value'
X_columns = df.columns[df.columns != y_column]

X_train = df_train[X_columns]
y_train = df_train[y_column]
model1 = Pipeline([('imputation', imp_zero), ('regression', LinearRegression())])
model1.fit(X_train, y_train)
model1.predict(df_val[X_columns])

X_train = df_train[X_columns]
y_train = df_train[y_column]
model2 = Pipeline([('imputation', imp_mean), ('regression', LinearRegression())])
model2.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error
rms1 = mean_squared_error(df_val[y_column], model1.predict(df_val[X_columns]),squared=True)
rms2 = mean_squared_error(df_val[y_column], model2.predict(df_val[X_columns]),squared=True)

print(f"Input zero RMSE: {round(rms1,2)}")
print(f"Input mean RMSE: {round(rms2,2)}")

Input zero RMSE: 0.11
Input mean RMSE: 0.11


# Question 4

In [9]:
#!pip install hypopt

In [75]:
from sklearn.linear_model import Lasso, Ridge

regression__alphas = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

X_train = df_train[X_columns]
y_train = df_train[y_column]

rmses = {}

for alpha in regression__alphas:
    model3 = Pipeline([('imputation', imp_zero), ('regression', Ridge(alpha))])
    model3.fit(X_train, y_train)
    rms3 = mean_squared_error(df_val[y_column], model3.predict(df_val[X_columns]),squared=True)

    rmses[alpha] = round(rms3,2)

print(rmses)
min(rmses, key=rmses.get)

{0: 0.11, 1e-06: 0.11, 0.0001: 0.11, 0.001: 0.11, 0.01: 0.11, 0.1: 0.11, 1: 0.11, 5: 0.11, 10: 0.11}


0

# Question 5

In [85]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

rmses = []

for seed in seeds:
    df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')
    df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]
    df = df[[
        'latitude',
        'longitude',
        'housing_median_age',
        'total_rooms',
        'total_bedrooms',
        'population',
        'households',
        'median_income',
        'median_house_value'
    ]]

    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df['median_house_value'] = np.log1p(df['median_house_value'])

    df_train, df_test = sklearn.model_selection.train_test_split(df, test_size = 0.2, random_state=seed)
    SimpleImputer, df_val = sklearn.model_selection.train_test_split(df_train, test_size = 0.25, random_state=seed)
    
    X_train = df_train[X_columns]
    y_train = df_train[y_column]
    
    model5 = Pipeline([('imputation', imp_zero), ('regression', LinearRegression())])
    model5.fit(X_train, y_train)
    rms5 = mean_squared_error(df_val[y_column], model5.predict(df_val[X_columns]),squared=False)

    rmses.append(round(rms5,2))

np.std(rmses)

[0.34, 0.34, 0.33, 0.34, 0.33, 0.34, 0.34, 0.34, 0.33, 0.33]


0.00489897948556636

# Question 6

In [53]:
np.random.seed(9)
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')
df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]
df = df[[
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value'
]]

df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

df['median_house_value'] = np.log1p(df['median_house_value'])

df_train, df_test = sklearn.model_selection.train_test_split(df, test_size = 0.2, random_state=seed)

model6 = Pipeline([('imputation', imp_zero), ('regression', Ridge(alpha=0.001))])
model6.fit(X_train, y_train)
rms6 = mean_squared_error(df_test[y_column], model6.predict(df_test[X_columns]),squared=False)
rms6

0.33753410349699725