In [1]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
df = pd.read_csv("housing.csv")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Data Preparation

In [4]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [5]:
df = df.fillna(0)

In [6]:
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']

### Question 1

In [7]:
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

# Spliting the data

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [10]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [11]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [12]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [13]:
y_train = df_train.median_house_value
y_val = df_val.median_house_value
y_test = df_test.median_house_value

In [14]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

### Question 2

In [15]:
corr = df_train.corr() 

In [16]:
corr.unstack().sort_values(kind='quicksort').nlargest(20)[11:]

total_bedrooms  households        0.965051
households      total_bedrooms    0.965051
total_rooms     total_bedrooms    0.917606
total_bedrooms  total_rooms       0.917606
total_rooms     households        0.916417
households      total_rooms       0.916417
population      households        0.903207
households      population        0.903207
total_bedrooms  population        0.860477
dtype: float64

In [17]:
# converting median_house_value to binary value
avg = y_train.mean()
above_average = y_train.apply(lambda x: x > avg).astype(int)

### Question 3

In [18]:
from sklearn.metrics import mutual_info_score

In [19]:
mutual_info_score(df_train.ocean_proximity, y_train).round(2)

0.58

### Question 4

In [20]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [21]:
dv = DictVectorizer(sparse=False)

train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [22]:
y_train_cat = y_train.apply(lambda x: x > avg).astype(int) 
y_val_cat = y_val.apply(lambda x: x > avg).astype(int) 
y_test_cat = y_test.apply(lambda x: x > avg).astype(int) 

In [23]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train_cat)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [24]:
y_val_pred = model.predict_proba(X_val)[:, 1].round(2)
y_val_pred_binary = (y_val_pred >= 0.5).astype(int)

val_accuracy = (y_val_pred_binary == y_val_cat).mean()
val_accuracy

0.8238856589147286

### Question 5

In [25]:
columns = ['total_rooms', 'total_bedrooms', 'population', 'households']
X_small = df_train.loc[:, columns]

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_small.values, y_train_cat)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [26]:
X_val_small = df_val.loc[:, columns]
y_val_pred = model.predict_proba(X_val_small.values)[:, 1].round(2)

y_val_pred = (y_val_pred >= 0.5).astype(int)
val_acc = (y_val_pred == y_val_cat).mean()
val_acc

0.6848352713178295

In [27]:
accuracy = []
for col in X_small.columns:
    X_train_small = X_small.drop(col, axis=1)
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_small, y_train_cat)
    
    X_val_s = X_val_small.drop(col, axis=1) 
    y_val_pred = model.predict_proba(X_val_s)[:, 1].round(2)

    y_val_pred = (y_val_pred >= 0.5).astype(int)
    new_val_acc = (y_val_pred == y_val_cat).mean()
    
    diff_acc = val_acc - new_val_acc
    
    accuracy.append(f"{col}: {diff_acc}")

In [28]:
accuracy

['total_rooms: 0.07437015503875977',
 'total_bedrooms: 0.036579457364341095',
 'population: 0.05256782945736438',
 'households: 0.02083333333333337']

### Question 6

In [29]:
from sklearn.linear_model import Ridge

In [30]:
y_train_log = np.log(y_train)
y_val_log = np.log(y_val)

In [32]:
def rmse(y, y_pred):
    m = len(y)
    sq_err = (y - y_pred) ** 2
    m_sq_err = np.sum(sq_err) / m
    rm_sq_err = np.sqrt(m_sq_err)
    
    return rm_sq_err


alpha_values = [0, 0.01, 0.1, 1, 10]
error = []
for a in alpha_values:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train_log)
    
    y_val_pred = model.predict(X_val) 
    err = rmse(y_val_log, y_val_pred).round(3)
    error.append(err)

In [33]:
error

[0.531, 0.531, 0.531, 0.531, 0.531]