In [1]:
%load_ext lab_black

In [2]:
# ! wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("housing.csv")

In [5]:
df = df[
    [
        "latitude",
        "longitude",
        "housing_median_age",
        "total_rooms",
        "total_bedrooms",
        "population",
        "households",
        "median_income",
        "median_house_value",
        "ocean_proximity",
    ]
]

In [6]:
df.fillna(0, inplace=True)

In [7]:
df["rooms_per_household"] = df.total_rooms / df.households

In [8]:
df["bedrooms_per_room"] = df.total_bedrooms / df.total_rooms

In [9]:
df["population_per_household"] = df.population / df.households

# Q1

In [10]:
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
import numpy as np

In [13]:
idx = np.arange(len(df))

In [14]:
idx_train_val, idx_test = train_test_split(idx, test_size=0.2, random_state=42)

In [15]:
idx_train, idx_val = train_test_split(idx_train_val, test_size=0.25, random_state=42)

In [16]:
X_train = df.iloc[idx_train].drop("median_house_value", axis=1)
X_val = df.iloc[idx_val].drop("median_house_value", axis=1)
X_test = df.iloc[idx_test].drop("median_house_value", axis=1)

y_train = df.iloc[idx_train].median_house_value
y_val = df.iloc[idx_val].median_house_value
y_test = df.iloc[idx_test].median_house_value

# Q2

In [17]:
X_train.corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
longitude,-0.925005,1.0,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
housing_median_age,0.002477,-0.099812,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,-0.025914,0.036449,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,0.119118,-0.034814,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,-0.124507,0.10232,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


* total_bedrooms and households have highest correlation 0.9794

In [18]:
median_house_value_avg = y_train.mean()
median_house_value_avg

206807.7419250646

In [19]:
y_train_bin = y_train > median_house_value_avg
y_val_bin = y_val > median_house_value_avg
y_test_bin = y_test > median_house_value_avg

In [20]:
from sklearn.metrics import mutual_info_score

In [21]:
from functools import partial

In [22]:
mutual_info_score(y_train_bin, X_train.total_rooms)

0.2682495050746932

# Q3

In [23]:
X_train.apply(partial(mutual_info_score, y_train_bin), axis=0)



latitude                    0.156040
longitude                   0.178880
housing_median_age          0.013672
total_rooms                 0.268250
total_bedrooms              0.085501
population                  0.175275
households                  0.079897
median_income               0.521072
ocean_proximity             0.101384
rooms_per_household         0.651019
bedrooms_per_room           0.654318
population_per_household    0.637922
dtype: float64

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
from sklearn.feature_extraction import DictVectorizer

In [26]:
dicts_train = X_train[["ocean_proximity"]].to_dict(orient="records")
dicts_val = X_val[["ocean_proximity"]].to_dict(orient="records")
dicts_test = X_test[["ocean_proximity"]].to_dict(orient="records")

In [27]:
dv = DictVectorizer(sparse=False)

In [28]:
dv.fit(dicts_train)

In [29]:
dv.transform(dicts_train)

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [30]:
X_train_arr = np.concatenate(
    [X_train.drop("ocean_proximity", axis=1).values, dv.transform(dicts_train)], axis=1
)
X_val_arr = np.concatenate(
    [X_val.drop("ocean_proximity", axis=1).values, dv.transform(dicts_val)], axis=1
)
X_test_arr = np.concatenate(
    [X_test.drop("ocean_proximity", axis=1).values, dv.transform(dicts_test)], axis=1
)

In [52]:
X_train_arr.shape

(12384, 16)

In [40]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [41]:
model.fit(X_train_arr, y_train_bin.values)

In [42]:
from sklearn.metrics import accuracy_score

# Q4

In [44]:
accuracy_score(model.predict(X_val_arr), y_val_bin.values)

0.8364825581395349

In [46]:
accuracies = []

for feature in X_train.columns:
    if feature == "ocean_proximity":
        X_train_arr = X_train.drop(["ocean_proximity", feature], axis=1).values
        X_val_arr = X_val.drop(["ocean_proximity", feature], axis=1).values
        X_test_arr = X_test.drop(["ocean_proximity", feature], axis=1).values
    else:
        X_train_arr = np.concatenate(
            [
                X_train.drop(["ocean_proximity", feature], axis=1).values,
                dv.transform(dicts_train),
            ],
            axis=1,
        )
        X_val_arr = np.concatenate(
            [
                X_val.drop(["ocean_proximity", feature], axis=1).values,
                dv.transform(dicts_val),
            ],
            axis=1,
        )
        X_test_arr = np.concatenate(
            [
                X_test.drop(["ocean_proximity", feature], axis=1).values,
                dv.transform(dicts_test),
            ],
            axis=1,
        )

    model.fit(X_train_arr, y_train_bin.values)
    val_accuracy = accuracy_score(model.predict(X_val_arr), y_val_bin.values)
    print(f"Eliminated {feature} - accuracy is {val_accuracy}.")
    accuracies.append({"feature": feature, "accuracy": val_accuracy})

Eliminated latitude - accuracy is 0.8343023255813954.
Eliminated longitude - accuracy is 0.8326065891472868.
Eliminated housing_median_age - accuracy is 0.8313953488372093.
Eliminated total_rooms - accuracy is 0.8364825581395349.
Eliminated total_bedrooms - accuracy is 0.8376937984496124.
Eliminated population - accuracy is 0.8263081395348837.
Eliminated households - accuracy is 0.8328488372093024.
Eliminated median_income - accuracy is 0.7856104651162791.
Eliminated ocean_proximity - accuracy is 0.8197674418604651.
Eliminated rooms_per_household - accuracy is 0.8364825581395349.
Eliminated bedrooms_per_room - accuracy is 0.8352713178294574.
Eliminated population_per_household - accuracy is 0.8355135658914729.


# Q5

In [48]:
pd.DataFrame(accuracies).sort_values("accuracy")

Unnamed: 0,feature,accuracy
7,median_income,0.78561
8,ocean_proximity,0.819767
5,population,0.826308
2,housing_median_age,0.831395
1,longitude,0.832607
6,households,0.832849
0,latitude,0.834302
10,bedrooms_per_room,0.835271
11,population_per_household,0.835514
3,total_rooms,0.836483


In [49]:
y_train_log = np.log1p(y_train.values)
y_val_log = np.log1p(y_val.values)
y_test_log = np.log1p(y_test.values)

In [50]:
from sklearn.linear_model import Ridge

In [51]:
X_train_arr = np.concatenate(
    [X_train.drop("ocean_proximity", axis=1).values, dv.transform(dicts_train)], axis=1
)
X_val_arr = np.concatenate(
    [X_val.drop("ocean_proximity", axis=1).values, dv.transform(dicts_val)], axis=1
)
X_test_arr = np.concatenate(
    [X_test.drop("ocean_proximity", axis=1).values, dv.transform(dicts_test)], axis=1
)

In [52]:
from sklearn.metrics import mean_squared_error

# Q6

In [55]:
X_train_arr

array([[  34.43, -119.67,   39.  , ...,    0.  ,    0.  ,    0.  ],
       [  33.74, -118.32,   24.  , ...,    0.  ,    0.  ,    1.  ],
       [  39.13, -121.62,   41.  , ...,    0.  ,    0.  ,    0.  ],
       ...,
       [  32.74, -116.99,   18.  , ...,    0.  ,    0.  ,    0.  ],
       [  33.84, -117.87,   16.  , ...,    0.  ,    0.  ,    0.  ],
       [  33.91, -118.32,   35.  , ...,    0.  ,    0.  ,    0.  ]])

In [53]:
for alpha in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=alpha, solver="sag", random_state=42)
    model.fit(X_train_arr, y_train_log)
    val_rmse = mean_squared_error(model.predict(X_val_arr), y_val_log, squared=False)

    print(f"alpha {alpha}, rmse {val_rmse}")

alpha 0, rmse 0.524063570701514
alpha 0.01, rmse 0.524063570718629
alpha 0.1, rmse 0.5240635708812071
alpha 1, rmse 0.5240635725155536
alpha 10, rmse 0.5240635888333284
