In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Source of the California housing data used throughout the notebook.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
df = pd.read_csv(DATA_URL)

In [4]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# Column dtypes: distinguishes the numeric features from the object (string) column.
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [6]:
# Number of rows in the dataset.
len(df)

20640

In [7]:
# Count missing values per column to see which features need imputation.
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [8]:
# Impute missing bedroom counts with 0.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is enabled.
df['total_bedrooms'] = df['total_bedrooms'].fillna(0)

In [9]:
# Re-count missing values per column after filling total_bedrooms.
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

Create a new column rooms_per_household by dividing the column total_rooms by the column households from the dataframe.
Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from the dataframe.
Create a new column population_per_household by dividing the column population by the column households from the dataframe.

In [10]:
# Derived ratio features: new column -> (numerator column, denominator column).
ratio_features = {
    'rooms_per_household': ('total_rooms', 'households'),
    'bedrooms_per_room': ('total_bedrooms', 'total_rooms'),
    'population_per_household': ('population', 'households'),
}
for new_col, (numerator, denominator) in ratio_features.items():
    df[new_col] = df[numerator].div(df[denominator])

What is the most frequent observation (mode) for the column ocean_proximity?

In [11]:
# Frequency of each category; the category with the largest count is the mode.
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

Create the correlation matrix for the numerical features of your train dataset.
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
What are the two features that have the biggest correlation in this dataset?

In [12]:
# Pearson correlation between every pair of numeric features.
# `df` still contains the string column `ocean_proximity`; since pandas 2.0
# DataFrame.corr no longer drops non-numeric columns silently and raises
# instead, so select the numeric columns explicitly first.
df.select_dtypes(include='number').corr(method='pearson', min_periods=1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.924664,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.045967,-0.02754,0.084836,0.002476
latitude,-0.924664,1.0,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,-0.14416,0.106389,-0.104112,0.002366
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,0.105623,-0.153277,0.125396,0.013191
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.134153,0.133798,-0.174583,-0.024581
total_bedrooms,0.068082,-0.065318,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.049148,0.002717,0.122205,-0.028019
population,0.099773,-0.108785,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.02465,-0.072213,0.031397,0.069863
households,0.05531,-0.071035,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,0.065843,-0.080598,0.059818,-0.027309
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.688075,0.326895,-0.573836,0.018766
median_house_value,-0.045967,-0.14416,0.105623,0.134153,0.049148,-0.02465,0.065843,0.688075,1.0,0.151948,-0.238759,-0.023737
rooms_per_household,-0.02754,0.106389,-0.153277,0.133798,0.002717,-0.072213,-0.080598,0.326895,0.151948,1.0,-0.387465,-0.004852


Make median_house_value binary
We need to turn the median_house_value variable from numeric into binary.
Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.

In [13]:
# Binary target: 1 where the house value is strictly above the column mean, else 0.
# The mean is computed once; the original comprehension re-evaluated it for
# every row, making the cell quadratic in the number of rows.
mean_house_value = df['median_house_value'].mean()
df['above_average'] = (df['median_house_value'] > mean_house_value).astype('int64')

In [14]:
# Inspect the newly created binary target column.
df['above_average']

0        1
1        1
2        1
3        1
4        1
        ..
20635    0
20636    0
20637    0
20638    0
20639    0
Name: above_average, Length: 20640, dtype: int64

Split your data in train/val/test sets, with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value (median_house_value) is not in your dataframe.

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Seed NumPy's global random generator. The train_test_split calls in the next
# cell also pass random_state=42 explicitly.
np.random.seed(42)

In [17]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (20% of the whole) for validation.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Binary targets, extracted before the columns are removed from the features.
y_train, y_val, y_test = (
    part.above_average.values for part in (df_train, df_val, df_test)
)

# Remove the raw price and its binarised form from every feature frame.
for part in (df_train, df_val, df_test):
    for target_col in ('median_house_value', 'above_average'):
        del part[target_col]

Calculate the mutual information score with the (binarized) 
price for the categorical variable that we have. Use the training set only.

What is the value of mutual information?
Round it to 2 decimal digits using round(score, 2)

In [18]:
from sklearn.metrics import mutual_info_score

In [19]:
# Mutual information between the categorical feature and the binary target,
# computed on the training split only.
mutual_info_score(df_train['ocean_proximity'], y_train)

0.10138385763624205

Now let's train a logistic regression
Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.

Fit the model on the training dataset.

To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [20]:
# One-hot encode `ocean_proximity` in each split, then force validation and
# test onto the training column layout. Encoding the splits independently can
# produce different columns when a rare category (ISLAND has only 5 rows in the
# full data) is absent from a split; reindexing adds any missing dummy column
# as all zeros and keeps the column order the model is fitted on.
df_train = pd.get_dummies(data=df_train, columns=["ocean_proximity"], prefix=["o"])
df_val = (
    pd.get_dummies(data=df_val, columns=["ocean_proximity"], prefix=["o"])
    .reindex(columns=df_train.columns, fill_value=0)
)
df_test = (
    pd.get_dummies(data=df_test, columns=["ocean_proximity"], prefix=["o"])
    .reindex(columns=df_train.columns, fill_value=0)
)

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Baseline classifier on all features, with the parameters prescribed for reproducibility.
lr_params = dict(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**lr_params)
model.fit(df_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [23]:
from sklearn import metrics

In [24]:
# Predicted probability of the positive class for each validation row.
# NOTE(review): the accuracy cell below uses model.score directly and does not
# read `y_pred`.
y_pred = model.predict_proba(df_val)[:, 1]

In [25]:
# Baseline validation accuracy with all features; the feature-elimination
# cells below compare against this value.
start_score = model.score(df_val, y_val)
print(start_score)

0.8364825581395349


Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of the following features has the smallest difference?

total_rooms
total_bedrooms
population
households

note: the difference doesn't have to be positive

In [26]:
# Feature elimination: retrain without `total_rooms` and report the change in
# validation accuracy relative to the full-feature baseline.
dropped = ['total_rooms']
tr_train = df_train.drop(columns=dropped)
tr_val = df_val.drop(columns=dropped)
model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(tr_train, y_train)
tr_score = model.score(tr_val, y_val)
print(round(tr_score - start_score, 5))

0.0


In [27]:
# Feature elimination: retrain without `total_bedrooms` and report the change
# in validation accuracy relative to the full-feature baseline.
dropped = ['total_bedrooms']
tb_train = df_train.drop(columns=dropped)
tb_val = df_val.drop(columns=dropped)
model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(tb_train, y_train)
tb_score = model.score(tb_val, y_val)
print(round(tb_score - start_score, 5))

0.0


In [28]:
# Feature elimination: retrain without `population` and report the change in
# validation accuracy relative to the full-feature baseline.
dropped = ['population']
p_train = df_train.drop(columns=dropped)
p_val = df_val.drop(columns=dropped)
model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(p_train, y_train)
p_score = model.score(p_val, y_val)
print(p_score - start_score)

-0.01017441860465118


In [29]:
# Feature elimination: retrain without `households` and report the change in
# validation accuracy relative to the full-feature baseline.
dropped = ['households']
h_train = df_train.drop(columns=dropped)
h_val = df_val.drop(columns=dropped)
model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(h_train, y_train)
h_score = model.score(h_val, y_val)
print(h_score - start_score)

-0.003875968992248069


For this question, we'll see how to use a linear regression model from Scikit-Learn.
We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.

Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]

Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
If there are multiple options, select the smallest alpha.

In [33]:
# One-hot encode the categorical column on the full frame before splitting.
# The guard makes this cell safe to re-run: after the first run
# `ocean_proximity` no longer exists in `df`, and asking get_dummies to encode
# it again would raise a KeyError.
if "ocean_proximity" in df.columns:
    df = pd.get_dummies(data=df, columns=["ocean_proximity"], prefix=["o"])

# Same 60/20/20 split and seed as in the classification part.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Regression target: natural logarithm of the original house value.
y_train = np.log(df_train.median_house_value.values)
y_val = np.log(df_val.median_house_value.values)
y_test = np.log(df_test.median_house_value.values)

# Remove the target and its binarised form from every feature frame.
for split_frame in (df_train, df_val, df_test):
    del split_frame['median_house_value']
    del split_frame['above_average']

In [34]:
from sklearn.linear_model import Ridge

In [38]:
# Collects (alpha, validation RMSE) pairs, one per regularisation strength.
res = []

alpha = [0, 0.01, 0.1, 1, 10]

for a in alpha:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(df_train, y_train)
    # Predict once and reuse the result; the original predicted on the
    # validation set twice and never read `y_pred`.
    y_pred = model.predict(df_val)
    rmse = np.sqrt(metrics.mean_squared_error(y_val, y_pred))
    res.append((a, np.round(rmse, 8)))

In [39]:
# Show the (alpha, validation RMSE) pairs collected above.
print(res)

[(0, 0.52406716), (0.01, 0.52406716), (0.1, 0.52406716), (1, 0.52406716), (10, 0.52406718)]
