In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("housing.csv")
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
data.shape

(20640, 10)

In [4]:
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [5]:
data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
data['total_bedrooms'] = data.total_bedrooms.fillna(0)

In [7]:
data.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [8]:
data["rooms_per_household"] = data["total_rooms"] / data["households"]
data["bedrooms_per_room"] = data["total_bedrooms"] / data["total_rooms"]
data["population_per_household"] = data["population"] / data["households"]

### Question 1
What is the most frequent observation (mode) for the column ocean_proximity?

In [9]:
M = data['ocean_proximity'].mode()

print(f"The most frequent observation for the column ocean_proximity is, {M[0]}")

The most frequent observation for the column ocean_proximity is, <1H OCEAN


In [10]:
cols = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value','rooms_per_household',
       'bedrooms_per_room', 'population_per_household']

### Question 2

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
#Binarize our target data
data['above_average'] = (data.median_house_value > data.median_house_value.mean()).astype(int)

In [13]:
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household', 'above_average'],
      dtype='object')

In [14]:
# df_train 80% and df_test 20%
df_full_train , df_test = train_test_split(data, test_size = 0.2, random_state = 1)

In [15]:
df_full_train.shape, df_test.shape

((16512, 14), (4128, 14))

In [16]:
# get 20% from the df_full_train
df_train , df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

In [17]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [18]:
#reset the index for all the dataframes

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [19]:
df_val

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,-118.35,34.07,45.0,7803.0,2154.0,3359.0,2041.0,3.3594,287500.0,<1H OCEAN,3.823126,0.276048,1.645762,1
1,-117.25,32.82,23.0,6139.0,826.0,2036.0,807.0,9.5245,500001.0,NEAR OCEAN,7.607187,0.134550,2.522924,1
2,-121.54,40.06,17.0,858.0,262.0,47.0,27.0,2.4028,67500.0,INLAND,31.777778,0.305361,1.740741,0
3,-117.81,33.66,20.0,2851.0,490.0,1192.0,463.0,5.8752,274200.0,<1H OCEAN,6.157667,0.171870,2.574514,1
4,-117.39,33.97,48.0,1915.0,348.0,1060.0,376.0,3.4044,117900.0,INLAND,5.093085,0.181723,2.819149,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4123,-121.97,37.97,24.0,1330.0,183.0,656.0,205.0,5.0092,244100.0,INLAND,6.487805,0.137594,3.200000,1
4124,-119.18,34.18,31.0,2636.0,638.0,2695.0,614.0,3.2196,175800.0,NEAR OCEAN,4.293160,0.242033,4.389251,0
4125,-122.49,37.73,39.0,1937.0,336.0,742.0,307.0,5.1991,369400.0,NEAR OCEAN,6.309446,0.173464,2.416938,1
4126,-118.72,34.28,18.0,2229.0,371.0,1283.0,379.0,5.5955,217700.0,<1H OCEAN,5.881266,0.166442,3.385224,1


In [20]:
# get the target value

y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [21]:
# delete the target value from the train dataset

del df_train["median_house_value"]
del df_val["median_house_value"]
del df_test["median_house_value"]

In [22]:
df_train.shape, df_val.shape, df_test.shape

((12384, 13), (4128, 13), (4128, 13))

In [23]:
len(y_train), len(y_val), len(y_test)

(12384, 4128, 4128)

In [24]:
df_train.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'rooms_per_household', 'bedrooms_per_room',
       'population_per_household', 'above_average'],
      dtype='object')

In [25]:
numerical = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income','rooms_per_household', 'bedrooms_per_room',
       'population_per_household']

In [26]:
categorical = ["ocean_proximity"]

In [27]:
df_train[numerical].corr().mean()

longitude                   0.022101
latitude                   -0.021744
housing_median_age         -0.043580
total_rooms                 0.314010
total_bedrooms              0.319833
population                  0.304466
households                  0.312351
median_income               0.065669
rooms_per_household         0.076498
bedrooms_per_room           0.018405
population_per_household    0.094381
dtype: float64

### Question 3

In [28]:
from sklearn.metrics import mutual_info_score

In [29]:
round(mutual_info_score(df_train['ocean_proximity'], df_train['above_average']),2)

0.1

### Question 4

In [30]:
from sklearn.feature_extraction import DictVectorizer

In [31]:
dv = DictVectorizer(sparse = False)

#### Fitting the Train Data and Validation Data

In [32]:
train_dicts = df_train[categorical + numerical].to_dict(orient ='records')

In [33]:
val_dicts = df_val[categorical + numerical].to_dict(orient ='records')

In [34]:
X_train = dv.fit_transform(train_dicts)

In [35]:
X_val = dv.transform(val_dicts)

#### Training our Model

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [38]:
# check then weight
model.coef_[0].round(3)

array([ 0.237,  0.005,  0.037,  0.134,  0.094,  1.217,  0.51 , -1.81 ,
        0.045,  0.264,  0.903, -0.002,  0.004,  0.009,  0.   , -0.   ])

In [39]:
model.intercept_[0]

-0.08813883652599135

In [40]:
# Hard predictions
model.predict(X_train)

array([0, 0, 0, ..., 1, 0, 0])

In [41]:
# soft prediction meaning that is a score
model.predict_proba(X_val)[:, 1]

array([0.99458129, 0.99942016, 0.01864608, ..., 0.92911225, 0.77418057,
       0.62974903])

In [42]:
y_pred = model.predict(X_val)

In [43]:
y_pred

array([1, 1, 0, ..., 1, 1, 1])

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
accuracy_score(y_val, y_pred).round(2)

0.82

### Question 5

In [46]:
# without total_room
category_1 = ['longitude', 'latitude', 'housing_median_age','total_bedrooms','population', 
              'households', 'median_income','rooms_per_household', 'bedrooms_per_room','population_per_household']

In [47]:
# without total_bedrooms
category_2 = ['longitude', 'latitude', 'housing_median_age', 'total_rooms','population', 
              'households', 'median_income','rooms_per_household', 'bedrooms_per_room','population_per_household']

In [48]:
# without population
category_3 = ['longitude', 'latitude', 'housing_median_age', 'total_rooms','total_bedrooms', 
              'households', 'median_income','rooms_per_household', 'bedrooms_per_room','population_per_household']

In [49]:
# without households
category_4 = ['longitude', 'latitude', 'housing_median_age', 'total_rooms','total_bedrooms','population', 
              'median_income','rooms_per_household', 'bedrooms_per_room','population_per_household']