In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("housing.csv")

In [3]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']

In [4]:
df = df[features]
df = df.fillna(0)
df['rooms_per_household'] = df.total_rooms / df.households
df['bedrooms_per_room'] = df.total_bedrooms / df.total_rooms
df['population_per_household'] = df.population / df.households

In [5]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


### Question 1

In [6]:
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

### Question 2

In [7]:
df_train, df_test = train_test_split(df, test_size=0.4,random_state=42)

In [8]:
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)

In [9]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [10]:
cor = df_train.corr() - np.eye(len(df_train.columns[df_train.dtypes != "object"]))

In [11]:
pd.concat([cor.max(),cor.idxmax()], axis = 1).sort_values(0, ascending = False)

Unnamed: 0,0,1
total_bedrooms,0.982014,households
households,0.982014,total_bedrooms
total_rooms,0.928249,total_bedrooms
population,0.905168,households
median_income,0.323443,rooms_per_household
rooms_per_household,0.323443,median_income
housing_median_age,0.13317,bedrooms_per_room
bedrooms_per_room,0.13317,housing_median_age
latitude,0.110749,rooms_per_household
longitude,0.102873,bedrooms_per_room


In [12]:
y_mean = np.mean(df.median_house_value)

In [13]:
above_average = (y_train > y_mean).astype(int)

### Question 3

In [14]:
from sklearn.metrics import mutual_info_score

In [15]:
round(mutual_info_score(df_train['ocean_proximity'], above_average), 2)

0.1

### Question 4

In [16]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [17]:
dv = DictVectorizer(sparse = False)

In [18]:
train_dict = df_train.to_dict(orient = 'records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient = 'records')
X_val = dv.transform(val_dict)

In [19]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [20]:
model.fit(X_train, above_average)

In [29]:
accuracy = np.mean(model.predict(X_val) == (y_val > y_mean).astype(int))

In [30]:
round(accuracy, 2)

0.83

### Question 5

In [44]:
features_to_exclude = dv.get_feature_names_out()
accuracy_dif = {}

In [46]:
for feature in range(len(features_to_exclude)):
    model.fit(np.delete(X_train, feature , axis = 1), above_average)
    y_pred = model.predict(np.delete(X_val, feature, axis = 1))
    accuracy_dif['%s'%features_to_exclude[feature]] = round(accuracy - np.mean(y_pred == (y_val > y_mean).astype(int)) , 4)

In [47]:
accuracy_dif

{'bedrooms_per_room': 0.0004844961240310086,
 'households': 0.0031492248062016115,
 'housing_median_age': 0.006782945736434121,
 'latitude': 0.002422480620155043,
 'longitude': 0.01065891472868219,
 'median_income': 0.04142441860465118,
 'ocean_proximity=<1H OCEAN': 0.001211240310077577,
 'ocean_proximity=INLAND': 0.0007267441860465684,
 'ocean_proximity=ISLAND': 0.0014534883720930258,
 'ocean_proximity=NEAR BAY': -0.0004844961240310086,
 'ocean_proximity=NEAR OCEAN': -0.002422480620155043,
 'population': 0.007751937984496138,
 'population_per_household': -0.0002422480620154488,
 'rooms_per_household': 0.0007267441860465684,
 'total_bedrooms': 0.00024224806201555982,
 'total_rooms': -0.0009689922480620172}

In [51]:
print(accuracy_dif['total_rooms'])
print(accuracy_dif['total_bedrooms']) #smallest one
print(accuracy_dif['population'])
print(accuracy_dif['households'])

-0.0009689922480620172
0.00024224806201555982
0.007751937984496138
0.0031492248062016115


### Question 6

In [52]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as RMSE

In [54]:
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)
y_test = np.log1p(y_test)

alpha = [0, 0.01, 0.1, 1, 10]
for a in alpha:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(RMSE(y_val, y_pred), a)
    

0.001769337815678853 0
0.0017693378157117687 0.01
0.0017693378160246144 0.1
0.0017693378191695082 1
0.0017693378505771722 10
