In [89]:
import pandas as pd
import numpy as np
import math

In [90]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [91]:
df = pd.read_csv('./housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [92]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

#
## Data Preparation

In [93]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [94]:
df.total_bedrooms = df.total_bedrooms.fillna(df.total_bedrooms.median())

In [95]:
df['rooms_per_household'] = df.total_rooms / df.households 

In [96]:
df['bedrooms_per_room'] = df.total_bedrooms / df.total_rooms
df['population_per_household'] = df.population / df.households 

In [97]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.802260
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,5.045455,0.224625,2.560606
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,6.114035,0.215208,3.122807
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,5.205543,0.215173,2.325635
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,5.329513,0.219892,2.123209


#### 1. What is the most frequent observation (mode) for the column?

In [132]:
df.ocean_proximity.mode() # Ans: < 1H Ocean

0    <1H OCEAN
Name: ocean_proximity, dtype: object

### Question 2
1. Create the correlation matrix for the numerical features of your train dataset. </br>
2. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.</br>
What are the two features that have the biggest correlation in this dataset?

In [99]:
numerical = list(df.dtypes[df.dtypes != 'object'].index)

In [100]:
from IPython.display import display 

In [101]:
for feature in numerical:
    corr = df[numerical].corrwith(df[feature])
    corr = corr.sort_values(ascending=False).to_frame(name='correlation '+feature)
    display(corr.iloc[1:2])

Unnamed: 0,correlation longitude
population,0.099773


Unnamed: 0,correlation latitude
rooms_per_household,0.106389


Unnamed: 0,correlation housing_median_age
bedrooms_per_room,0.135622


Unnamed: 0,correlation total_rooms
total_bedrooms,0.927058


Unnamed: 0,correlation total_bedrooms
households,0.974366


Unnamed: 0,correlation population
households,0.907222


Unnamed: 0,correlation households
total_bedrooms,0.974366


Unnamed: 0,correlation median_income
median_house_value,0.688075


Unnamed: 0,correlation median_house_value
median_income,0.688075


Unnamed: 0,correlation rooms_per_household
median_income,0.326895


Unnamed: 0,correlation bedrooms_per_room
housing_median_age,0.135622


Unnamed: 0,correlation population_per_household
population,0.069863


The two features that have the biggest correlation in this dataset is of total_bedrooms and households

<h3>Making median_house_value binary</h3>
</br>
<li>We need to turn the median_house_value variable from numeric into binary.</li>
<li>Creating a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.</li>

In [102]:
mean_median_house_value = round(df.median_house_value.mean(),2)

In [103]:
df['above_average']= (df.median_house_value > mean_median_house_value).astype(int)

<h3> Splitting the data </h3>

In [104]:
df_train_full, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [105]:
df_train, df_val = train_test_split(df_train_full, test_size = 0.25, random_state = 42)

In [106]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [107]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

df_train_full = df_train_full.reset_index(drop = True)

In [108]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [109]:
df_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,-119.67,34.43,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1H OCEAN,3.922460,0.259714,3.754011,1
1,-118.32,33.74,24.0,6097.0,794.0,2248.0,806.0,10.1357,NEAR OCEAN,7.564516,0.130228,2.789082,1
2,-121.62,39.13,41.0,1317.0,309.0,856.0,337.0,1.6719,INLAND,3.908012,0.234624,2.540059,0
3,-118.63,34.24,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1H OCEAN,5.201093,0.194158,2.059016,1
4,-122.30,37.52,38.0,2769.0,387.0,994.0,395.0,5.5902,NEAR OCEAN,7.010127,0.139762,2.516456,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12379,-118.29,33.79,16.0,1867.0,571.0,951.0,498.0,3.3427,<1H OCEAN,3.748996,0.305838,1.909639,0
12380,-121.34,38.04,16.0,3295.0,565.0,2279.0,576.0,3.6083,INLAND,5.720486,0.171472,3.956597,0
12381,-116.99,32.74,18.0,3341.0,611.0,1952.0,602.0,3.9844,<1H OCEAN,5.549834,0.182879,3.242525,1
12382,-117.87,33.84,16.0,1545.0,354.0,730.0,350.0,4.5112,<1H OCEAN,4.414286,0.229126,2.085714,0


<h3>Question 3</h3>
</br>
<li>Calculate the mutual information score between above_average and ocean_proximity . Use the training set only.</li>
<li>Round it to 2 decimals using round(score, 2)</li>
<li>What is their mutual information score?</li>

In [110]:
df_mutual_info = mutual_info_score(df_train.above_average, df_train.ocean_proximity)
df_mutual_info = round(df_mutual_info, 2)

In [111]:
df_mutual_info

0.1

In [112]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

Their mutual information score is 0.1

<h3>Question 4</h3>

In [None]:
numerical = list(df_train.dtypes[df_train.dtypes != 'object'].index)

categorical = list(df.dtypes[df.dtypes == 'object'].index)

In [115]:
# one hot encoding
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')

dv = DictVectorizer(sparse = False)

dv.fit(train_dicts)

list(dv.get_feature_names_out())

X_train = dv.transform(train_dicts)

In [120]:
# Fit the model on the training dataset.
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [121]:
val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_pred = model.predict(X_val)

In [122]:
accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(accuracy)

0.84


#
## Question 5

In [123]:
# applying the feature elimination technique

orignal_score = accuracy

features = categorical + numerical

# excluding each feature from the set features and training the model without it. Recording the accuracy for each model.
for f in features:
    subset = features.copy()
    subset.remove(f)
    
    train_dict = df_train[subset].to_dict(orient='records')
    
    dv = DictVectorizer(sparse = False)
    dv.fit(train_dict)
    
    X_train = dv.transform(train_dict)
    
    model = LogisticRegression(solver = 'liblinear', C = 1.0, max_iter = 1000, random_state=1)
    model.fit(X_train, y_train)
    
    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict(X_val)
    
    #calculating the difference between the original accuracy and the accuracy without the feature.
    score = accuracy_score(y_val, y_pred)
    print(f, orignal_score - score, score)

ocean_proximity 0.01974806201550383 0.8202519379844961
longitude 0.006908914728682158 0.8330910852713178
latitude 0.005939922480620141 0.8340600775193798
housing_median_age 0.009089147286821642 0.8309108527131783
total_rooms 0.003517441860465098 0.8364825581395349
total_bedrooms 0.004728682170542564 0.8352713178294574
population 0.013691860465116279 0.8263081395348837
households 0.006666666666666599 0.8333333333333334
median_income 0.053662790697674434 0.7863372093023255
rooms_per_household 0.003517441860465098 0.8364825581395349
bedrooms_per_room 0.003759689922480547 0.8362403100775194
population_per_household 0.0042441860465115555 0.8357558139534884


The smallest difference is of total_rooms

#### Question 6
Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.</br>
This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]</br></br>
Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

In [124]:
# Applying the logarithmic transformation to median_house_value column).
df['median_house_value'] = np.log1p(df['median_house_value'])

In [125]:
df_train_full, df_test = train_test_split(df, test_size = 0.2, random_state = 42)
df_train, df_val = train_test_split(df_train_full, test_size = 0.25, random_state=42)

In [126]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [127]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

In [128]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

### Ridge Regression

In [129]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [130]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [131]:
for a in  [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver = 'sag', max_iter=10,random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(a, round(score, 4))

0 0.544
0.01 0.544
0.1 0.544
1 0.544
10 0.544




The 0 alpha value leads to the best RMSE on the validation set.