In [105]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline 
sns.set(style="ticks")

In [94]:
# Read the housing CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv(filepath_or_buffer='datasets/housing.csv')

In [95]:
# Preview the first five rows (five is the default for `head`).
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [96]:
# Count missing values per column in one vectorised pass, then print
# one "<column> - <count>" line for each column.
for column, missing in dataset.isnull().sum().items():
    print(column, "-", missing)

longitude - 0
latitude - 0
housing_median_age - 0
total_rooms - 0
total_bedrooms - 207
population - 0
households - 0
median_income - 0
median_house_value - 0
ocean_proximity - 0


In [97]:
# Display the dtype pandas assigned to each column of `dataset`.
dataset.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
median_house_value      int64
ocean_proximity        object
dtype: object

In [98]:
# For every column that has missing values, print the column name with its
# missing-value count, followed by the rows where that column is null.
for column in dataset.columns:
    missing_mask = dataset[column].isnull()
    null_values = int(missing_mask.sum())
    if null_values:
        print(column, ' - ', null_values)
        print(dataset[missing_mask])

total_bedrooms  -  207
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
290      -122.16     37.77                  47         1256             NaN   
341      -122.17     37.75                  38          992             NaN   
538      -122.28     37.78                  29         5154             NaN   
563      -122.24     37.75                  45          891             NaN   
696      -122.10     37.69                  41          746             NaN   
...          ...       ...                 ...          ...             ...   
20267    -119.19     34.20                  18         3620             NaN   
20268    -119.18     34.19                  19         2393             NaN   
20372    -118.88     34.17                  15         4260             NaN   
20460    -118.75     34.29                  17         5512             NaN   
20484    -118.72     34.28                  17         3051             NaN   

       population  household

In [99]:
# Fill the missing `total_bedrooms` entries with the column mean,
# truncated to a whole number by `int()`.
average = int(dataset["total_bedrooms"].mean())
print('average = ', average)
dataset["total_bedrooms"] = dataset["total_bedrooms"].fillna(average)

average =  537


In [100]:
# Re-check missing values per column after imputation: one vectorised
# count, printed as "<column> - <count>".
for column, missing in dataset.isnull().sum().items():
    print(column, "-", missing)

longitude - 0
latitude - 0
housing_median_age - 0
total_rooms - 0
total_bedrooms - 0
population - 0
households - 0
median_income - 0
median_house_value - 0
ocean_proximity - 0


In [101]:
# Print the distinct `ocean_proximity` categories in sorted order.
print(sorted(dataset["ocean_proximity"].drop_duplicates()))

['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']


In [102]:
# One-hot encode the categorical `ocean_proximity` column.
# `dtype` is pinned to uint8 because pandas >= 2.0 defaults to bool indicator
# columns, which display as True/False and are left out of the default
# `describe()` summary; uint8 keeps the 0/1 integer indicators on every version
# (it was the default before pandas 2.0).
# NOTE: this cell consumes `ocean_proximity`, so re-running it without first
# reloading `dataset` fails because the column no longer exists.
dataset = pd.get_dummies(dataset, columns=["ocean_proximity"], dtype="uint8")

In [103]:
# Preview the first five rows (five is the default for `head`).
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,0,1,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,0,1,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,0,1,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,0,0,1,0


In [104]:
# Display pandas' descriptive statistics (count, mean, std, quartiles, min/max)
# for `dataset`.
dataset.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.861822,1425.476744,499.53968,3.870671,206855.816909,0.442636,0.317393,0.000242,0.11095,0.128779
std,2.003532,2.135952,12.585558,2181.615252,419.266601,1132.462122,382.329753,1.899822,115395.615874,0.49671,0.465473,0.015563,0.314077,0.334963
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,0.0,0.0,0.0,0.0,0.0
25%,-121.8,33.93,18.0,1447.75,297.0,787.0,280.0,2.5634,119600.0,0.0,0.0,0.0,0.0,0.0
50%,-118.49,34.26,29.0,2127.0,438.0,1166.0,409.0,3.5348,179700.0,0.0,0.0,0.0,0.0,0.0
75%,-118.01,37.71,37.0,3148.0,643.25,1725.0,605.0,4.74325,264725.0,1.0,1.0,0.0,0.0,0.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0,1.0,1.0,1.0,1.0,1.0


In [115]:
# Scale the `population` column to unit L2 norm. The column is passed as a
# single row of shape (1, n_rows), so the result keeps that (1, n_rows) shape.
# NOTE(review): this divides every value by the norm of the whole column; it is
# not min-max or standard scaling. Confirm this is the intended transformation.
array = preprocessing.normalize(
    dataset["population"].to_numpy().reshape(1, -1), norm="l2", axis=1
)

In [122]:
# Write the normalized values back over `population`; `array` holds a single
# row, so flattening it yields one value per dataset row.
dataset["population"] = array.ravel()

In [123]:
# Preview the first five rows (five is the default for `head`).
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,0.001231,126,8.3252,452600,0,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,0.00918,1138,8.3014,358500,0,0,0,1,0
2,-122.24,37.85,52,1467,190.0,0.001896,177,7.2574,352100,0,0,0,1,0
3,-122.25,37.85,52,1274,235.0,0.002133,219,5.6431,341300,0,0,0,1,0
4,-122.25,37.85,52,1627,280.0,0.00216,259,3.8462,342200,0,0,0,1,0
