# Steps of Exploratory Data Analysis
1. Describe the dataset (Eithar) - this will also be part of the README so it overlaps with your other task
2. Load the dataset (Samantha)
3. Explore the dataset (Samantha) - to get going on visualizations
4. Initial Thoughts (together) - I think we should make a draft that we both contribute to
5. Wrangling 
6. Research Questions
7. Data Analysis and visualizations (Samantha) - this is my part of the proposal so I think I should do this, and you can add additional tables/visualizations as you see fit
8. Summary and Conclusions (together) - Again, both contributing

In [1]:
# Imports
import numpy as np
import pandas as pd
import altair as alt

### Step 2: Load the Dataset

In [6]:
# Read the housing CSV straight from the handson-ml GitHub repository;
# the `?raw=true` query makes GitHub serve the file itself rather than an HTML page.
housing_csv_url = 'https://github.com/ageron/handson-ml/blob/master/datasets/housing/housing.csv?raw=true'
data = pd.read_csv(housing_csv_url)

# Preview the first five rows
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
# Summary statistics for every column, non-numeric ones included
data.describe(include="all")

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0,20640
unique,,,,,,,,,,5
top,,,,,,,,,,<1H OCEAN
freq,,,,,,,,,,9136
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909,
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874,
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0,
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0,
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0,


-118.49

### Step 3: Explore the Dataset

In [52]:
# Report how many records the frame holds.
# len(data) counts rows. data.count().min() would instead give the non-null
# count of the sparsest column, which under-reports the number of records
# whenever a column (here total_bedrooms) still has missing values.
print("There are {0} records in this dataset. Each record is a census block group.\n".format(
    len(data)))

# One sentence template per numeric column. The placeholders are filled, in
# order, with: minimum (as int), maximum (as int), mean rounded to two
# decimals, and median.
summary_templates = [
    ('housing_median_age',
     "The median age of houses/complexes in census blocks ranges from {0} to {1} with a mean of {2} years old and a median of {3} years old.\n"),
    ('total_rooms',
     "The total number of rooms in a census block ranges from {0} to {1} with a mean of {2} rooms and a median of {3} rooms.\n"),
    ('total_bedrooms',
     "The number of bedrooms in a census block ranges from {0} to {1} with a mean of {2} bedrooms and a median of {3} bedrooms.\n"),
    ('population',
     "The population of a census block ranges from {0} to {1} with a mean of {2} and a median of {3}.\n"),
    ('households',
     "The number of households in a census block ranges from {0} to {1} with a mean of {2} and a median of {3}.\n"),
]

# Print the sentences in the order listed above.
for column_name, template in summary_templates:
    column_values = data[column_name]
    print(template.format(
        column_values.min().astype('int'),
        column_values.max().astype('int'),
        round(column_values.mean(), 2),
        column_values.median()))



There are 20433 records in this dataset. Each record is a census block group.

The median age of houses/complexes in census blocks ranges from 1 to 52 with a mean of 28.63 years old and a median of 29.0 years old.

The total number of rooms in a census block ranges from 2 to 39320 with a mean of 2636.5 rooms and a median of 2127.0 rooms.

The number of bedrooms in a census block ranges from 1 to 6445 with a mean of 537.87 bedrooms and a median of 435.0 bedrooms.

The population of a census block ranges from 3 to 35682 with a mean of 1424.95 and a median of 1166.0.

The number of households in a census block ranges from 1 to 6082 with a mean of 499.43 and a median of 409.0.



### Step 5: Wrangling

In [8]:
# total_bedrooms has no value for 207 of the records;
# discard every row that contains a missing value
data = data.dropna(axis=0, how='any')



Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0
