In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12


In [2]:
import os
import tarfile
from six.moves import urllib

# Remote location of the dataset archive and the local directory it is unpacked into
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and its
            extracted contents; it is created if it does not exist.

    Returns:
        None. The archive is downloaded on every call, overwriting any
        previous ``housing.tgz`` in ``housing_path``.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only point housing_url at a source you trust.
        housing_tgz.extractall(path=housing_path)

In [3]:
# Download and unpack the archive using the default URL and target directory.
# The download is unconditional, so this cell needs network access on every run.
fetch_housing_data()

In [4]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [5]:
# Load housing.csv from the default dataset directory and preview the first rows.
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# Load the median_income / median_house_value file that df1 is built from.
# NOTE(review): the path is relative to the notebook's working directory, and
# nothing in the visible cells creates housing_univariate.csv — confirm it exists.
import pandas as pd
df1 = pd.read_csv('../tutorial/datasets/housing/housing_univariate.csv')
df1.head()

Unnamed: 0,median_income,median_house_value
0,8.3252,452600
1,8.3014,358500
2,7.2574,352100
3,5.6431,341300
4,3.8462,342200


In [7]:
# Print column dtypes, non-null counts and memory usage for df1.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 2 columns):
median_income         20640 non-null float64
median_house_value    20640 non-null int64
dtypes: float64(1), int64(1)
memory usage: 322.6 KB


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column of df1.
df1.describe()

Unnamed: 0,median_income,median_house_value
count,20640.0,20640.0
mean,3.870671,206855.816909
std,1.899822,115395.615874
min,0.4999,14999.0
25%,2.5634,119600.0
50%,3.5348,179700.0
75%,4.74325,264725.0
max,15.0001,500001.0


In [9]:
# Plot a 50-bin histogram for every column of df1 on a wide (20x5) figure.
%matplotlib inline
import matplotlib.pyplot as plt
df1.hist(bins=50, figsize=(20,5))
# Saving the figure is left disabled: save_fig is not defined in the visible cells.
#save_fig("attribute_histogram_plots")
plt.show()

NameError: name 'df' is not defined

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of df1 as a test set; the fixed random_state makes the split repeatable.
train_set, test_set = train_test_split(df1, test_size=0.2, random_state=42)

In [None]:
# Check which type train_test_split returned for the training portion.
type(train_set)

In [None]:
# Preview the first rows of the training split.
train_set.head()

In [None]:
# Preview the first rows of the test split.
test_set.head()

In [None]:
# Explore a copy of the training split so that changes to df do not affect train_set.
df = train_set.copy()

In [None]:
# Scatter plot of median_house_value against median_income for the training copy.
df.plot(kind="scatter", x="median_income", y="median_house_value")


In [None]:
# Same scatter drawn with alpha=0.1 (mostly transparent points) so overlapping
# points remain distinguishable.
df.plot(kind="scatter", x="median_income", y="median_house_value", alpha = 0.1)
# Fix the view to x in [0, 16] and y in [0, 550000].
plt.axis([0, 16, 0, 550000])

In [None]:
# Pairwise correlation between the columns of df.
corr_matrix = df.corr()

In [None]:
# Correlation of each column with median_house_value, largest first.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Split the training frame into a label array and a feature array.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; selecting
# with a list of column names and taking .values yields the same 2-D ndarray
# of shape (n_rows, 1), which is the layout passed to lin_reg.fit() below.
train_label = train_set[["median_house_value"]].values
train_data = train_set[["median_income"]].values
type(train_data)

In [None]:
# Build the test label and feature arrays the same way as the training arrays.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; the list
# selection keeps each result 2-D with shape (n_rows, 1).
test_label = test_set[["median_house_value"]].values
test_data = test_set[["median_income"]].values

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear model of median_house_value on median_income using the training arrays.
lin_reg = LinearRegression()
lin_reg.fit(train_data, train_label)



In [None]:
# Score of the fitted model on the data it was trained on (not a held-out estimate).
lin_reg.score(train_data, train_label)

In [None]:
# Fitted coefficient for the single feature, median_income.
lin_reg.coef_

In [None]:
# Fitted intercept term of the linear model.
lin_reg.intercept_

In [None]:
# Predict house values for the held-out test features.
housing_predictions = lin_reg.predict(test_data)

In [None]:
# Show the first five test-set predictions.
print("Predictions:",housing_predictions[0:5])

In [None]:
# Show the first five true test labels for comparison with the predictions.
print("Labels:", test_label[0:5])

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the test-set predictions: take the square root of
# the MSE so the value is in the same units as the labels.
lin_mse = mean_squared_error(test_label, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the test-set predictions.
lin_mae = mean_absolute_error(test_label, housing_predictions)
lin_mae