In [91]:
# Numerical arrays
import numpy as np

In [92]:
# Data frames
import pandas as pd

In [93]:
# Read CSV data quickly into Pandas
from pandas import read_csv

In [94]:
# Customise display-related options
from pandas import set_option

In [95]:
# Explore trends in data
# https://www.marsja.se/pandas-scatter-matrix-pair-plot/
from pandas.plotting import scatter_matrix

In [96]:
# Plotting
import matplotlib.pyplot as plt

In [97]:
# pyplot is a module that is a simple and easy way to construct plots
from matplotlib import pyplot

In [98]:
# Allows matplotlib graphs to be included in the notebook next to the code
%matplotlib inline

In [99]:
# Fancier, statistical plots
import seaborn as sns

In [100]:
from sklearn.model_selection import train_test_split

In [101]:
from sklearn.linear_model import LinearRegression

In [102]:
from sklearn import metrics

## Linear Regression

Boston House Price Dataset
---------------------------

The Boston House Price dataset contains information about different houses in a Boston suburb or town drawn from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970.

This data was previously part of the UCI Machine Learning Repository and was built into the scikit-learn library. It has since been removed from the UCI Machine Learning Repository and is being removed from the scikit-learn library. This is due to ethical issues around assumptions made in creating the dataset and the validity of its purpose. More information on this is available here: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html.

An important lesson for anyone working with data is to trust but verify.

In [103]:
# Original loader, kept commented out for reference only: load_boston is
# deprecated in scikit-learn (and removed from version 1.2 onwards), so the
# data is read from a local CSV file instead.
# from sklearn.datasets import load_boston
# df_boston = load_boston()
# print(df_boston.DESCR)

To futureproof the functioning of this notebook, given the availability change, the dataset was downloaded and read into the notebook from a CSV file.

In [104]:
# Relative path to the local CSV copy of the Boston housing data.
filename = 'housing.csv'

In [105]:
# Column labels are defined in the same cell as the read so that the notebook
# runs top-to-bottom on a fresh kernel (the file itself has no header row).
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# The file is whitespace-delimited; sep=r'\s+' is the documented equivalent of
# the deprecated delim_whitespace=True flag.
dataset = read_csv(filename, sep=r'\s+', names=names)

Details of the dataset characteristics available as part of the UCI Machine Learning Repository and scikit-learn library are reproduced below.

**Data Set Characteristics**  

    Number of Instances: 506 

    Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    Missing Attribute Values: None

    Creator: Harrison, D. and Rubinfeld, D.L.

References

- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.

The version downloaded from http://lib.stat.cmu.edu/datasets/boston appears to have been processed. However, online research shows that other dataset versions are missing values. No missing values are recorded in this dataset.

For easier referencing later in the model, the attributes have been assigned short names matching those used by the UCI Machine Learning Repository and the scikit-learn library.

In [106]:
# Short column labels in file order: the 13 predictive attributes followed by
# the MEDV target.
names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

**Analyze Data**

An initial look at the loaded data.

Confirming that the dimensions of the dataset - the number of rows and columns - match the expected characteristics.

In [107]:
# Show the (rows, columns) tuple, then enforce the documented dimensions
# (506 instances x 14 attributes) instead of relying on a visual check.
print(dataset.shape)
assert dataset.shape == (506, 14), f"unexpected dataset shape: {dataset.shape}"

(506, 14)


The results show 506 data instances with 14 attributes.

Now that we know the size and shape of the data, let's look at the types of data associated with each attribute.

In [108]:
# List the dtype pandas inferred for each column.
print(dataset.dtypes)

CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object


All attributes are numeric, mostly floats, and two are integers.

The next step is to have a quick look at the data (the first 20 rows of the dataset) to see what it looks like. 

In [109]:
# A bare last expression uses the notebook's rich table display; print() falls
# back to plain text, which wraps a frame this wide into several column blocks.
dataset.head(20)

        CRIM    ZN  INDUS  CHAS   NOX    RM    AGE   DIS  RAD    TAX  PTRATIO  \
0   6.32e-03  18.0   2.31     0  0.54  6.58   65.2  4.09    1  296.0     15.3   
1   2.73e-02   0.0   7.07     0  0.47  6.42   78.9  4.97    2  242.0     17.8   
2   2.73e-02   0.0   7.07     0  0.47  7.18   61.1  4.97    2  242.0     17.8   
3   3.24e-02   0.0   2.18     0  0.46  7.00   45.8  6.06    3  222.0     18.7   
4   6.91e-02   0.0   2.18     0  0.46  7.15   54.2  6.06    3  222.0     18.7   
5   2.99e-02   0.0   2.18     0  0.46  6.43   58.7  6.06    3  222.0     18.7   
6   8.83e-02  12.5   7.87     0  0.52  6.01   66.6  5.56    5  311.0     15.2   
7   1.45e-01  12.5   7.87     0  0.52  6.17   96.1  5.95    5  311.0     15.2   
8   2.11e-01  12.5   7.87     0  0.52  5.63  100.0  6.08    5  311.0     15.2   
9   1.70e-01  12.5   7.87     0  0.52  6.00   85.9  6.59    5  311.0     15.2   
10  2.25e-01  12.5   7.87     0  0.52  6.38   94.3  6.35    5  311.0     15.2   
11  1.17e-01  12.5   7.87   

The values associated with each attribute vary widely (e.g. RAD 1-5 and CRIM 1.05-9.38).

To get a better feel for the ranges of these distributions, we can request the statistical descriptions of each attribute. Setting the display precision will make the floats (the primary data type) easier to read.

References: https://towardsdatascience.com/8-commonly-used-pandas-display-options-you-should-know-a832365efa95

In [110]:
# Use the fully qualified option key: the bare 'precision' shorthand is
# deprecated and is rejected by newer pandas releases.
set_option('display.precision', 2)

# Summary statistics (count, mean, std, min, quartiles, max) per column,
# shown via the notebook's rich table display.
dataset.describe()

           CRIM      ZN   INDUS    CHAS     NOX      RM     AGE     DIS  \
count  5.06e+02  506.00  506.00  506.00  506.00  506.00  506.00  506.00   
mean   3.61e+00   11.36   11.14    0.07    0.55    6.28   68.57    3.80   
std    8.60e+00   23.32    6.86    0.25    0.12    0.70   28.15    2.11   
min    6.32e-03    0.00    0.46    0.00    0.39    3.56    2.90    1.13   
25%    8.20e-02    0.00    5.19    0.00    0.45    5.89   45.02    2.10   
50%    2.57e-01    0.00    9.69    0.00    0.54    6.21   77.50    3.21   
75%    3.68e+00   12.50   18.10    0.00    0.62    6.62   94.07    5.19   
max    8.90e+01  100.00   27.74    1.00    0.87    8.78  100.00   12.13   

          RAD     TAX  PTRATIO       B   LSTAT    MEDV  
count  506.00  506.00   506.00  506.00  506.00  506.00  
mean     9.55  408.24    18.46  356.67   12.65   22.53  
std      8.71  168.54     2.16   91.29    7.14    9.20  
min      1.00  187.00    12.60    0.32    1.73    5.00  
25%      4.00  279.00    17.40  375.38 

Again, there is wide variation in the statistical values for each attribute (e.g. CRIM min = 6.32e-03 and TAX min = 187.00).

Finally, we can look at the correlation between the attribute columns for this initial analysis.
The Pearson method is used here. This is the standard correlation coefficient - it measures the strength of a linear association between two variables.

Reference: https://towardsdatascience.com/pearson-coefficient-of-correlation-explained-369991d93404

In [111]:
# Use the fully qualified option key: the bare 'precision' shorthand is
# deprecated and is rejected by newer pandas releases.
set_option('display.precision', 2)

# Pairwise Pearson correlation between all columns, shown via the notebook's
# rich table display.
dataset.corr(method='pearson')

         CRIM    ZN  INDUS      CHAS   NOX    RM   AGE   DIS       RAD   TAX  \
CRIM     1.00 -0.20   0.41 -5.59e-02  0.42 -0.22  0.35 -0.38  6.26e-01  0.58   
ZN      -0.20  1.00  -0.53 -4.27e-02 -0.52  0.31 -0.57  0.66 -3.12e-01 -0.31   
INDUS    0.41 -0.53   1.00  6.29e-02  0.76 -0.39  0.64 -0.71  5.95e-01  0.72   
CHAS    -0.06 -0.04   0.06  1.00e+00  0.09  0.09  0.09 -0.10 -7.37e-03 -0.04   
NOX      0.42 -0.52   0.76  9.12e-02  1.00 -0.30  0.73 -0.77  6.11e-01  0.67   
RM      -0.22  0.31  -0.39  9.13e-02 -0.30  1.00 -0.24  0.21 -2.10e-01 -0.29   
AGE      0.35 -0.57   0.64  8.65e-02  0.73 -0.24  1.00 -0.75  4.56e-01  0.51   
DIS     -0.38  0.66  -0.71 -9.92e-02 -0.77  0.21 -0.75  1.00 -4.95e-01 -0.53   
RAD      0.63 -0.31   0.60 -7.37e-03  0.61 -0.21  0.46 -0.49  1.00e+00  0.91   
TAX      0.58 -0.31   0.72 -3.56e-02  0.67 -0.29  0.51 -0.53  9.10e-01  1.00   
PTRATIO  0.29 -0.39   0.38 -1.22e-01  0.19 -0.36  0.26 -0.23  4.65e-01  0.46   
B       -0.39  0.18  -0.36  4.88e-02 -0.

# References

In [112]:
# [1](https://scikit-learn.org/stable/tutorial/index.html)

***

# End