## Imports

In [80]:
# %load ../custom_tools.py
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def corrplot_(
    df=None,
    mask_type="numerical",
    figsize=(14, 14),
    fontsize=8,
    cpalette=(10, 220),
):
    """Draw a correlation heatmap whose redundant upper triangle holds labels.

    The lower triangle of ``df.corr()`` is rendered as a seaborn heatmap. The
    diagonal and upper triangle are masked out of the heatmap and then used as
    free space: feature names are written on the diagonal and the matching
    correlation values are written in the upper triangle.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose pairwise column correlations (``df.corr()``) are plotted.
    mask_type : str
        ``"numerical"`` writes each correlation formatted to three decimals;
        ``"categorical"`` writes the result of ``_value_to_category`` instead.
        Any other value leaves the upper triangle empty.
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.
    fontsize : int
        Font size of the overlaid text in ``"categorical"`` mode only; the
        ``"numerical"`` overlay uses matplotlib's default size.
    cpalette : tuple
        The two values passed, in order, as the first two arguments of
        ``sns.diverging_palette``.

    Returns
    -------
    None
        The plot is drawn on a newly created figure.

    Raises
    ------
    ReferenceError
        If ``df`` is ``None``.
    """

    if df is None:
        raise ReferenceError("\nDataFrame not found.")
    corr_data = df.corr()

    # True on the diagonal and above: those cells repeat information and are
    # hidden from the heatmap. The builtin ``bool`` is used because the
    # ``np.bool`` alias is no longer available in current NumPy releases.
    upper_triangle_mask = np.triu(np.ones(corr_data.shape, dtype=bool))

    # Generates MatPlotLib subplot objects
    fig, ax = plt.subplots(figsize=figsize)

    # Symmetric colour limits taken from the largest absolute correlation that
    # is actually displayed. Falls back to 1.0 when nothing is displayed
    # (single-column frame) or every displayed value is NaN.
    visible_values = np.abs(corr_data.values[~upper_triangle_mask])
    visible_values = visible_values[~np.isnan(visible_values)]
    vmax = visible_values.max() if visible_values.size else 1.0

    # Creates correlational heatmap with simple color intensity relative to distribution
    cmap = sns.diverging_palette(cpalette[0], cpalette[1], as_cmap=True)
    sns.heatmap(
        corr_data,
        mask=upper_triangle_mask,
        cmap=cmap,
        vmin=-vmax,
        vmax=vmax,
        square=True,
        linecolor="lightgray",
        linewidths=1,
        ax=ax,
    )

    # Overlays feature names and corr. data values over whitespace mask
    n_features = len(corr_data)
    for row, feature_name in enumerate(corr_data.columns):
        # Feature name on the diagonal cell; +0.5 centres the text in the cell.
        ax.text(
            row + 0.5,
            row + 0.5,
            feature_name,
            ha="center",
            va="center",
            rotation=45,
        )

        for col in range(row + 1, n_features):
            value = "{:.3f}".format(corr_data.values[row, col])

            if mask_type == "numerical":
                ax.text(
                    col + 0.5,
                    row + 0.5,
                    value,
                    ha="center",
                    va="center",
                )
            elif mask_type == "categorical":
                # NOTE(review): _value_to_category is not defined in this cell;
                # it must already exist in the namespace, and it receives the
                # formatted string rather than the float - confirm.
                ax.text(
                    col + 0.5,
                    row + 0.5,
                    _value_to_category(value),
                    ha="center",
                    va="center",
                    fontsize=fontsize,
                )
    ax.axis("off")


In [87]:
# CUSTOM FUNCTIONS
# import sys
# sys.path.insert(0, "../")
# from custom_tools import corrplot_

# ANALYSIS
import pandas as pd
import numpy as np

# VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns
# Allows charts to appear in notebook
%matplotlib inline

# MACHINE LEARNING
from sklearn.datasets import load_boston
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

## Data

In [88]:
# Load the Boston house prices dataset through scikit-learn's load_boston.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older pinned version - confirm the environment.
boston = load_boston()

# Print the description text shipped with the dataset object.
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [89]:
# Build the feature table with its column labels supplied at construction.
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# Show the resulting column index as the cell output.
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')

In [90]:
# Attach the dataset target to the feature table as the PRICE column.
df = df.assign(PRICE=boston.target)
# Preview the first three target values.
df["PRICE"].head(3)

0    24.0
1    21.6
2    34.7
Name: PRICE, dtype: float64

In [91]:
# Preview the first three rows of the combined table (features plus PRICE).
df.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7


## Data Analysis

## Correlation heatmap

In [92]:
# Plot the correlation heatmap for every column of df with default settings.
# NOTE(review): the recorded output of this cell is a NameError for `np`;
# re-run the cell that defines corrplot_ (and imports numpy) first.
corrplot_(df)

NameError: name 'np' is not defined

## Training the model

In [None]:
# Create the linear regression estimator; it is only instantiated here and is
# not fitted anywhere in this cell.
lr_reg = LinearRegression()

# TODO: split data into test & train
# TODO: train model
# TODO: predict

## Data Normalisation
- To improve prediction results

In [None]:
# Fit a MinMaxScaler (default settings) on df and transform it in one step.
# NOTE(review): df still contains the PRICE target at this point, so the target
# is scaled together with the features - confirm this is intended.
min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(df)
# Display the scaled values as the cell output.
X_minmax

## Line of best fit

In [None]:
# Use arbitrary coefficients (slope 0.3, intercept 0.3) for the predicted values.
# NOTE(review): X and Y are not defined in any visible cell; they must exist in
# the kernel before this cell runs.
predicted_y_values = [0.3 * x_value + 0.3 for x_value in X]
plt.scatter(X, Y)
plt.plot(X, predicted_y_values, c="r")
# Mean of the squared differences between each observed Y value and the value
# the line predicts for it.
squared_errors = [
    (observed - predicted) ** 2
    for observed, predicted in zip(Y, predicted_y_values)
]
error = sum(squared_errors) / len(Y)
print(error)