In [12]:
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Other Imports
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

## Load Data

In [18]:
# Load the Boston housing bunch and lay its feature matrix out as a DataFrame,
# one column per entry of `feature_names`.
boston = load_boston()
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)

# Two-level label derived from B: strictly above 396.80 -> "high", otherwise "low".
df['B2'] = np.where(df['B'] > 396.80, "high", "low")

# Append the target values as the last column, then preview the first rows.
df['y'] = boston.target
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,B2,y
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,high,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,high,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,low,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,low,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,high,36.2


## Dataset Explained

In [25]:
# Print the description text carried on the loaded dataset object (its DESCR attribute).
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

## Interactive Histogram

- Collect the column names and their descriptions as key-value pairs (key: column name, value: description).
- Add a drop-down to switch between variables; each option should show both the column name (key) and its description (value).

In [26]:
# Histogram of CRIM. The x-axis label and the title are both built from the
# same description text.
crim_description = 'per capita crime rate by town'
df['CRIM'].iplot(
    kind='hist',
    xTitle=crim_description,
    yTitle='count',
    title=crim_description + ' Distribution',
)



In [27]:
# Histogram of ZN. The x-axis label and the title are both built from the
# same description text.
zn_description = 'proportion of residential land zoned for lots over 25,000 sq.ft.'
df['ZN'].iplot(
    kind='hist',
    xTitle=zn_description,
    yTitle='count',
    title=zn_description + ' Distribution',
)



In [None]:
# Columns of df whose dtype is pandas `category`.
category_columns = df.select_dtypes(include=['category']).columns

# Map each categorical column name to its positional index within df.columns.
# Built in one pass with enumerate, so no loop counter or loop variable is
# left behind in the notebook namespace.
idx_dict = {
    column: position
    for position, column in enumerate(df.columns)
    if column in category_columns
}
    
@interact
def univariate(cat_column=category_columns):
    """Fit `simple_tree_clf` on a single categorical column and render the tree.

    The chosen column is pulled out of `train_X` by its position in
    `idx_dict`, missing entries are replaced with the string "missing", the
    result is one-hot encoded, and `simple_tree_clf` is fitted on it against
    `train_y`. The fitted tree is exported with `tree.export_graphviz` and
    returned wrapped in `Source`.

    NOTE(review): `interact`, `BaseEstimator`, `TransformerMixin`,
    `OneHotEncoder`, `simple_tree_clf`, `tree`, `Source`, `train_X` and
    `train_y` are not imported or defined in the visible cells - confirm they
    are provided elsewhere before running this cell.

    Parameters
    ----------
    cat_column
        Key into `idx_dict` naming the column to use. The default handed to
        `@interact` is `category_columns` (presumably turned into a selection
        widget - confirm).

    Returns
    -------
    The `Source(...)` object built from the exported graph description.
    """
    class GetFeatures(BaseEstimator, TransformerMixin):
        """Transformer that extracts the selected column as a 2-D column array."""

        def __init__(self):
            pass

        def fit(self, X, y=None):
            # Stateless: nothing to learn.
            return self

        def transform(self, X, y=None):
            # Assumes X is a 2-D array whose column order matches df.columns
            # (idx_dict positions are derived from df) - TODO confirm.
            value = X[:, idx_dict[cat_column]]
            # `x is np.nan` only matched the np.nan singleton itself; NaN values
            # that are distinct objects (float('nan'), np.float64 scalars) were
            # passed through unlabelled. pd.isnull tests the value instead and
            # also treats None as missing.
            value = ["missing" if pd.isnull(x) else x for x in value]
            # np.c_ turns the flat list into a single-column 2-D array.
            return np.c_[value]

    tree_train_X = GetFeatures().fit_transform(train_X)
    tree_train_X = OneHotEncoder().fit_transform(tree_train_X)
    simple_tree_clf.fit(tree_train_X, train_y)
    graph = tree.export_graphviz(simple_tree_clf, out_file=None)
    return Source(graph)

