In [72]:
import numpy as np
import pandas as pd


In [96]:
def load_data(url="https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"):
    '''
    Loads the Boston housing data into a DataFrame.

    :param url: location of the whitespace separated, header-less file containing Boston housing data
    :return: Raw pd.df containing Boston housing data, one named column per variable listed below
    '''
    ### Checked against the means in Harrison (1978) and the first record in Belsley (1980); no discrepancies found
    # CRIM - crime rate (per capita)
    # ZN - proportion of large-lot zoning (zones where housing at or below 25000 sqft is prohibited)
    # INDUS - proportion of industrial zones (pollution level)
    # CHAS - dummy variable (= 1 if bordering the Charles River, = 0 otherwise)
    # NOX - nitric oxide concentration (unit : 10pphm, parts per 10 million)
    # RM - number of rooms per household (property quality)
    # AGE - proportion of old owner-occupied homes (1940 as the cutoff, related to structural quality)
    # DIS - distance to employment centers (commuting distance, weighted mean, log)
    # RAD - highway accessibility (pollution level, commuting distance, log)
    # TAX - property tax rate
    # PTRATIO - pupil-teacher ratio
    # B - Black population proportion (1000(Bk - 0.63)^2)
    # LSTAT - % lower-status population
    # MEDV - median value of owner-occupied homes (unit $1000)
    ### Variable grouping
    # Dependent variable : MEDV
    # Structural : RM, AGE
    # Neighborhood : B, LSTAT, CRIM, ZN, INDUS, TAX, PTRATIO, CHAS
    # Accessibility : DIS, RAD
    # Air Pollution : NOX, PART (PART is not among the columns loaded below)

    names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']
    # sep=r'\s+' splits on runs of whitespace; it replaces the deprecated
    # delim_whitespace=True keyword and parses the file the same way.
    boston_housing_dataframe = pd.read_csv(url, names=names, header=None, sep=r'\s+')
    return boston_housing_dataframe

# Load the raw data from the default URL and show per-column summary statistics.
df = load_data()
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [180]:
# Display the raw frame as loaded.
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [151]:
def preprocess_features(boston_housing_dataframe):
    '''
    Splits the raw frame into predictor columns and the MEDV target.

    The input frame is left untouched; both returned objects are new.

    :param boston_housing_dataframe: Raw pd.df containing Boston housing data
    :return: df features, df targets
    '''
    target_name = 'MEDV'
    targets = boston_housing_dataframe[target_name].copy()
    features = boston_housing_dataframe.drop(columns=[target_name])

    ## Candidate transformations, currently disabled
    # Belsley (1980):
    #   features['NOXSQ'] = np.power(features['NOX'],2)
    #   features['LMV'] = np.log(features['MEDV'])
    #   (the source mentions that large errors change the value of the NOXSQ coefficient)
    # Harrison (1978):
    #   features['RMSQ'] = np.power(features['RM'],2)

    return features, targets

def cut_data(df, ratio_list):
    '''
    Cuts df row-wise into consecutive pieces.

    Each entry of ratio_list is the fraction of rows given to one piece, in order;
    whatever rows remain after the last entry form the final piece.

    :param df: pd.df or pd.Series to cut (rows are taken in their current order)
    :param ratio_list: non-negative fractions whose sum is strictly below 1
    :return: [df1, df2, df3, ...] with size=len(ratio_list)+1
    :raises ValueError: if a ratio is negative or the ratios sum to 1 or more
    '''
    ratios = np.asarray(ratio_list, dtype=float)
    if (ratios < 0).any() or not ratios.sum() < 1:
        raise ValueError('invalid ratio_list')

    n_rows = df.shape[0]
    # Cumulative fractions -> absolute row offsets (fractional rows are truncated).
    cuts = (ratios.cumsum() * n_rows).astype(int).tolist()
    # Pair each start offset with the next one; the last piece runs to the end.
    bounds = zip([0] + cuts, cuts + [n_rows])
    return [df.iloc[start:stop] for start, stop in bounds]

# Separate the predictor columns from the MEDV target.
df_feature, df_targets = preprocess_features(df)

# Cut features and targets with the same ratios so the pieces stay row-aligned:
# first 50% of rows -> train, next 20% -> test, remaining rows -> validate.
# NOTE(review): cut_data slices rows in file order without shuffling — confirm
# that the row order carries no structure before relying on these splits.
df_feature_train, df_feature_test, df_feature_validate = cut_data(df_feature,[0.5,0.2])
df_targets_train, df_targets_test, df_targets_validate = cut_data(df_targets,[0.5,0.2])

In [179]:
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as spc

def plot_corr(df, cluster=False):
    '''
    Builds a colour-graded correlation matrix of the columns of df.

    :params df: pd.df
    :params cluster: if True, reorders rows and columns so that columns falling in the
        same hierarchical cluster (complete linkage on the correlation rows) sit together. bool
    :returns: correlation matrix. pandas.io.formats.style.Styler object
    '''
    corr = df.corr()

    # Clustering needs at least two columns; with fewer, pdist is empty and has no max.
    if cluster and corr.shape[0] > 1:
        pdist = spc.distance.pdist(corr)
        linkage = spc.linkage(pdist, method='complete')
        # Cut the dendrogram at half of the largest pairwise distance.
        idx = spc.fcluster(linkage, 0.5 * pdist.max(), 'distance')
        # Order by cluster id, then by name; use the columns of the matrix being
        # reordered rather than those of some other frame.
        new_index = [name for _, name in sorted(zip(idx, corr.columns))]
        corr = corr.reindex(new_index)[new_index]

    # Styler.format with a format string shows two decimals; it stands in for
    # Styler.set_precision, which newer pandas releases no longer provide.
    # Optionally chain .set_properties(**{'font-size': '0pt'}) to hide the cell text.
    return corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

# Show the clustered correlation matrix of the training features.
plot_corr(df_feature_train, cluster=True)

Unnamed: 0,B,DIS,RM,ZN,AGE,CRIM,INDUS,LSTAT,NOX,TAX,CHAS,PTRATIO,RAD
B,1.0,0.26,0.17,0.15,-0.23,-0.52,-0.31,-0.2,-0.45,-0.25,-0.06,0.15,-0.07
DIS,0.26,1.0,0.16,0.62,-0.66,-0.48,-0.66,-0.35,-0.7,-0.34,-0.19,-0.02,0.02
RM,0.17,0.16,1.0,0.29,-0.23,-0.25,-0.36,-0.71,-0.35,-0.15,-0.03,-0.23,0.09
ZN,0.15,0.62,0.29,1.0,-0.52,-0.28,-0.43,-0.35,-0.42,-0.06,-0.14,-0.32,-0.14
AGE,-0.23,-0.66,-0.23,-0.52,1.0,0.48,0.54,0.55,0.63,0.3,0.19,0.09,0.09
CRIM,-0.52,-0.48,-0.25,-0.28,0.48,1.0,0.56,0.39,0.79,0.43,0.17,-0.28,0.13
INDUS,-0.31,-0.66,-0.36,-0.43,0.54,0.56,1.0,0.48,0.73,0.45,0.16,0.03,-0.02
LSTAT,-0.2,-0.35,-0.71,-0.35,0.55,0.39,0.48,1.0,0.5,0.2,0.11,0.2,-0.05
NOX,-0.45,-0.7,-0.35,-0.42,0.63,0.79,0.73,0.5,1.0,0.47,0.2,-0.2,0.09
TAX,-0.25,-0.34,-0.15,-0.06,0.3,0.43,0.45,0.2,0.47,1.0,0.03,-0.05,0.31


In [98]:
def train_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets,
    target_column="median_house_value",
    periods=10):
    """Trains a linear regression model.

    In addition to training, this function also prints training progress information,
    as well as a plot of the training and validation loss over time.

    NOTE(review): relies on `tf`, `my_input_fn` and `construct_feature_columns`
    being defined elsewhere; `tf.train` / `tf.contrib` look like TensorFlow 1.x
    APIs — confirm the installed version.

    Args:
        learning_rate: A `float`, the learning rate.
        steps: A non-zero `int`, the total number of training steps. A training step
            consists of a forward and backward pass using a single batch.
        batch_size: A non-zero `int`, the batch size.
        training_examples: A `DataFrame` containing one or more columns to use as
            input features for training.
        training_targets: A `DataFrame` containing the `target_column` column, or
            a `Series` holding the target values, for training.
        validation_examples: A `DataFrame` containing one or more columns to use as
            input features for validation.
        validation_targets: A `DataFrame` containing the `target_column` column, or
            a `Series` holding the target values, for validation.
        target_column: Name of the target column looked up when the targets are
            given as a `DataFrame`. Ignored for `Series` targets.
        periods: A positive `int`, the number of train/evaluate rounds the steps
            are divided into.

    Returns:
        A `LinearRegressor` object trained on the training data.

    Raises:
        ValueError: If `periods` is not positive.
    """
    if periods <= 0:
        raise ValueError("periods must be a positive integer")

    # Whole number of steps per period; at least one so every period trains.
    steps_per_period = max(1, int(steps) // periods)

    train_labels = _target_series(training_targets, target_column)
    validation_labels = _target_series(validation_targets, target_column)

    # Create a linear regressor object.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
            feature_columns=construct_feature_columns(training_examples),
            optimizer=my_optimizer
    )

    # Create input functions.
    training_input_fn = lambda: my_input_fn(training_examples,
        train_labels,
        batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(training_examples,
        train_labels,
        num_epochs=1,
        shuffle=False)
    predict_validation_input_fn = lambda: my_input_fn(validation_examples,
        validation_labels,
        num_epochs=1,
        shuffle=False)

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    print("Training model...")
    print("RMSE (on training data):")
    training_rmse = []
    validation_rmse = []
    for period in range(periods):
        # Train the model, starting from the prior state.
        linear_regressor.train(
                input_fn=training_input_fn,
                steps=steps_per_period,
        )
        # Take a break and compute predictions.
        training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
        training_predictions = np.array([item['predictions'][0] for item in training_predictions])

        validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)
        validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])

        # Compute training and validation loss.
        training_root_mean_squared_error = _rmse(training_predictions, train_labels)
        validation_root_mean_squared_error = _rmse(validation_predictions, validation_labels)
        # Occasionally print the current loss.
        print(" period %02d : %0.2f" % (period, training_root_mean_squared_error))
        # Add the loss metrics from this period to our list.
        training_rmse.append(training_root_mean_squared_error)
        validation_rmse.append(validation_root_mean_squared_error)
    print("Model training finished.")

    # Output a graph of loss metrics over periods.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    plt.plot(training_rmse, label="training")
    plt.plot(validation_rmse, label="validation")
    plt.legend()

    return linear_regressor


def _target_series(targets, target_column):
    """Return the target values: the named column of a DataFrame, or the input unchanged otherwise."""
    if isinstance(targets, pd.DataFrame):
        return targets[target_column]
    return targets


def _rmse(predictions, targets):
    """Root mean squared error between predictions and targets, both flattened to 1-D."""
    errors = np.asarray(predictions, dtype=float).ravel() - np.asarray(targets, dtype=float).ravel()
    return float(np.sqrt(np.mean(np.square(errors))))

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# TODO: generate PDF reports with PDFkit and Jinja2
#https://towardsdatascience.com/creating-pdf-reports-with-python-pdfkit-and-jinja2-templates-64a89158fa2d