In [1]:
import numpy as np
import pandas as pd 

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


import seaborn as sns

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
os.path.join(PROJECT_ROOT_DIR)
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
CLEAREDDATA_PATH = os.path.join(PROJECT_ROOT_DIR, "cleareddata")
os.makedirs(IMAGES_PATH, exist_ok=True)
os.makedirs(CLEAREDDATA_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

from pipe_classes import *

./data/nyc_benchmarking_disclosure_2017_consumption_data.xlsx
./data/scotch_review.csv


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel


In [2]:
data = pd.read_excel(PROJECT_ROOT_DIR+'/data/nyc_benchmarking_disclosure_2017_consumption_data.xlsx','Information and Metrics')




In [3]:
data = data.replace({'Not Available': np.nan})

In [4]:
missing_values_table(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Propane Use (kBtu),34354,100.0
District Hot Water Use (kBtu),34347,100.0
District Chilled Water Use (kBtu),34325,99.9
Fuel Oil #1 Use (kBtu),34319,99.9
Diesel #2 Use (kBtu),34285,99.8
Address 2 (self-reported),33928,98.8
Fuel Oil #5 & 6 Use (kBtu),33871,98.6
District Steam Use (kBtu),32787,95.4
Parent Property Name,32632,95.0
Parent Property Id,32632,95.0


In [5]:
data.select_dtypes(exclude=['number']).keys()

Index(['Property Name', 'Parent Property Id', 'Parent Property Name',
       'NYC Borough, Block and Lot (BBL) self-reported',
       'NYC Building Identification Number (BIN)', 'Address 1 (self-reported)',
       'Address 2 (self-reported)', 'Postal Code', 'Street Number',
       'Street Name', 'Borough', 'DOF Gross Floor Area (ft²)',
       'Primary Property Type - Self Selected',
       'List of All Property Use Types at Property',
       'Largest Property Use Type', '2nd Largest Property Use Type',
       '3rd Largest Property Use Type', 'Metered Areas (Energy)',
       'Metered Areas  (Water)', 'Annual Maximum Demand (MM/YYYY)',
       'Water Required?', 'Generation Date',
       'DOF Benchmarking Submission Status'],
      dtype='object')

In [6]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

reg_target="ENERGY STAR Score"
spec_cols=['Water Use (All Water Sources) (kgal)','log_Water Use (All Water Sources) (kgal)','Largest Property Use Type - Gross Floor Area (ft²)','index','Order','Property Id']

prep_pipeline = Pipeline([
        ('deleter', MissVals_Deleter()),
        ('outremover', Outliers_Remover(['Site EUI (kBtu/ft²)'])),
        ('feature_adderencoder', Feature_AdderEncoder(reg_target,['Borough', 'Largest Property Use Type'])),
        ('collinearfeatures_remover', CollinearFeatures_Remover(reg_target)),
        ('speccolumn_remover', SpecColumn_Remover(spec_cols)),
        ('strattraintest_splitter', StratTrainTest_Splitter(reg_target, [0., 20., 45., 70., 90., np.inf], [1, 2, 3, 4, 5], verbose = True)),
    ])

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X, X_test, y, y_test = prep_pipeline.fit_transform(data)

Your selected dataframe has 60 columns.
There are 43 columns that have missing values.
By the remove percentage criterion 50, we may remove 19 columns.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['sqrt_' + col] = np.sqrt(numeric_subset[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_subset['log_' + col] = np.log(numeric_subset[col])
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

(0, 83)
(24724, 83)
               Overall  Stratified    Random  Rand. %error  Strat. %error
(0.0, 20.0]   0.196449    0.196414  0.195201     -0.635245      -0.017647
(20.0, 45.0]  0.197177    0.197223  0.199110      0.980561       0.023400
(45.0, 70.0]  0.223184    0.223106  0.220005     -1.424188      -0.034945
(70.0, 90.0]  0.198269    0.198301  0.199110      0.424365       0.016412
(90.0, inf]   0.184922    0.184956  0.186573      0.893172       0.018375


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
X.shape

(17306, 82)

In [8]:
X_test.shape

(7418, 82)

In [9]:
y.shape

(17306, 1)

In [10]:
y_test.shape

(7418, 1)

In [11]:
X = num_pipeline.fit_transform(X)
X

array([[-1.0869268 , -0.19386517,  1.2861418 , ..., -0.01075082,
        -0.0672868 ,  0.        ],
       [ 0.6945068 , -0.4356981 ,  0.14188102, ..., -0.01075082,
        -0.0672868 ,  0.        ],
       [ 0.6919314 , -0.39547807, -0.74809957, ..., -0.01075082,
        -0.0672868 ,  0.        ],
       ...,
       [-1.0905083 ,  1.0099249 ,  1.731132  , ..., -0.01075082,
        -0.0672868 ,  0.        ],
       [-1.0788532 ,  0.00401228,  0.45973125, ..., -0.01075082,
        -0.0672868 ,  0.        ],
       [-1.0838028 , -0.4256431 , -1.0341649 , ..., -0.01075082,
        -0.0672868 ,  0.        ]], dtype=float32)

In [12]:
X_test = num_pipeline.fit_transform(X_test)
X_test

array([[-1.0936925 , -0.20781262, -0.6964435 , ..., -0.02322757,
        -0.06478088,  0.        ],
       [ 0.62156785, -0.48219168, -1.0157301 , ..., -0.02322757,
        -0.06478088,  0.        ],
       [-0.22004174, -0.4518129 , -0.9518728 , ..., -0.02322757,
        -0.06478088,  0.        ],
       ...,
       [-1.0978166 , -0.27056694,  1.3789197 , ..., -0.02322757,
        -0.06478088,  0.        ],
       [-1.0970192 ,  0.91672206, -0.9838015 , ..., -0.02322757,
        -0.06478088,  0.        ],
       [-1.0919884 , -0.081163  , -0.1217275 , ..., -0.02322757,
        -0.06478088,  0.        ]], dtype=float32)