Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Analysis update (changes to class and notebooks) #88

Merged
merged 2 commits into from
Mar 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
946 changes: 938 additions & 8 deletions apps/Data_quality_analysis/Clean_Data.py

Large diffs are not rendered by default.

67,133 changes: 0 additions & 67,133 deletions apps/Data_quality_analysis/Data Analysis_MDAL.ipynb

This file was deleted.

868,606 changes: 868,606 additions & 0 deletions apps/Data_quality_analysis/Data_Analysis_MDAL.ipynb

Large diffs are not rendered by default.

454 changes: 274 additions & 180 deletions apps/Data_quality_analysis/Import_Data.py

Large diffs are not rendered by default.

616 changes: 116 additions & 500 deletions apps/Data_quality_analysis/Main.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions apps/Data_quality_analysis/Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
sys.path.append("..")
from Energy_Analytics import Wrapper


# Custom func
def func(X, y):
from sklearn.linear_model import LinearRegression
Expand Down
189 changes: 189 additions & 0 deletions apps/Data_quality_analysis/Missing_data_dictionary.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Missing Data Periods to JSON"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import dataclient\n",
"import pandas as pd\n",
"import configparser\n",
"import json\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import style\n",
"import numpy as np\n",
"style.use(\"seaborn-notebook\")\n",
"%matplotlib inline\n",
"import pprint\n",
"\n",
"import sys\n",
"sys.path.append(\"..\")\n",
"from Wrapper import *\n",
"\n",
"\n",
"import ipywidgets as widgets\n",
"import plotly as py\n",
"import plotly.graph_objs as go\n",
"\n",
"#to install plotly extension for jupyter lab see: https://github.com/jupyterlab/jupyter-renderers/tree/master/packages/plotly-extension\n",
"\n",
"from scipy import special"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"mdal_obj = Import_MDAL()\n",
"\n",
"start = '2016-01-01T00:00:00-08:00' #'2016-01-01T00:00:00Z' for UTC\n",
"end = '2019-02-01T00:00:00-08:00'# '2019-02-01T00:00:00Z' for UTC\n",
"\n",
"site_list = ['avenal-public-works-yard',\n",
" 'avenal-movie-theatre',\n",
" 'avenal-recreation-center',\n",
" 'avenal-animal-shelter',\n",
" 'avenal-veterans-hall',\n",
" 'south-berkeley-senior-center',\n",
" 'north-berkeley-senior-center',\n",
" 'hayward-station-1',\n",
" 'hayward-station-8',\n",
" 'orinda-community-center',\n",
" 'ciee',\n",
" 'berkeley-corporate-yard']\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"missing_energy_data={}\n",
"missing_weather_data={}\n",
"missing_tstat_data={}\n",
"\n",
"\"\"\"\n",
"\n",
"Input Parameters:\n",
"obj: object returned from above MDAL Query\n",
"dictionary: pick dictionary that corresponds with data type\n",
"low_bound: all data <= this value will be interpreted as missing data\n",
"high_bound: all data > this value will be interpreted as missing data\n",
"\n",
"\n",
"Returns a JSON file with a dictionary with the missing data events (start and end)\n",
" for each column in the inputted object\n",
"\n",
"\"\"\"\n",
"\n",
"def event_duration(obj,dictionary,low_bound=None, high_bound=None):\n",
" clean_data_obj = Clean_Data(obj.df)\n",
" dictionary=clean_data_obj.event_duration(obj, dictionary, low_bound, high_bound)\n",
" return dictionary"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Missing Thermostat Data periods "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"for site in site_list:\n",
" try:\n",
" obj_tstat = mdal_obj.get_tstat(site=site, start=start, end=end, var='tstat_state', agg='MEAN', window='15min', aligned=True, return_names=True)\n",
" event_duration(obj_tstat, dictionary=missing_tstat_data)\n",
" except:\n",
" print('no data in',site)\n",
" \n",
"#Comment out below to not generate json files \n",
"# with open('missing_tstat_data.json',\"w\") as write_file:\n",
"# json.dump(missing_tstat_data, write_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Missing Energy Data periods "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"for site in site_list:\n",
" obj2 = mdal_obj.get_meter(site=site, start=start, end=end, var='meter', agg='MEAN', window='15min', aligned=True, return_names=True)\n",
" event_duration(obj2, dictionary=missing_energy_data)\n",
"\n",
"#Comment out below to not generate json files \n",
"with open('missing_energy_data.json',\"w\") as write_file:\n",
" json.dump(missing_energy_data, write_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Missing Weather Data periods "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"for site in site_list:\n",
" obj3 = mdal_obj.get_weather(site=site, start=start, end=end, var='weather', agg='MEAN', window='15min', aligned=True, return_names=True)\n",
" event_duration(obj3, dictionary=missing_weather_data, low_bound=32, high_bound=110)\n",
"\n",
"#Comment out below to not generate json files \n",
"with open('missing_weather_data.json',\"w\") as write_file:\n",
" json.dump(missing_weather_data, write_file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:mortar]",
"language": "python",
"name": "conda-env-mortar-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
61 changes: 20 additions & 41 deletions apps/Data_quality_analysis/Model_Data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
""" This script splits the data into baseline and projection periods, runs models on them and displays metrics & plots.
""" This script splits the data into baseline and projection periods, runs ML models on them and displays the metrics & plots.

Last modified: October 30 2018
Note
----
Last modified: Feb 4 2019

Authors \n
@author Pranav Gupta <phgupta@ucdavis.edu>

Authors
-------
- Pranav Gupta <phgupta@ucdavis.edu>

"""

Expand Down Expand Up @@ -65,37 +69,41 @@ def __init__(self, df, input_col, output_col, alphas, cv,
self.original_data = df
self.cv = cv

# Populate input columns (independent columns)
if not input_col: # Use all columns except output_col as input_col
input_col = list(self.original_data.columns)
input_col.remove(output_col)
self.input_col = input_col
elif not isinstance(list):
raise SystemError('Input column should be a list.')
raise TypeError('Input column should be a list.')
else:
self.input_col = input_col

# Populate output column (dependent column)
if not output_col:
raise SystemError('Please provide the target column.')
raise TypeError('Please provide the target column.')
elif not isinstance(output_col, str):
raise SystemError('Target column should be a string.')
raise TypeError('Target column should be a string.')
else:
self.output_col = output_col

# Validate the type of alphas
if not isinstance(alphas, list) and not isinstance(alphas, np.ndarray):
raise SystemError('alphas should be a list of int\'s or numpy ndarray.')
raise TypeError('alphas should be a list of int\'s or numpy ndarray.')
else:
self.alphas = alphas

# Validate the type of baseline_period, projection_period & exclude_time_period
if (len(baseline_period) % 2 != 0):
raise SystemError('baseline period needs to be a multiple of 2 (i.e. have a start and end date)')
raise ValueError('baseline period needs to be a multiple of 2 (i.e. have a start and end date)')
else:
self.baseline_period = baseline_period
if exclude_time_period and (len(exclude_time_period) % 2 != 0):
raise SystemError('exclude time period needs to be a multiple of 2 (i.e. have a start and end date)')
raise ValueError('exclude time period needs to be a multiple of 2 (i.e. have a start and end date)')
else:
self.exclude_time_period = exclude_time_period
if projection_period and (len(projection_period) % 2 != 0):
raise SystemError('projection period needs to be a multiple of 2 (i.e. have a start and end date)')
raise ValueError('projection period needs to be a multiple of 2 (i.e. have a start and end date)')
else:
self.projection_period = projection_period

Expand All @@ -115,7 +123,7 @@ def __init__(self, df, input_col, output_col, alphas, cv,


def split_data(self):
""" Split data according to baseline and projection time period values """
""" Split data according to baseline and projection time period values. """

try:
# Extract data ranging in time_period1
Expand Down Expand Up @@ -455,34 +463,6 @@ def best_model_fit(self):

"""

# X_train, X_test, y_train, y_test = train_test_split(self.baseline_in, self.baseline_out,
# test_size=0.30, random_state=42)

# self.best_model.fit(X_train, y_train)
# self.y_true = y_test # Pandas Series
# self.y_pred = self.best_model.predict(X_test) # numpy.ndarray

# # Set all negative values to zero since energy > 0
# self.y_pred[self.y_pred < 0] = 0

# # n and k values for adj r2 score
# self.n_test = X_test.shape[0] # Number of points in data sample
# self.k_test = X_test.shape[1] # Number of variables in model, excluding the constant

# # Store best model's metrics
# self.best_metrics['name'] = self.best_model_name
# self.best_metrics['r2'] = r2_score(self.y_true, self.y_pred)
# self.best_metrics['mse'] = mean_squared_error(self.y_true, self.y_pred)
# self.best_metrics['rmse'] = math.sqrt(self.best_metrics['mse'])
# self.best_metrics['adj_r2'] = self.adj_r2(self.best_metrics['r2'], self.n_test, self.k_test)

# # Normalized Mean Bias Error
# numerator = sum(self.y_true - self.y_pred)
# denominator = (self.n_test - self.k_test) * (sum(self.y_true) / len(self.y_true))
# self.best_metrics['nmbe'] = numerator / denominator

# return self.best_metrics

self.best_model.fit(self.baseline_in, self.baseline_out)

self.y_true = self.baseline_out # Pandas Series
Expand All @@ -507,7 +487,6 @@ def best_model_fit(self):
denominator = (self.n_test - self.k_test) * (sum(self.y_true) / len(self.y_true))
self.best_metrics['nmbe'] = numerator / denominator


# MAPE can't have 0 values in baseline_out -> divide by zero error
self.baseline_out_copy = self.baseline_out[self.baseline_out != 0]
self.baseline_in_copy = self.baseline_in[self.baseline_in.index.isin(self.baseline_out_copy.index)]
Expand Down
9 changes: 5 additions & 4 deletions apps/Data_quality_analysis/Plot_Data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" This script contains functions for displaying various plots.

Last modified: October 17 2018
Last modified: Feb 4 2019

Authors \n
@author Pranav Gupta <phgupta@ucdavis.edu>
Expand All @@ -26,6 +26,7 @@ class Plot_Data:
# Static variable to keep count of number of figures
count = 1


def __init__(self, figsize=(18,5)):
""" Constructor.

Expand Down Expand Up @@ -140,11 +141,11 @@ def baseline_projection_plot(self, y_true, y_pred,
Plot_Data.count += 1
return fig, project_df['y_true'], project_df['y_pred']
except:
raise SystemError("If projecting into the future, please specify project_ind_col that has data available \
raise TypeError("If projecting into the future, please specify project_ind_col that has data available \
in the future time period requested.")

return fig, None, None

if __name__ == '__main__':


if __name__ == '__main__':
obj = Plot_Data()
20 changes: 10 additions & 10 deletions apps/Data_quality_analysis/Preprocess_Data.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
""" This script preprocesses a dataframe according to user specifications.

Last modified: November 15 2018
Note
----
Last modified: Feb 4 2019

Authors \n
@author Marco Pritoni <marco.pritoni@gmail.com>
@author Pranav Gupta <phgupta@ucdavis.edu>

Authors
-------
- Marco Pritoni <marco.pritoni@gmail.com>
- Pranav Gupta <phgupta@ucdavis.edu>

"""

Expand All @@ -14,7 +18,7 @@

class Preprocess_Data:

""" This class preprocesses a dataframe according to user specification """
""" This class preprocesses a dataframe according to user specification. """

def __init__(self, df):
""" Constructor.
Expand Down Expand Up @@ -172,16 +176,12 @@ def add_time_features(self, year=False, month=False, week=True, tod=True, dow=Tr
# One-hot encode the time features
for var in var_to_expand:

add_var = pd.get_dummies(data[var], prefix=var)
add_var = pd.get_dummies(data[var], prefix=var, drop_first=True)

# Add all the columns to the model data
data = data.join(add_var)

# Drop the original column that was expanded
data.drop(columns=[var], inplace=True)

# Drop last column to remove multi-collinearity
cols = [col for col in data.columns if var in col]
data.drop(columns=[cols[-1]], inplace=True)

self.preprocessed_data = data
Loading