SoftwareDefinedBuildings · gtfierro · Mar 1, 2019 · Mar 1, 2019 · Mar 1, 2019
diff --git a/apps/Data_quality_analysis/Clean_Data.py b/apps/Data_quality_analysis/Clean_Data.py
diff --git a/apps/Data_quality_analysis/Data Analysis_MDAL.ipynb b/apps/Data_quality_analysis/Data Analysis_MDAL.ipynb
diff --git a/apps/Data_quality_analysis/Data_Analysis_MDAL.ipynb b/apps/Data_quality_analysis/Data_Analysis_MDAL.ipynb
diff --git a/apps/Data_quality_analysis/Import_Data.py b/apps/Data_quality_analysis/Import_Data.py
diff --git a/apps/Data_quality_analysis/Main.ipynb b/apps/Data_quality_analysis/Main.ipynb
diff --git a/apps/Data_quality_analysis/Main.py b/apps/Data_quality_analysis/Main.py
@@ -8,6 +8,7 @@
 sys.path.append("..")
 from Energy_Analytics import Wrapper
 
+
 # Custom func
 def func(X, y):
     from sklearn.linear_model import LinearRegression

diff --git a/apps/Data_quality_analysis/Missing_data_dictionary.ipynb b/apps/Data_quality_analysis/Missing_data_dictionary.ipynb
@@ -0,0 +1,189 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Missing Data Periods to JSON"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import dataclient\n",
+    "import pandas as pd\n",
+    "import configparser\n",
+    "import json\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "from matplotlib import style\n",
+    "import numpy as np\n",
+    "style.use(\"seaborn-notebook\")\n",
+    "%matplotlib inline\n",
+    "import pprint\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append(\"..\")\n",
+    "from Wrapper import *\n",
+    "\n",
+    "\n",
+    "import ipywidgets as widgets\n",
+    "import plotly as py\n",
+    "import plotly.graph_objs as go\n",
+    "\n",
+    "#to install plotly extension for jupyter lab see: https://github.com/jupyterlab/jupyter-renderers/tree/master/packages/plotly-extension\n",
+    "\n",
+    "from scipy import special"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mdal_obj = Import_MDAL()\n",
+    "\n",
+    "start = '2016-01-01T00:00:00-08:00' #'2016-01-01T00:00:00Z' for UTC\n",
+    "end = '2019-02-01T00:00:00-08:00'# '2019-02-01T00:00:00Z' for UTC\n",
+    "\n",
+    "site_list = ['avenal-public-works-yard',\n",
+    "    'avenal-movie-theatre',\n",
+    "    'avenal-recreation-center',\n",
+    "    'avenal-animal-shelter',\n",
+    "    'avenal-veterans-hall',\n",
+    "    'south-berkeley-senior-center',\n",
+    "    'north-berkeley-senior-center',\n",
+    "    'hayward-station-1',\n",
+    "    'hayward-station-8',\n",
+    "    'orinda-community-center',\n",
+    "    'ciee',\n",
+    "    'berkeley-corporate-yard']\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "missing_energy_data={}\n",
+    "missing_weather_data={}\n",
+    "missing_tstat_data={}\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "Input Parameters:\n",
+    "obj: object returned from above MDAL Query\n",
+    "dictionary: pick dictionary that corresponds with data type\n",
+    "low_bound: all data <= this value will be interpreted as missing data\n",
+    "high_bound: all data > this value will be interpreted as missing data\n",
+    "\n",
+    "\n",
+    "Returns a JSON file with a dictionary with the missing data events (start and end)\n",
+    "        for each column in the inputted object\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "def event_duration(obj,dictionary,low_bound=None, high_bound=None):\n",
+    "    clean_data_obj = Clean_Data(obj.df)\n",
+    "    dictionary=clean_data_obj.event_duration(obj, dictionary, low_bound, high_bound)\n",
+    "    return dictionary"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Missing Thermostat Data periods "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for site in site_list:\n",
+    "    try:\n",
+    "        obj_tstat = mdal_obj.get_tstat(site=site, start=start, end=end, var='tstat_state', agg='MEAN', window='15min', aligned=True, return_names=True)\n",
+    "        event_duration(obj_tstat, dictionary=missing_tstat_data)\n",
+    "    except:\n",
+    "        print('no data in',site)\n",
+    "        \n",
+    "#Uncomment below to generate the json file\n",
+    "# with open('missing_tstat_data.json',\"w\") as write_file:\n",
+    "#     json.dump(missing_tstat_data, write_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Missing Energy Data periods "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for site in site_list:\n",
+    "    obj2 =  mdal_obj.get_meter(site=site, start=start, end=end, var='meter', agg='MEAN', window='15min', aligned=True, return_names=True)\n",
+    "    event_duration(obj2, dictionary=missing_energy_data)\n",
+    "\n",
+    "#Comment out below to not generate json files  \n",
+    "with open('missing_energy_data.json',\"w\") as write_file:\n",
+    "    json.dump(missing_energy_data, write_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Missing Weather Data periods "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for site in site_list:\n",
+    "    obj3 = mdal_obj.get_weather(site=site, start=start, end=end, var='weather', agg='MEAN', window='15min', aligned=True, return_names=True)\n",
+    "    event_duration(obj3, dictionary=missing_weather_data, low_bound=32, high_bound=110)\n",
+    "\n",
+    "#Comment out below to not generate json files  \n",
+    "with open('missing_weather_data.json',\"w\") as write_file:\n",
+    "    json.dump(missing_weather_data, write_file)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:mortar]",
+   "language": "python",
+   "name": "conda-env-mortar-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/apps/Data_quality_analysis/Model_Data.py b/apps/Data_quality_analysis/Model_Data.py
@@ -1,9 +1,13 @@
-""" This script splits the data into baseline and projection periods, runs models on them and displays metrics & plots.
+""" This script splits the data into baseline and projection periods, runs ML models on them and displays the metrics & plots.
 
-Last modified: October 30 2018
+Note
+----
+Last modified: Feb 4 2019
 
-Authors \n
-@author Pranav Gupta <phgupta@ucdavis.edu>
+
+Authors
+-------
+- Pranav Gupta <phgupta@ucdavis.edu>
 
 """
 
@@ -65,37 +69,41 @@ def __init__(self, df, input_col, output_col, alphas, cv,
         self.original_data = df
         self.cv = cv
 
+        # Populate input columns (independent columns)
         if not input_col: # Using all columns except the last as input_col
             input_col = list(self.original_data.columns)
             input_col.remove(output_col)
             self.input_col = input_col
         elif not isinstance(list):
-            raise SystemError('Input column should be a list.')
+            raise TypeError('Input column should be a list.')
         else:
             self.input_col = input_col
 
+        # Populate output column (dependent column)
         if not output_col:
-            raise SystemError('Please provide the target column.')
+            raise TypeError('Please provide the target column.')
         elif not isinstance(output_col, str):
-            raise SystemError('Target column should be a string.')
+            raise TypeError('Target column should be a string.')
         else:
             self.output_col = output_col
 
+        # Validate the type of alphas
         if not isinstance(alphas, list) and not isinstance(alphas, np.ndarray):
-            raise SystemError('alphas should be a list of int\'s or numpy ndarray.')
+            raise TypeError('alphas should be a list of int\'s or numpy ndarray.')
         else:
             self.alphas = alphas
 
+        # Validate that baseline_period, projection_period & exclude_time_period contain (start, end) pairs
         if (len(baseline_period) % 2 != 0):
-            raise SystemError('baseline period needs to be a multiple of 2 (i.e. have a start and end date)')
+            raise ValueError('baseline period needs to be a multiple of 2 (i.e. have a start and end date)')
         else:
             self.baseline_period = baseline_period
         if exclude_time_period and (len(exclude_time_period) % 2 != 0):
-            raise SystemError('exclude time period needs to be a multiple of 2 (i.e. have a start and end date)')
+            raise ValueError('exclude time period needs to be a multiple of 2 (i.e. have a start and end date)')
         else:
             self.exclude_time_period = exclude_time_period
         if projection_period and (len(projection_period) % 2 != 0):
-            raise SystemError('projection period needs to be a multiple of 2 (i.e. have a start and end date)')
+            raise ValueError('projection period needs to be a multiple of 2 (i.e. have a start and end date)')
         else:
             self.projection_period = projection_period
 
@@ -115,7 +123,7 @@ def __init__(self, df, input_col, output_col, alphas, cv,
 
 
     def split_data(self):
-        """ Split data according to baseline and projection time period values """
+        """ Split data according to baseline and projection time period values. """
 
         try:
             # Extract data ranging in time_period1
@@ -455,34 +463,6 @@ def best_model_fit(self):
 
         """
 
-        # X_train, X_test, y_train, y_test = train_test_split(self.baseline_in, self.baseline_out, 
-        #                                                     test_size=0.30, random_state=42)
-
-        # self.best_model.fit(X_train, y_train)
-        # self.y_true = y_test                        # Pandas Series
-        # self.y_pred = self.best_model.predict(X_test)    # numpy.ndarray
-
-        # # Set all negative values to zero since energy > 0
-        # self.y_pred[self.y_pred < 0] = 0
-
-        # # n and k values for adj r2 score
-        # self.n_test = X_test.shape[0]   # Number of points in data sample
-        # self.k_test = X_test.shape[1]   # Number of variables in model, excluding the constant
-
-        # # Store best model's metrics
-        # self.best_metrics['name']   = self.best_model_name
-        # self.best_metrics['r2']     = r2_score(self.y_true, self.y_pred)
-        # self.best_metrics['mse']    = mean_squared_error(self.y_true, self.y_pred)
-        # self.best_metrics['rmse']   = math.sqrt(self.best_metrics['mse'])
-        # self.best_metrics['adj_r2'] = self.adj_r2(self.best_metrics['r2'], self.n_test, self.k_test)
-
-        # # Normalized Mean Bias Error
-        # numerator = sum(self.y_true - self.y_pred)
-        # denominator = (self.n_test - self.k_test) * (sum(self.y_true) / len(self.y_true))
-        # self.best_metrics['nmbe'] = numerator / denominator
-
-        # return self.best_metrics
-
         self.best_model.fit(self.baseline_in, self.baseline_out)
 
         self.y_true = self.baseline_out                             # Pandas Series
@@ -507,7 +487,6 @@ def best_model_fit(self):
         denominator = (self.n_test - self.k_test) * (sum(self.y_true) / len(self.y_true))
         self.best_metrics['nmbe']   = numerator / denominator
 
-
         # MAPE can't have 0 values in baseline_out -> divide by zero error
         self.baseline_out_copy  = self.baseline_out[self.baseline_out != 0]
         self.baseline_in_copy   = self.baseline_in[self.baseline_in.index.isin(self.baseline_out_copy.index)]

diff --git a/apps/Data_quality_analysis/Plot_Data.py b/apps/Data_quality_analysis/Plot_Data.py
@@ -1,6 +1,6 @@
 """ This script contains functions for displaying various plots.
 
-Last modified: October 17 2018
+Last modified: Feb 4 2019
 
 Authors \n
 @author Pranav Gupta <phgupta@ucdavis.edu>
@@ -26,6 +26,7 @@ class Plot_Data:
     # Static variable to keep count of number of figures
     count = 1
 
+
     def __init__(self, figsize=(18,5)):
         """ Constructor.
 
@@ -140,11 +141,11 @@ def baseline_projection_plot(self, y_true, y_pred,
                     Plot_Data.count += 1
                     return fig, project_df['y_true'], project_df['y_pred']
                 except:
-                    raise SystemError("If projecting into the future, please specify project_ind_col that has data available \
+                    raise TypeError("If projecting into the future, please specify project_ind_col that has data available \
                                         in the future time period requested.")
 
         return fig, None, None
-
-if __name__ == '__main__':
 
+
+if __name__ == '__main__':
     obj = Plot_Data()
diff --git a/apps/Data_quality_analysis/Preprocess_Data.py b/apps/Data_quality_analysis/Preprocess_Data.py
@@ -1,10 +1,14 @@
 """ This script preprocesses a dataframe according to user specifications.
 
-Last modified: November 15 2018
+Note
+----
+Last modified: Feb 4 2019
 
-Authors \n
-@author Marco Pritoni <marco.pritoni@gmail.com>
-@author Pranav Gupta <phgupta@ucdavis.edu>
+
+Authors
+-------
+- Marco Pritoni <marco.pritoni@gmail.com>
+- Pranav Gupta <phgupta@ucdavis.edu>
 
 """
 
@@ -14,7 +18,7 @@
 
 class Preprocess_Data:
 
-    """ This class preprocesses a dataframe according to user specification """
+    """ This class preprocesses a dataframe according to user specification. """
 
     def __init__(self, df):
         """ Constructor.
@@ -172,16 +176,12 @@ def add_time_features(self, year=False, month=False, week=True, tod=True, dow=Tr
         # One-hot encode the time features
         for var in var_to_expand:
 
-                add_var = pd.get_dummies(data[var], prefix=var)
+                add_var = pd.get_dummies(data[var], prefix=var, drop_first=True)
 
                 # Add all the columns to the model data
                 data = data.join(add_var)
 
                 # Drop the original column that was expanded
                 data.drop(columns=[var], inplace=True)
 
-                # Drop last column to remove multi-collinearity
-                cols = [col for col in data.columns if var in col]
-                data.drop(columns=[cols[-1]], inplace=True)
-
         self.preprocessed_data = data