In [75]:
import os
import pandas as pd
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [76]:
base_path = "/Users/saraawad/Desktop/Datasets/Google/"
correction_path = "/Users/saraawad/Desktop/flux-data-qaqc/sites/config/output"

In [77]:
class Helpers:
    def __init__(self):
        print("Helper")
        
    def convert_missing_values_nan(df):
        '''This function will convert -9999 to NaN'''
        df = df.replace(-9999.000000, np.NaN)
        return df

    def drop_nan_columns(df):
        '''Drops the columns having all theirs rows as Nans'''
        columns_to_exclude = ["Date", "Day", "Year", "Month", "Timestamp start"
                              , "Time", "TIMESTAMP", "Tier", "TIMESTAMP_START", "TIMESTAMP_END", "Day Status"]
        columns = df.columns
        for i in range(len(columns)):
            col = columns[i]
            if col in columns_to_exclude:
                continue
            nan_sum_col = df[col].isnull().sum()
            if nan_sum_col == len(df):
                df.drop(col, axis=1, inplace=True)
        return df
    
    def drop_nans_rows(df):
        '''This function will drop the rows having NaNs'''
        print("Before removing missing values:")
        print("number of rows:", df.shape[0], "\nnumber of columns:", df.shape[1])
        df = df.dropna(how='any')
        print("After removing missing values:")
        print("number of rows:", df.shape[0], "\nnumber of columns:", df.shape[1])
        return df
        
    def get_all_matching_columns(df, keyword):
        return df.filter(like=keyword).columns

    def generate_lags(df, column, lags_count): 
        for i in range(lags_count):
            lag_name = column + "-" + str(i + 1)
            df[lag_name] = df[column].shift(i + 1)
#             for j in range(i):
#                 df.loc[str(j+1), lag_name] = np.nan
#         df = df.dropna(how='any')
        return df

    def add_LE_conversion_rate(df, col):
        conversion_rate = 28.94
        new_col = col + "(mm)"
        df[new_col] = df[col] / conversion_rate
        return df

    def read_sites_data():
        file_path = os.path.join(base_path, "filtered_sites_all.xlsx")
        df = pd.read_excel(file_path)
        df.head()
        return df

    def export_data(df, file_path):
        export_path = os.path.join(base_path, file_path + ".csv")
        export_csv = df.to_csv(export_path, index=None, header=True)

    def load_data(file_path):
        df = pd.read_csv(file_path + ".csv", delimiter=',')
        return df
    
    def list_to_df(list_to_convert):
        '''This function will convert the provided list into a dataframe'''
        df = pd.concat(list_to_convert, sort=True)
        return df
    
    def get_files_directory(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
        listOfFile = os.listdir(dirName)
        allFiles = list()
        # Iterate over all the entries
        for entry in listOfFile:
            # Create full path
            if entry.endswith(".xlsx") or entry.endswith(".icloud") or entry.endswith(".DS_Store"):
                continue
            fullPath = os.path.join(dirName, entry)
            # If entry is a directory then get the list of files in this directory 
            if os.path.isdir(fullPath):
                allFiles = allFiles + Helpers.get_files_directory(fullPath)
            else:
                allFiles.append(fullPath)

        return allFiles

    def concat_dataframe_from_files(files, skipRowsNum, split_num):
        values = []
        for i in range(len(files)):
            file_path = files[i]
            head, file_name = os.path.split(file_path)
            #Get only the sheets having the variables
            if file_name.endswith(".csv"):
#                 print("file name", file_name)
                df = pd.read_csv(file_path, delimiter=',', skiprows=skipRowsNum)
                site_id = file_name.split("_")[split_num]
#                 print("site id in file:", site_id)
                df["Site Id"] = site_id
                values.append(df)
        return Helpers.list_to_df(values)   
    
    def generate_dataframe_from_files(dirName, skipRowsNum = 0, split_num = 0):
        files = Helpers.get_files_directory(dirName)
        df = Helpers.concat_dataframe_from_files(files, skipRowsNum, split_num)
        return df
    
    def drop_duplicate_merged_columns(df):
        # list comprehension of the cols that end with '_y'
        columns_to_drop = [x for x in df if x.endswith('_y')]
        df.drop(columns_to_drop, axis=1, inplace=True)  

In [81]:
class CorrectedData:

    def __init__(self, input_path, output_path):
        print("Initializer")
        self.input_path = input_path
        self.output_path = output_path
        
    def concat_sites(self, path):
        all_ebr_list = []
        files = Helpers.get_files_directory(path)
        for i in range(len(files)):
            file_path = files[i]
            head, file_name = os.path.split(file_path)
            df_site = pd.read_csv(file_path, delimiter=',', skiprows=0)
            site_id = file_name.split("_")[0]
            if file_name.endswith("daily_data.csv"):
                df_site["Site Id"] = site_id
                all_ebr_list.append(df_site)
            
        return Helpers.list_to_df(all_ebr_list)   
        
    def merge_correction_methods(self):
        ebr_path = os.path.join(self.input_path, "EBR/")
        bowen_path = os.path.join(self.input_path, "Bowen/")
        df_ebr = self.concat_sites(ebr_path)
        df_ebr = df_ebr.rename({'LE_corr': 'LE_ebr_corr', 'H_corr': 'H_ebr_corr', 
                           'ET': 'ET_ebr', 'ET_corr': 'ET_ebr_corr', 
                           'ETrF': 'ETrF_ebr', 'ETrF_filtered': 'ETrF_filtered_ebr'}, axis=1) 
        df_bowen = self.concat_sites(bowen_path)
        df_bowen = df_bowen.rename({'LE_corr': 'LE_bowen_corr', 'H_corr': 'H_bowen_corr', 
                           'ET':'ET_bowen', 'ET_corr': 'ET_bowen_corr', 
                           'ETrF': 'ETrF_bowen', 'ETrF_filtered': 'ETrF_filtered_bowen'}, axis=1) 
        df_all = pd.merge(left=df_ebr,right=df_bowen, left_on=['Site Id', 'date'], 
                          right_on=['Site Id', 'date'], suffixes=('', '_y'))
        Helpers.drop_duplicate_merged_columns(df_all)
        
        file_name = self.output_path + "all_sites_corrected"
        Helpers.export_data(df_all, file_name) 
        
        return df_all


In [82]:
if __name__ == "__main__":
    output_path = os.path.join(correction_path, "Merged/")
    am = CorrectedData(correction_path, output_path)
    df_all = am.merge_correction_methods()

Initializer


In [83]:
df_all["Site Id"].unique()

array(['US-Ced', 'US-A74', 'US-Twt', 'US-Var', 'US-SO2', 'US-Kon',
       'US-Tw2', 'US-Pon', 'US-Bi2', 'US-A32', 'US-AR1', 'US-Goo',
       'US-SP2', 'US-Snd', 'US-Wlr', 'US-Skr', 'US-AR2', 'US-Shd'],
      dtype=object)

In [84]:
df_all.head(10)
df_all[["input_LE", "LE_bowen_corr", "LE_ebr_corr", "input_H", "H_bowen_corr", "H_ebr_corr"]]

Unnamed: 0,input_LE,LE_bowen_corr,LE_ebr_corr,input_H,H_bowen_corr,H_ebr_corr
0,,76.165051,70.411027,,,
1,60.693556,70.612895,61.744799,77.398378,90.047839,78.738957
2,107.135289,127.536709,108.658759,85.455467,101.728469,86.670648
3,134.954978,128.281493,135.484160,102.333911,97.273529,102.735180
4,138.517261,136.322478,138.128031,86.831239,85.455413,86.587246
...,...,...,...,...,...,...
22369,,75.903437,39.753796,,,
22370,93.100000,108.739620,116.387749,33.100000,38.660380,41.379533
22371,-21.700000,-487.165000,-26.323984,25.700000,576.965000,31.176332
22372,,203.147222,127.504853,,,


In [85]:
class AmerifluxErrorGraph:
    def __init__(self, path):
        print("AmerifluxErrorGraph Initializer")
        self.path = path
        
          
    def get_error_metrics(self, y_true, y_predicted):
        r2_Score = r2_score(y_true, y_predicted)
        rmse_score = np.sqrt(mean_squared_error(y_true, y_predicted))
        mse_score = mean_squared_error(y_true, y_predicted)
        mae_score = mean_absolute_error(y_true, y_predicted)

        def mean_absolute_percentage_error(y_true, y_pred):
            y_true, y_pred = np.array(y_true), np.array(y_pred)
            return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        mape_score = mean_absolute_percentage_error(y_true, y_predicted)
        num = 2
        return (round(r2_Score, num), round(rmse_score, num), round(mse_score, num), 
                round(mae_score, num), round(mape_score, num))


    def generate_errors(self, df, first_col, second_col):
        df = df.replace(to_replace = np.nan, value =0) 
        errors = self.get_error_metrics(df[first_col], df[second_col])
        return errors
    
    def plot_et(self, df, site_id, first_column, first_b_column, second_column, path_to_save):
        fig, ax = plt.subplots(figsize=(45, 22))
        fig.subplots_adjust(bottom=0.15, left=0.2)
        plt.subplot(121)
        title1 = site_id + ": Comparison between " + first_column + " and " + second_column
        plt.rcParams.update({'font.size': 20})
        self.plot_sub(df, first_column, second_column, title1)
        
        plt.subplot(122)
        plt.rcParams.update({'font.size': 20})
        title2 = site_id + ": Comparison between " + first_b_column + " and " + second_column
        self.plot_sub(df, first_b_column, second_column, title2)

        plt.xticks(rotation=90)
#         ax.legend()
#         plt.show()
        plt.savefig(path_to_save + site_id + "_corr.png")
        plt.close(fig)

    def plot_sub(self, df, first_column, second_column, title):
        ax = sns.regplot(x=first_column, y=second_column, data=df, fit_reg=False)
        lims = [
            np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
            np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
            ]

        # now plot both limits against eachother
        ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
        ax.set_aspect('equal')
        ax.set_xlim(lims)
        ax.set_ylim(lims)
       
        plt.title(title)
        
        
    def prepare_data(self, df):
        unique_sites = df["Site Id"].unique()
        unique_sites

        r2_list, rmse_list, mse_list, mae_list, mape_list = [], [], [], [], []
        r2_b_list, rmse_b_list, mse_b_list, mae_b_list, mape_b_list = [], [], [], [], []
        first_column = "LE_bowen_corr"
        first_b_column = "LE_ebr_corr"

        second_column = "input_LE"
        for i in range(len(unique_sites)):
            site_id = unique_sites[i]
            df_site = df[df["Site Id"] == site_id]
            (r2, rmse, mse, mae, mape) = self.generate_errors(df_site, first_column, second_column)
            (r2_b, rmse_b, mse_b, mae_b, mape_b) = self.generate_errors(df_site, first_b_column, second_column)
            r2_list.append(r2)
            rmse_list.append(rmse)
            mse_list.append(mse)
            mae_list.append(mae)
            mape_list.append(mape)
            
            r2_b_list.append(r2_b)
            rmse_b_list.append(rmse_b)
            mse_b_list.append(mse_b)
            mae_b_list.append(mae_b)
            mape_b_list.append(mape_b)
            df_site['date'] = pd.to_datetime(df_site["date"])
            df_site.sort_values(by="date", inplace=True, ascending=True)
            #Plot the output feature
            if len(df_site) > 0:
                self.plot_et(df_site, site_id, first_column, first_b_column, second_column, self.path + "Graphs/")

        df_errors = pd.DataFrame({"Site Id": unique_sites,
                                  "LE": second_column,
                                  "LE Bowen": first_column,
                                  "R2": r2_list,
                                  "RMSE": rmse_list, 
                                  "MSE": rmse_list,
                                  "MAE": mae_list, 
                                  "MAPE": mape_list,
                                  "LE Ebr": first_b_column,
                                  "R2 ebr": r2_b_list,
                                  "RMSE ebr": rmse_b_list, 
                                  "MSE ebr": mse_b_list,
                                  "MAE ebr": mae_b_list, 
                                  "MAPE ebr": mape_b_list})
        
        print(df_errors)
 
        Helpers.export_data(df_errors, self.path + "corrections_errors")


In [86]:
full_path = os.path.join(base_path, "Ameriflux/Generated/")
am_eg = AmerifluxErrorGraph(full_path)
am_eg.prepare_data(df_all)

AmerifluxErrorGraph Initializer
   Site Id        LE       LE Bowen    R2    RMSE     MSE     MAE    MAPE  \
0   US-Ced  input_LE  LE_bowen_corr -1.24   52.41   52.41   39.94  108.85   
1   US-A74  input_LE  LE_bowen_corr -0.65  171.28  171.28  109.32   70.78   
2   US-Twt  input_LE  LE_bowen_corr -0.85   97.95   97.95   70.02     NaN   
3   US-Var  input_LE  LE_bowen_corr  0.74   33.53   33.53   19.55   56.28   
4   US-SO2  input_LE  LE_bowen_corr  0.85   14.04   14.04    9.68   34.87   
5   US-Kon  input_LE  LE_bowen_corr  0.40   46.69   46.69   28.36   89.45   
6   US-Tw2  input_LE  LE_bowen_corr  0.53   42.79   42.79   30.40   35.26   
7   US-Pon  input_LE  LE_bowen_corr -2.10  141.22  141.22  122.11   84.58   
8   US-Bi2  input_LE  LE_bowen_corr  0.07   64.92   64.92   41.58   64.80   
9   US-A32  input_LE  LE_bowen_corr  0.26   74.34   74.34   43.68   41.85   
10  US-AR1  input_LE  LE_bowen_corr  0.03   31.86   31.86   19.25   85.58   
11  US-Goo  input_LE  LE_bowen_corr  0.46   