<h3> Load Libraries </h3>

In [1]:
import glob
import os
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import zipfile
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
class Helpers:
    def __init__(self):
        print("Helper")
        
    def convert_missing_values_nan(df):
        '''This function will convert -9999 to NaN'''
        df = df.replace(-9999.000000, np.NaN)
        return df

    def drop_nan_columns(df):
        '''Drops the columns having all theirs rows as Nans'''
        columns_to_exclude = ["Date", "Day", "Year", "Month", "Timestamp start"
                              , "Time", "TIMESTAMP", "Tier", "TIMESTAMP_START", "TIMESTAMP_END", "Day Status"]
        columns = df.columns
        for i in range(len(columns)):
            col = columns[i]
            if col in columns_to_exclude:
                continue
            nan_sum_col = df[col].isnull().sum()
            if nan_sum_col == len(df):
                df.drop(col, axis=1, inplace=True)
        return df
    
    def drop_nans_rows(df):
        '''This function will drop the rows having NaNs'''
        print("Before removing missing values:")
        print("number of rows:", df.shape[0], "\nnumber of columns:", df.shape[1])
        df = df.dropna(how='any')
        print("After removing missing values:")
        print("number of rows:", df.shape[0], "\nnumber of columns:", df.shape[1])
        return df
        
    def get_all_matching_columns(df, keyword):
        return df.filter(like=keyword).columns

    def generate_lags(df, column, lags_count): 
        for i in range(lags_count):
            lag_name = column + "-" + str(i + 1)
            df[lag_name] = df[column].shift(i + 1)
#             for j in range(i):
#                 df.loc[str(j+1), lag_name] = np.nan
#         df = df.dropna(how='any')
        return df

    def add_LE_conversion_rate(df, col):
        conversion_rate = 28.94
        new_col = col + "(mm)"
        df[new_col] = df[col] / conversion_rate
        return df

    def read_sites_data():
        file_path = os.path.join(base_path, "filtered_sites_all.xlsx")
        df = pd.read_excel(file_path)
        df.head()
        return df

    def export_data(df, file_path):
        export_path = os.path.join(base_path, file_path + ".csv")
        export_csv = df.to_csv(export_path, index=None, header=True)

    def load_data(file_path):
        df = pd.read_csv(file_path + ".csv", delimiter=',')
        return df
    
    def list_to_df(list_to_convert):
        '''This function will convert the provided list into a dataframe'''
        df = pd.concat(list_to_convert, sort=True)
        return df
    
    def get_files_directory(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
        listOfFile = os.listdir(dirName)
        allFiles = list()
        # Iterate over all the entries
        for entry in listOfFile:
            # Create full path
            if entry.endswith(".xlsx") or entry.endswith(".icloud") or entry.endswith(".DS_Store"):
                continue
            fullPath = os.path.join(dirName, entry)
            # If entry is a directory then get the list of files in this directory 
            if os.path.isdir(fullPath):
                allFiles = allFiles + Helpers.get_files_directory(fullPath)
            else:
                allFiles.append(fullPath)

        return allFiles

    def concat_dataframe_from_files(files, skipRowsNum, split_num):
        values = []
        for i in range(len(files)):
            file_path = files[i]
            head, file_name = os.path.split(file_path)
            #Get only the sheets having the variables
            if file_name.endswith(".csv"):
#                 print("file name", file_name)
                df = pd.read_csv(file_path, delimiter=',', skiprows=skipRowsNum)
                site_id = file_name.split("_")[split_num]
#                 print("site id in file:", site_id)
                df["Site Id"] = site_id
                values.append(df)
        return Helpers.list_to_df(values)   
    
    def generate_dataframe_from_files(dirName, skipRowsNum = 0, split_num = 0):
        files = Helpers.get_files_directory(dirName)
        df = Helpers.concat_dataframe_from_files(files, skipRowsNum, split_num)
        return df
        

In [5]:
class EuroFlux:

    def __init__(self, input_path, output_path):
        print("Initializer")
        self.input_path = input_path
        self.output_path = output_path
        
    def unzip_folder(self, filename):
        if filename.endswith(".zip"):
            name = os.path.splitext(os.path.basename(filename))[0]
            path = self.input_path + name
            if not os.path.isdir(path):
                try:
                    zip = zipfile.ZipFile(filename)

                    os.mkdir(path)
                    zip.extractall(path=path)
                    zip.close()
                    os.remove(filename)
                except:
                    print("BAD ZIP: ", filename)
        
    def unzip_folders(self):
        files = Helpers.get_files_directory(input_path)
        for i in range(len(files)):
            file = files[i]
            head, file_name = os.path.split(file)
            if file_name.endswith(".zip"):
                self.unzip_folder(file)
                
    def read_l2_data(self):
        files = Helpers.get_files_directory(input_path)
        l2_files = []
        df_list = []
        for i in range(len(files)):
            file = files[i]
            head, file_name = os.path.split(file)
            if file_name.endswith(".txt"):
                components = file_name.split("_")
                if components[1] == "L2":
                    print("L2 file:", file_name)
                    l2_files.append(file_name)
                    df = pd.read_csv(file , sep=",")
                    df["Site Id"] = components[3]
                    df["Year"] = components[4]
                    df["Version"] = components[5]
                    df_list.append(df)
                    
        return Helpers.list_to_df(df_list)

    def read_l3_data(self):
        files = Helpers.get_files_directory(input_path)
        l2_files = []
        df_list = []
        for i in range(len(files)):
            file = files[i]
            head, file_name = os.path.split(file)
            if file_name.endswith(".txt"):
                components = file_name.split("_")
                if len(components) > 2:
                    if components[2] == "L3":
                        print("L3 file:", file_name)
                        l2_files.append(file_name)
                        df = pd.read_csv(file , sep=",")
                        df["Site Id"] = components[3]
                        df["Year"] = components[4]
                        df["Version"] = components[5]
                        df_list.append(df)
                    
        return Helpers.list_to_df(df_list)
    
    def read_l4_data(self):
        files = Helpers.get_files_directory(input_path)
        l2_files = []
        df_list = []
        for i in range(len(files)):
            file = files[i]
            head, file_name = os.path.split(file)
            if file_name.endswith(".txt"):
                components = file_name.split("_")
                if len(components) > 2:
                    if (components[2] == "L4") and (components[3] == "h"):
                        print("L4 file:", file_name)
                        l2_files.append(file_name)
                        df = pd.read_csv(file , sep=",")
                        df["Site Id"] = components[4]
                        df["Year"] = components[5]
                        df["Version"] = components[6]
                        df_list.append(df)
                    
        return Helpers.list_to_df(df_list)
                    
input_path = "/Users/saraawad/Desktop/Datasets/Google/Euroflux/"
output_path = ""
ef = EuroFlux(input_path, output_path)
ef.unzip_folders()
df2_list = ef.read_l2_data()
df2_list.head()
print("-----------------------------------------")

df3_list = ef.read_l3_data()
df3_list.head()
print("-----------------------------------------")

df4_list = ef.read_l4_data()
df4_list.head()

Initializer
L2 file: EFDC_L2_Flx_FIJok_2002_v07_30m.txt
L2 file: EFDC_L2_Flx_ITCA2_2014_v014_30m.txt
L2 file: EFDC_L2_Flx_IECa1_2008_v02_30m.txt
L2 file: EFDC_L2_Flx_ITBCi_2010_v030_30m.txt
L2 file: EFDC_L2_Flx_FRGri_2007_v09_30m.txt
L2 file: EFDC_L2_Flx_FRGri_2013_v06_30m.txt
L2 file: EFDC_L2_Flx_FRAvi_2004_v01_30m.txt
L2 file: EFDC_L2_Flx_IECa1_2004_v03_30m.txt
L2 file: EFDC_L2_Flx_ITCA2_2012_v015_30m.txt
L2 file: EFDC_L2_Flx_ITCA2_2011_v06_30m.txt
L2 file: EFDC_L2_Flx_ITBCi_2014_v07_30m.txt
L2 file: EFDC_L2_Flx_FRGri_2006_v07_30m.txt
L2 file: EFDC_L2_Flx_FRGri_2015_v04_30m.txt
L2 file: EFDC_L2_Flx_ITCas_2007_v02_30m.txt
L2 file: EFDC_L2_Flx_ITCA2_2013_v017_30m.txt
L2 file: EFDC_L2_Flx_IECa1_2007_v02_30m.txt
L2 file: EFDC_L2_Flx_ITBCi_2008_v05_30m.txt
L2 file: EFDC_L2_Flx_ITCas_2008_v01_30m.txt
L2 file: EFDC_L2_Flx_FRGri_2009_v07_30m.txt
L2 file: EFDC_L2_Flx_FRGri_2016_v05_30m.txt
L2 file: EFDC_L2_Flx_ITBCi_2006_v01_30m.txt
L2 file: EFDC_L2_Flx_FIJok_2001_v07_30m.txt
L2 file: EFDC_L2

Unnamed: 0,Day,DoY,GPP_or_ANN,GPP_or_MDS,GPP_st_ANN,GPP_st_MDS,H_f,H_fqc,Hour,LE_f,...,Ta_f,Ta_fqc,Ts_f,Ts_fqc,VPD_f,VPD_fqc,Version,Year,qf_NEE_or,qf_NEE_st
0,1,1.021,-0.141,-0.029,-0.248,-0.026,-26.67,0,0.5,2.19,...,8.12,0,8.31,0,-9999.0,-9999,v01.txt,2008,259,259
1,1,1.042,0.387,0.387,0.391,0.391,-25.78,0,1.0,2.06,...,7.78,0,8.25,0,-9999.0,-9999,v01.txt,2008,0,0
2,1,1.063,0.225,0.225,0.229,0.229,-29.29,0,1.5,1.25,...,7.43,0,8.2,0,-9999.0,-9999,v01.txt,2008,0,0
3,1,1.083,0.321,0.321,0.325,0.325,-25.95,0,2.0,1.05,...,7.62,0,8.14,0,-9999.0,-9999,v01.txt,2008,0,0
4,1,1.104,0.225,0.225,0.229,0.229,-33.06,0,2.5,0.72,...,7.31,0,8.1,0,-9999.0,-9999,v01.txt,2008,0,0


In [8]:
base_path = "/Users/saraawad/Desktop/Datasets/Google/"
Helpers.export_data(df2_list, "EuroFlux/Processed Data/L2")
Helpers.export_data(df3_list, "EuroFlux/Processed Data/L3")
Helpers.export_data(df4_list, "EuroFlux/Processed Data/L4")