In [1]:
#############################################################################
####################Import all packages######################################
#############################################################################
import os
import random
import numpy as np
#import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
#import pandas_profiling as pp
import pandas as pd
# Widen the pandas display limits so large frames are shown without truncation
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None  # default='warn'
#############################################################################
# Make Jupyter Notebook show ALL output of a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
def read_all_csvs_in_folder(path):
    '''Read every usable CSV file in a folder and stack them into one DataFrame.

    A file is skipped (and reported on stdout) when it
      1. has a file size of 0 bytes,
      2. cannot be parsed because it contains no columns at all
         (pandas raises ``EmptyDataError``), or
      3. has a header line but no data rows.
    All remaining files are concatenated row-wise with a fresh 0..n-1 index.
    A summary with the number of files found, the number of files appended
    and the names of the skipped files is printed at the end.

    Parameters
    ----------
    path : str
        Folder to search. Only files matching ``*.csv`` directly inside this
        folder are considered.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all non-empty CSV files, in sorted file-name order.
        An empty DataFrame is returned when no file contributed any data.
    '''
    import glob
    import os

    import pandas as pd

    def _announce_skip(skipped_file, reason):
        # Print the banner shown for every file that is left out of the merge
        print('')
        print('------------------------------------------------------')
        print('!!! SKIPPING FILE: ' + skipped_file)
        print('    REASON: ' + reason)
        print('------------------------------------------------------')
        print('')

    # glob returns files in arbitrary order; sorting keeps the row order of the
    # merged frame reproducible from run to run.
    all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

    list_with_dfs = []      # one DataFrame per successfully read file
    zero_files = []         # files with a size of 0 bytes
    empty_files = []        # files pandas could not find any columns in
    empty_row_files = []    # files with headers but no data rows

    print('')
    print('')
    print('#####################################################')
    print('STARTING READ IN')
    print('#####################################################')
    print('')

    for filename in all_files:
        print('Reading current file:')
        print(filename)

        # A 0-byte file has no data for sure and would break during read in
        if os.path.getsize(filename) == 0:
            zero_files.append(filename)
            _announce_skip(filename, 'FILE SIZE == 0')
            continue

        try:
            df = pd.read_csv(filename, index_col=None, header=0)
        except pd.errors.EmptyDataError:
            _announce_skip(filename, 'FILE ENTIRELY EMPTY WITHOUT HEADERS')
            empty_files.append(filename)
            continue

        # df.empty is also True when the file DOES have headers but no row data
        if df.empty:
            empty_row_files.append(filename)
            _announce_skip(filename, 'FILE CONTAINS NO ROW DATA')
            continue

        list_with_dfs.append(df)

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame when the folder held no usable CSV file.
    if list_with_dfs:
        frame = pd.concat(list_with_dfs, axis=0, ignore_index=True)
    else:
        frame = pd.DataFrame()

    print('')
    print('#####################################################')
    print('!!! DONE READING & MERGING DATA !!!')
    print('#####################################################')
    print('')
    print('------------------------------------------------------')
    print('All files: ' + str(len(all_files)))
    print('')
    print('Successfully appended files: ' + str(len(list_with_dfs)))
    print('------------------------------------------------------')
    print('------------------------------------------------------')
    print('')
    print('Empty files without headers: ' + str(len(empty_files)))
    print(empty_files)
    print('')
    print('Zero size files: ' + str(len(zero_files)))
    print(zero_files)
    print('')
    print('Empty row files: ' + str(len(empty_row_files)))
    print(empty_row_files)
    print('')
    print('#####################################################')

    return frame

In [1]:
# Read in files and merge them:
# every usable CSV in the ground-control folder is stacked into one DataFrame.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path_control = '/home/taco/Documents/greta_forarex/new_data/postflight_control_FINAL/Ground control_raw_20190817/'
control= read_all_csvs_in_folder(path_control)



In [4]:
# (rows, columns) of the merged frame, before any columns are dropped
control.shape

(202441, 70)

In [5]:
# Keep only the columns listed below and discard every other column of the merged frame
cols_to_keep = [
    'timeStamp',
    'Exp0_OxygenTemp',
    'Exp0_OxygenpercentO2',
    'Exp0_PhValue',
    'Exp1_OxygenTemp',
    'Exp1_OxygenpercentO2',
    'Pressure_LateAccess',
]

control = control.loc[:, cols_to_keep]

In [25]:
# Drop every row that contains at least one missing value (NA).
# The amount of missing data was judged negligible, so no imputation is done.
control = control.dropna()

In [8]:
# Convert the raw `timeStamp` values (interpreted as seconds since the Unix epoch)
# into timezone-naive pandas datetimes.
# NOTE(review): the recorded times are apparently UTC and an earlier note asked for a
# shift to Swedish local time; no shift or timezone conversion is applied here, so the
# values stay as recorded. If local time is needed, localize to 'UTC' and convert to
# 'Europe/Stockholm' -- confirm with the data owner.
# Plain column assignment replaces the column (and its dtype) outright instead of
# writing the datetimes into the existing numeric column as `.loc[:, col] = ...` attempts.
control['timeStamp'] = pd.to_datetime(control['timeStamp'], unit='s')


In [10]:
# Check the date range covered by the converted timestamps.
# Both values are displayed because ast_node_interactivity is set to "all" in the first cell.
control.timeStamp.min()
control.timeStamp.max()



Timestamp('2019-05-21 20:25:41')

Timestamp('2019-06-27 08:30:05')

In [14]:
# Put the rows into chronological order
control = control.sort_values(by='timeStamp')

In [16]:
# Renumber the rows 0..n-1 and discard the previous index
control = control.reset_index(drop=True)


In [17]:
# Split the timestamp into its calendar/clock components (one column each)
# for convenient plotting & exploration
control = control.assign(
    year=control['timeStamp'].dt.year,
    month=control['timeStamp'].dt.month,
    day=control['timeStamp'].dt.day,
    hour=control['timeStamp'].dt.hour,
    minute=control['timeStamp'].dt.minute,
    second=control['timeStamp'].dt.second,
)


In [22]:
# Preview the first rows, including the derived year/month/day/hour/minute/second columns
control.head()

Unnamed: 0,timeStamp,Exp0_OxygenTemp,Exp0_OxygenpercentO2,Exp0_PhValue,Exp1_OxygenTemp,Exp1_OxygenpercentO2,Pressure_LateAccess,year,month,day,hour,minute,second
0,2019-05-21 20:25:41,23.298,21.106,0.0,22.594,27.541,1005.7,2019,5,21,20,25,41
1,2019-05-21 20:25:51,23.288,21.147,0.0,22.574,27.575,1005.6,2019,5,21,20,25,51
2,2019-05-21 20:26:01,23.283,21.117,0.0,22.55,27.52,1005.7,2019,5,21,20,26,1
3,2019-05-21 20:26:11,23.272,21.144,0.0,22.527,27.569,1005.3,2019,5,21,20,26,11
4,2019-05-21 20:26:21,23.267,21.123,0.0,22.514,27.464,1005.5,2019,5,21,20,26,21


In [21]:
# Rescale the raw sensor columns by a fixed divisor per column:
# oxygen temperature, oxygen percentage and pH are divided by 1000, pressure by 10.
# NOTE: this cell is not idempotent -- running it twice divides the values twice.
scale_divisors = {
    'Exp0_OxygenTemp': 1000,
    'Exp1_OxygenTemp': 1000,
    'Exp0_OxygenpercentO2': 1000,
    'Exp1_OxygenpercentO2': 1000,
    'Exp0_PhValue': 1000,
    'Pressure_LateAccess': 10,
}
for column, divisor in scale_divisors.items():
    # Plain column assignment replaces the column, so the float result is stored
    # as-is even if the raw column held integers.
    control[column] = control[column] / divisor



In [None]:
# Save the cleaned control data to disk.
# to_pickle / to_csv do not create missing folders, so make sure the target exists first.
os.makedirs('../new_data/cleaned', exist_ok=True)
# In Pickle format for fast read in
control.to_pickle('../new_data/cleaned/control_clean.pkl')
# In CSV (the DataFrame index is written as the first, unnamed column)
control.to_csv('../new_data/cleaned/control_clean.csv')
