In [None]:
#############################################################################
####################Import all packages######################################
#############################################################################
import os
import random
import numpy as np
#import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
#import pandas_profiling as pp
import pandas as pd
# Raise pandas' display limits so large frames are shown with far less truncation
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# Turn off pandas' chained-assignment warning (the pandas default is 'warn')
pd.options.mode.chained_assignment = None
#############################################################################
# Have Jupyter display the result of EVERY expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
def read_all_csvs_in_folder(path):
    ''' Read every usable CSV file in a folder and append them into one DataFrame.

        1. Lists all ``*.csv`` files directly inside ``path`` (no recursion).
        2. Skips files that are 0 bytes, files pandas rejects with ``EmptyDataError``
           (no headers at all), and files that have headers but no data rows.
        3. Appends the remaining files row-wise into one frame with a fresh 0..n-1 index.
        4. Prints a progress log plus a summary of how many files were found, appended
           and skipped (with the names of the skipped files).

        Files are appended in the order ``glob`` returns them, so the integer index of
        the result depends on that order.

        Parameters
        ----------
        path : str or os.PathLike
            Folder to scan. A trailing slash is allowed.

        Returns
        -------
        pandas.DataFrame
            All usable files appended together. If no file contained any data
            (or the folder holds no CSV files), an empty DataFrame is returned
            instead of letting ``pd.concat`` raise on an empty list.
    '''
    import pandas as pd
    import glob
    import os

    def _report_skip(skipped_file, reason):
        # Print the banner explaining why one file was left out.
        print('')
        print('------------------------------------------------------')
        print('!!! SKIPPING FILE: ' + skipped_file)
        print('    REASON: ' + reason)
        print('------------------------------------------------------')
        print('')

    # os.path.join avoids the doubled separator that plain string concatenation
    # produces when `path` already ends with a slash.
    all_files = glob.glob(os.path.join(path, "*.csv"))

    # Frames to append, plus one list of file names per skip reason.
    list_with_dfs = []
    zero_files = []
    empty_files = []
    empty_row_files = []

    print('')
    print('')
    print('#####################################################')
    print('STARTING READ IN')
    print('#####################################################')
    print('')

    for filename in all_files:
        print('Reading current file:')
        print(filename)

        # A 0-byte file has no data for sure and would break during read in.
        if os.path.getsize(filename) == 0:
            zero_files.append(filename)
            _report_skip(filename, 'FILE SIZE == 0')
            continue

        try:
            df = pd.read_csv(filename, index_col=None, header=0)
        except pd.errors.EmptyDataError:
            _report_skip(filename, 'FILE ENTIRELY EMPTY WITHOUT HEADERS')
            empty_files.append(filename)
            continue

        # df.empty is also True when the file HAS headers but no row data.
        if df.empty:
            empty_row_files.append(filename)
            _report_skip(filename, 'FILE CONTAINS NO ROW DATA')
            continue

        list_with_dfs.append(df)

    # Merge all usable files; pd.concat raises ValueError on an empty list,
    # so fall back to an empty frame when nothing could be read.
    if list_with_dfs:
        frame = pd.concat(list_with_dfs, axis=0, ignore_index=True)
    else:
        frame = pd.DataFrame()

    print('')
    print('#####################################################')
    print('!!! DONE READING & MERGING DATA !!!')
    print('#####################################################')
    print('')
    print('------------------------------------------------------')
    print('All files: ' + str(len(all_files)))
    print('')
    print('Successfully appended files: ' + str(len(list_with_dfs)))
    print('------------------------------------------------------')
    print('------------------------------------------------------')
    print('')
    print('Empty files without headers: ' + str(len(empty_files)))
    print(empty_files)
    print('')
    print('Zero size files: ' + str(len(zero_files)))
    print(zero_files)
    print('')
    print('Empty row files: ' + str(len(empty_row_files)))
    print(empty_row_files)
    print('')
    print('#####################################################')

    return frame

In [4]:
# Read in files and merge them
# Folder holding the raw post-flight CSV files. The default is the directory used in
# the original run; set the POSTFLIGHT2_DIR environment variable to read from another
# location without editing this cell.
path_postflight2 = os.environ.get(
    'POSTFLIGHT2_DIR',
    '/home/taco/Documents/greta_forarex/new_data/postflight_control_FINAL/Postflightdata_2/')

# PostFlight: append every usable CSV in that folder into one DataFrame
postflight_2 = read_all_csvs_in_folder(path_postflight2)




In [9]:
## Restrict the frame to the columns listed below; every other column is discarded
cols_to_keep = [
    'timeStamp',
    'Exp0_OxygenTemp', 'Exp0_OxygenpercentO2', 'Exp0_PhValue',
    'Exp1_OxygenTemp', 'Exp1_OxygenpercentO2',
    'Pressure_LateAccess',
]

postflight_2 = postflight_2.loc[:, cols_to_keep]

In [None]:
# Remove every row that has at least one missing value (NA).
# The original author judged the amount of NAs negligible, so they are simply dropped.
postflight_2 = postflight_2.dropna()

In [34]:
# Parse the recorded epoch seconds into naive datetimes. No timezone shift is applied
# here: the values stay exactly as recorded (apparently UTC - TODO confirm), so Swedish
# local time (UTC+1) would be one hour later than what this column shows.
# NOTE(review): this cell's execution count (34) is later than that of the
# "Drop corrupted row" cell (33), so in the original session that row was dropped
# before this conversion ran - confirm the intended order before a Run All.
postflight_2['timeStamp'] = pd.to_datetime(postflight_2['timeStamp'].astype('int'), unit='s')


In [33]:
# Drop corrupted row without any timestamp value.
# NOTE(review): the label refers to the index produced when the CSV files were
# appended, so it presumably only identifies the corrupted row for the file order of
# the original run - verify when the input files change.
corrupted_row_label = 251267
# Guard on membership so the cell can be re-run: an unconditional drop raises
# KeyError once the label has already been removed.
if corrupted_row_label in postflight_2.index:
    postflight_2 = postflight_2.drop(index=corrupted_row_label)
else:
    print('Row ' + str(corrupted_row_label) + ' not found in index; nothing dropped')

(353579, 7)

In [36]:
# Show the earliest and the latest timestamp (both expressions are displayed)
postflight_2['timeStamp'].min()
postflight_2['timeStamp'].max()


Timestamp('2019-03-15 09:49:07')

Timestamp('2019-04-11 13:01:16')

In [47]:
# Order the rows by timestamp and renumber the index 0..n-1 (old index discarded)
postflight_2 = postflight_2.sort_values('timeStamp').reset_index(drop=True)

In [40]:
# Preview the first ten rows
postflight_2.head(n=10)


Unnamed: 0,timeStamp,Exp0_OxygenTemp,Exp0_OxygenpercentO2,Exp0_PhValue,Exp1_OxygenTemp,Exp1_OxygenpercentO2,Pressure_LateAccess
158896,2019-03-15 09:49:07,23667.0,19830.0,7078.0,21927.0,1146.0,9466.0
248134,2019-03-15 09:49:17,23672.0,15297.0,7078.0,21927.0,811.0,9466.0
231474,2019-03-15 09:49:27,23683.0,15297.0,7077.0,21929.0,804.0,9464.0
119882,2019-03-15 09:49:37,23685.0,15296.0,7076.0,21932.0,808.0,9463.0
44927,2019-03-15 09:49:47,23685.0,15297.0,7076.0,21937.0,809.0,9466.0
294049,2019-03-15 10:07:52,0.0,0.0,0.0,0.0,0.0,9464.0
324358,2019-03-15 10:07:57,23872.0,19872.0,0.0,22150.0,4666.0,9464.0
324359,2019-03-15 10:08:02,23872.0,15292.0,7034.0,22150.0,3601.0,9464.0
102907,2019-03-15 10:08:07,23872.0,15289.0,7037.0,22155.0,3616.0,9465.0
102908,2019-03-15 10:08:12,23875.0,15297.0,7039.0,22158.0,3654.0,9464.0


In [42]:
# Split the timestamp into separate calendar/clock columns for convenient
# plotting & exploration
# Postflight
for time_part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    # Each column takes the same-named component of the datetime accessor
    postflight_2.loc[:, time_part] = getattr(postflight_2.timeStamp.dt, time_part)


In [44]:
# Scale columns values
# Postflight
# Temperature, O2 and pH readings are divided by 1000; pressure is divided by 10.
# NOTE: this cell overwrites the columns, so running it twice divides the values twice.
scale_divisors = {
    'Exp0_OxygenTemp': 1000,
    'Exp1_OxygenTemp': 1000,
    'Exp0_OxygenpercentO2': 1000,
    'Exp1_OxygenpercentO2': 1000,
    'Exp0_PhValue': 1000,
    'Pressure_LateAccess': 10,
}
for column_name, divisor in scale_divisors.items():
    postflight_2.loc[:, column_name] = postflight_2[column_name] / divisor


In [49]:
# Re-check the timestamp range and preview the first ten rows of the frame
postflight_2['timeStamp'].min()
postflight_2['timeStamp'].max()
postflight_2.head(n=10)

Timestamp('2019-03-15 09:49:07')

Timestamp('2019-04-11 13:01:16')

Unnamed: 0,timeStamp,Exp0_OxygenTemp,Exp0_OxygenpercentO2,Exp0_PhValue,Exp1_OxygenTemp,Exp1_OxygenpercentO2,Pressure_LateAccess,year,month,day,hour,minute,second
0,2019-03-15 09:49:07,23.667,19.83,7.078,21.927,1.146,946.6,2019,3,15,9,49,7
1,2019-03-15 09:49:17,23.672,15.297,7.078,21.927,0.811,946.6,2019,3,15,9,49,17
2,2019-03-15 09:49:27,23.683,15.297,7.077,21.929,0.804,946.4,2019,3,15,9,49,27
3,2019-03-15 09:49:37,23.685,15.296,7.076,21.932,0.808,946.3,2019,3,15,9,49,37
4,2019-03-15 09:49:47,23.685,15.297,7.076,21.937,0.809,946.6,2019,3,15,9,49,47
5,2019-03-15 10:07:52,0.0,0.0,0.0,0.0,0.0,946.4,2019,3,15,10,7,52
6,2019-03-15 10:07:57,23.872,19.872,0.0,22.15,4.666,946.4,2019,3,15,10,7,57
7,2019-03-15 10:08:02,23.872,15.292,7.034,22.15,3.601,946.4,2019,3,15,10,8,2
8,2019-03-15 10:08:07,23.872,15.289,7.037,22.155,3.616,946.5,2019,3,15,10,8,7
9,2019-03-15 10:08:12,23.875,15.297,7.039,22.158,3.654,946.4,2019,3,15,10,8,12


In [None]:
# Save the cleaned post-flight frame to disk
output_dir = '../new_data/cleaned'
# Create the target folder first so the writes below do not fail when it is missing
os.makedirs(output_dir, exist_ok=True)

# In Pickle format for fast read in
postflight_2.to_pickle(output_dir + '/postflight_2_clean.pkl')

# CSV copy of the same frame (the index is written as the first, unnamed column)
postflight_2.to_csv(output_dir + '/postflight_2_clean.csv')
