# This notebook imports the motion data collected on board the R/V Akademik Tryoshnikov during the Antarctic Circumnavigation Expedition (ACE). 

Import required packages

In [1]:
import csv
import os
import pandas
import datetime
import time

  return f(*args, **kwds)
  return f(*args, **kwds)


Set up pandas display

In [2]:
# Raise the pandas display limits so that wide/long previews of the motion
# data are not truncated when shown in the notebook.
pandas.set_option('display.max_columns', 100)
pandas.set_option('display.max_rows', 20000)

Set up the hard-coded variables to import the data.

In [3]:
# Folder holding the raw motion data files (*.txt) to import.
input_data_folder = "/media/jen/SAMSUNG/motion_data/"
# Folder that receives the daily csv files written by output_daily_files.
output_data_folder = "/home/jen/projects/ace_data_management/wip/motion_data/"

### List data files

Get the set of raw motion data files in a list.

In [4]:
def get_input_txt_files(input_data_folder):
    """Return the full paths of the raw motion data files (``*.txt``) in a folder.

    The folder is listed directly instead of changing into it first, so the
    working directory of the calling process is left untouched and relative
    folder paths are listed correctly.

    Args:
        input_data_folder: Path of the folder holding the raw motion data files.

    Returns:
        A list with the full path of every file whose name ends in ``.txt``,
        sorted by file name so that the processing order is the same on
        every run.
    """
    # Resolve the folder once so that the returned paths are absolute.
    directory_path = os.path.realpath(input_data_folder)

    return [
        os.path.join(directory_path, filename)
        for filename in sorted(os.listdir(directory_path))
        if filename.endswith(".txt")
    ]

### Check file headers

Check that all the files have the same header so that when creating the pandas data frame, the header can always be excluded. 

The header consists of 5 lines - an example below: 

Serial n° Hydrins :     PH-497

Created the :   07/01/2017              Time :  11h 10m 39.822s

Sample period : 1000 ms

Pc - HH:MM:SS.SSS       Hydrins - HH:MM:SS.SSS  Heading (°)     Roll (°)        Pitch (°)       Heading std. dev. (°)   Roll std. dev. (°)      Pitch std. dev. (°)     North speed (m/s)       East speed (m/s)        Vert. speed (m/s)       Speed norm (knots)      North speed std. dev. (m/s)     East speed std. dev. (m/s)      Vert. speed std. dev. (m/s)     Latitude (°)    Longitude (°)   Altitude (m)    Latitude std. dev. (m)  Longitude std. dev. (m) Altitude std. dev. (m)  Zone I  Zone C  UTM North (m)   UTM East (m)    UTM altitude  (m)       High level status       System status 1 System status 2 Algo status 1   Algo status 2   GPS - Latitude (°)      GPS - Longitude (°)     GPS - Altitude (m)      GPS - Mode      GPS - Time      Manual GPS - Latitude (°)       Manual GPS - Longitude (°)      Manual GPS - Altitude (m)       Manual GPS - Latitude std. dev. Manual GPS - Longitude std. dev.        Manual GPS - Altitude std. dev.

Note that we will need the date from the second line in the header and the column header will be checked and set as the column headers in the data frame. The length of the header in the files is not the same as the number of data columns so this also needs to be checked. 

The expected header is defined below so that it can be compared with the column header line of each file (the first four lines of each file are ignored in this check).

In [5]:
# Column header row expected on the fifth line of every raw file, as produced by
# csv.reader with a tab delimiter (this is what check_file_header compares against).
# NOTE(review): the trailing '' is part of the expected row - presumably the header
# line in the raw files ends with a tab; confirm against a raw file.
expected_header = ['Pc - HH:MM:SS.SSS', 'Hydrins - HH:MM:SS.SSS', 'Heading (°)', 'Roll (°)', 'Pitch (°)', 'Heading std. dev. (°)', 'Roll std. dev. (°)', 'Pitch std. dev. (°)', 'North speed (m/s)', 'East speed (m/s)', 'Vert. speed (m/s)', 'Speed norm (knots)', 'North speed std. dev. (m/s)', 'East speed std. dev. (m/s)', 'Vert. speed std. dev. (m/s)', 'Latitude (°)', 'Longitude (°)', 'Altitude (m)', 'Latitude std. dev. (m)', 'Longitude std. dev. (m)', 'Altitude std. dev. (m)', 'Zone I', 'Zone C', 'UTM North (m)', 'UTM East (m)', 'UTM altitude  (m)', 'High level status', 'System status 1', 'System status 2', 'Algo status 1', 'Algo status 2', 'GPS - Latitude (°)', 'GPS - Longitude (°)', 'GPS - Altitude (m)', 'GPS - Mode', 'GPS - Time', 'Manual GPS - Latitude (°)', 'Manual GPS - Longitude (°)', 'Manual GPS - Altitude (m)', 'Manual GPS - Latitude std. dev.', 'Manual GPS - Longitude std. dev.', 'Manual GPS - Altitude std. dev.', '']

In [6]:
def check_file_header(list_data_files, expected=None):
    """Check that the column header of every file is as expected.

    The column header is the fifth line of a file, split on tabs. Reading
    stops as soon as that line has been seen, so the data rows are never
    scanned. A file with fewer than five lines has no column header and is
    counted as incorrect, so that the totals always add up to the number of
    files checked.

    Progress, every mismatch (file name plus the header found) and the final
    counts are printed.

    Args:
        list_data_files: Paths of the files to check.
        expected: The header row to compare against, as a list of strings.
            Defaults to the notebook-level ``expected_header``.

    Returns:
        None.
    """
    if expected is None:
        # Fall back to the header defined at notebook level.
        expected = expected_header

    correct_headers = 0
    incorrect_headers = 0
    total_number_files = len(list_data_files)

    for file_number, file in enumerate(list_data_files, start=1):
        print("Checking the header of file", file_number, "out of", total_number_files)

        header_line = None
        # ISO-8859-1 is needed because the header contains degree characters.
        with open(file, 'r', encoding="ISO-8859-1") as csvfile:
            contents = csv.reader(csvfile, delimiter='\t')

            for line_number, line in enumerate(contents):
                if line_number == 4:
                    header_line = line
                    break  # nothing after the column header needs checking

        if header_line is None:
            print("Missing header (fewer than 5 lines): ", file)
            incorrect_headers += 1
        elif header_line != expected:
            print("Wrong header: ", file, "  ", header_line)
            incorrect_headers += 1
        else:
            correct_headers += 1

    print("Correct headers: ", correct_headers)
    print("Incorrect headers: ", incorrect_headers)
    print("Total number of files: ", total_number_files)

### Get the date of the data collection

Read header of file to extract the date. This is important because otherwise, when the data are read into a data frame, they will not have a date associated with them.

Note that the line within the header that contains the date is, for example: 

Created the : 07/01/2017 Time : 11h 10m 39.822s

However, not all of the data in a file are necessarily collected on that date. The sample period is 1000 ms and a full file holds 3600 rows (about one hour of data), so if a file begins shortly before midnight it will end with data from the following day. This needs to be taken into account when assigning the date to the data.

So far only the PC date has been added as a separate field (pc_date_utc). It is possible that the other time fields (Hydrins and GPS) have a different date associated with them.

In [7]:
def read_motion_file_date(filename):
    """Return the creation date stored in the header of a raw motion data file.

    The date is the second tab-separated field on the second line of the
    file and is written as dd/mm/yyyy.

    Args:
        filename: Path of the raw motion data file.

    Returns:
        A ``datetime.datetime`` for the date in the header (time set to midnight).
    """
    # ISO-8859-1 is required because the file contains degree characters.
    with open(filename, 'r', encoding="ISO-8859-1") as handle:
        handle.readline()  # the first line is not needed
        created_line = handle.readline()

    header_fields = created_line.split("\t")
    return datetime.datetime.strptime(header_fields[1], "%d/%m/%Y")
    
def insert_date_of_data(list_data_files, rows_of_data):
    """Read the data rows of every file and prefix each row with its date.

    Each file starts on the date given in its header. Whenever the time in
    the first column of a row is earlier than the time of the previous row,
    the date is moved on by one day before being written to the row.

    Args:
        list_data_files: Paths of the raw motion data files to read.
        rows_of_data: List that the dated rows are appended to (modified in place).

    Returns:
        The same ``rows_of_data`` list, with one entry per data row read; the
        date (yyyy-mm-dd) is the first element of every row.
    """
    file_total = len(list_data_files)
    expected_rows = 0

    for file_index, path in enumerate(list_data_files, start=1):
        print("Processing file", file_index, " of ", file_total)

        start_date = read_motion_file_date(path)

        with open(path, 'r', encoding="ISO-8859-1") as data_file:
            reader = csv.reader(data_file, delimiter='\t')

            # The first five lines are the file header, not data.
            for _ in range(5):
                next(reader)

            rows_in_file = 0
            last_time = None
            for record in reader:
                stamp = time.strptime(record[0], "%H:%M:%S.%f")

                if last_time is None:
                    # First data row of the file: use the date from the header.
                    row_date = start_date
                elif stamp < last_time:
                    # The clock went backwards, so the rows have passed midnight.
                    row_date = row_date + datetime.timedelta(days=1)

                record.insert(0, row_date.strftime("%Y-%m-%d"))
                rows_of_data.append(record)

                last_time = stamp
                rows_in_file += 1

            print(path, "contains a total number of rows:", rows_in_file)

        expected_rows += rows_in_file

    print("\nTotal number of rows expected in dataframe:", expected_rows)

    return rows_of_data

### Define the column headers

Get the header for the columns so that this can be assigned to the data frame. 

Currently this is listed in a csv file so that it can be changed and reimported as necessary. 

The data contains more columns than are listed in the header in the data files, so we need to ensure that these are assigned correctly in the pandas data frame. 

In [8]:
def define_column_headers(header_file):
    """Read the column headers for a pandas data frame from a csv file.

    Args:
        header_file: Path of a csv file with one column header per row; only
            the first field of each row is used.

    Returns:
        A list of the column headers, in file order.
    """
    with open(header_file) as headerfile:
        return [row[0] for row in csv.reader(headerfile)]

In [9]:
# csv file listing the data frame column names (first field of each row).
header_file = "/home/jen/projects/ace_data_management/wip/motion_data/file_header.csv"

# Column names used when the data frame is created from the raw data rows.
header = define_column_headers(header_file)

### Create data frame and import data

Import all of the data into a data frame using pandas, which has the column headers as defined from the csv file of column headers.

The date needs to be extracted from the file header depending on the time of the data point, and inserted as an additional column in the data frame. 

Data will be put into a list first, so that the date can be added, then this list of data will be added to the pandas data frame.

In [10]:
def data_to_dataframe(rows_of_data, dataframe, header):
    """Add rows of data to the end of a data frame.

    Args:
        rows_of_data: List of rows, each a list with one value per column.
        dataframe: Existing pandas data frame that the rows are added to.
        header: Column names for ``rows_of_data``.

    Returns:
        A new data frame holding the rows of ``dataframe`` followed by
        ``rows_of_data``, with a fresh 0..n-1 index. ``dataframe`` itself is
        not modified.
    """
    new_rows = pandas.DataFrame(rows_of_data, columns=header)

    # DataFrame.append was removed in pandas 2.0; concat is the supported
    # equivalent and behaves the same on older versions.
    return pandas.concat([dataframe, new_rows], ignore_index=True)

### Output the data into csv files with one file per date.

In [11]:
def output_daily_files(dataframe, output_data_folder):
    """Write the data frame to csv files, one file per date.

    Rows are grouped on the ``pc_date_utc`` column (dates as yyyy-mm-dd
    strings) and each group is written to ``ace_hydrins_yyyymmdd.csv`` in the
    output folder, with a header row and without the index. The row count per
    date and the overall total are printed before the files are written.

    Args:
        dataframe: Data frame with a ``pc_date_utc`` column.
        output_data_folder: Existing folder to write the files to, with or
            without a trailing path separator.

    Returns:
        None.
    """
    output_filename_base = 'ace_hydrins_'

    date_group = dataframe.groupby('pc_date_utc')
    rows_per_date = date_group.size()  # computed once, used for both reports

    print("Aggregated groups by date with counts:")
    print(rows_per_date)
    print("\nTotal number of records:")
    print(rows_per_date.sum())

    for date, daily_rows in date_group:
        date_formatted = datetime.datetime.strptime(date, "%Y-%m-%d")
        date_string = date_formatted.strftime('%Y%m%d')

        # os.path.join keeps the file inside the folder even when the folder
        # path was given without a trailing separator.
        output_filename = os.path.join(
            output_data_folder, output_filename_base + date_string + ".csv"
        )

        daily_rows.to_csv(output_filename, sep=",", header=True, index=False)
        print(date, "file created") # TODO put a better check here that the file exists

### Set up a test list of files and test the import. 

In [None]:
#test_list_data_files = ['/media/jen/SAMSUNG/motion_data/ACE_Bremen-2_300.txt', '/media/jen/SAMSUNG/motion_data/ACE_Bremen-2_301.txt', '/media/jen/SAMSUNG/motion_data/ACE_Bremen-2_302.txt', '/media/jen/SAMSUNG/motion_data/ACE_Bremen-2_303.txt', '/media/jen/SAMSUNG/motion_data/ACE_Bremen-2_304.txt', '/media/jen/SAMSUNG/motion_data/ACE_Bremen-2_305.txt', '/media/jen/SAMSUNG/motion_data/ACE_Bremen-2_306.txt', '/media/jen/SAMSUNG/motion_data/ACE_Bremen-2_307.txt', '/media/jen/SAMSUNG/motion_data/ACE_Bremen-2_308.txt']

In [None]:
#print("Checking headers of files")
#print("\n")

#check_file_header(test_list_data_files)

#print("Reading files and creating data frame")

#motiondf = pandas.DataFrame(columns = header)

#rows_of_data = list()    

#rows_of_data = insert_date_of_data(test_list_data_files, rows_of_data)

#motiondf = data_to_dataframe(rows_of_data, motiondf, header)

#print("The dataframe has", len(motiondf), "rows")

Preview the dataframe

In [None]:
#motiondf.iloc[:5]

Test outputting the files (one per day)

In [None]:
#output_daily_files(motiondf, output_data_folder)

## Get the list of data files and import them into the dataframe. 

In [12]:
# Collect the full paths of the raw .txt files and report how many were found.
list_motion_data_files = get_input_txt_files(input_data_folder)
print(len(list_motion_data_files))

2819


In [13]:
# Compare the column header of every raw file with the expected header before
# any data are imported.
print("Checking headers of files")
print("\n")

check_file_header(list_motion_data_files)

Checking headers of files


Checking the header of file 1 out of 2819
Checking the header of file 2 out of 2819
Checking the header of file 3 out of 2819
Checking the header of file 4 out of 2819
Checking the header of file 5 out of 2819
Checking the header of file 6 out of 2819
Checking the header of file 7 out of 2819
Checking the header of file 8 out of 2819
Checking the header of file 9 out of 2819
Checking the header of file 10 out of 2819
Checking the header of file 11 out of 2819
Checking the header of file 12 out of 2819
Checking the header of file 13 out of 2819
Checking the header of file 14 out of 2819
Checking the header of file 15 out of 2819
Checking the header of file 16 out of 2819
Checking the header of file 17 out of 2819
Checking the header of file 18 out of 2819
Checking the header of file 19 out of 2819
Checking the header of file 20 out of 2819
Checking the header of file 21 out of 2819
Checking the header of file 22 out of 2819
Checking the header of file 23 out 

In [None]:
# Full import: read every raw file, prefix each row with its date, then load
# all the rows into a data frame with the column names from the header file.
print("Reading files and creating data frame")
motiondf = pandas.DataFrame(columns = header)

rows_of_data = list()    

# Rows are collected in a plain list first so the date can be added to each row.
rows_of_data = insert_date_of_data(list_motion_data_files, rows_of_data)

motiondf = data_to_dataframe(rows_of_data, motiondf, header)

print("The dataframe has", len(motiondf), "rows\n")


Reading files and creating data frame
Processing file 1  of  2819
/media/jen/SAMSUNG/motion_data/ACE-1_0.txt contains a total number of rows: 2877
Processing file 2  of  2819
/media/jen/SAMSUNG/motion_data/ACE-2_0.txt contains a total number of rows: 3600
Processing file 3  of  2819
/media/jen/SAMSUNG/motion_data/ACE-2_1.txt contains a total number of rows: 3600
Processing file 4  of  2819
/media/jen/SAMSUNG/motion_data/ACE-2_10.txt contains a total number of rows: 3600
Processing file 5  of  2819
/media/jen/SAMSUNG/motion_data/ACE-2_11.txt contains a total number of rows: 3600
Processing file 6  of  2819
/media/jen/SAMSUNG/motion_data/ACE-2_12.txt contains a total number of rows: 3600
Processing file 7  of  2819
/media/jen/SAMSUNG/motion_data/ACE-2_13.txt contains a total number of rows: 3600
Processing file 8  of  2819
/media/jen/SAMSUNG/motion_data/ACE-2_14.txt contains a total number of rows: 3600
Processing file 9  of  2819
/media/jen/SAMSUNG/motion_data/ACE-2_15.txt contains a to

In [None]:
# Preview the first five rows of the imported data.
motiondf.iloc[:5]

In [None]:
# Write the imported data to csv files, one file per date.
output_daily_files(motiondf, output_data_folder)

### Check that the output files have the same number of data rows as the dataframe had per day.

In [None]:
# Folder searched for the daily output files (ace_hydrins_*) to be checked.
data_folder = "/home/jen/projects/ace_data_management/wip/motion_data"

Get a list of the output files to check (named ace_hydrins_yyyymmdd.csv)

In [None]:
def get_input_files(input_data_folder):
    """Return the full paths of the daily output files in a folder.

    A file is included when its name starts with ``ace_hydrins_``. The folder
    is listed directly instead of changing into it first, so the working
    directory of the calling process is left untouched and relative folder
    paths are listed correctly.

    Args:
        input_data_folder: Path of the folder holding the daily output files.

    Returns:
        A list with the full path of every matching file, sorted by file
        name (and therefore by date) so the order is the same on every run.
    """
    # Resolve the folder once so that the returned paths are absolute.
    directory_path = os.path.realpath(input_data_folder)

    return [
        os.path.join(directory_path, filename)
        for filename in sorted(os.listdir(directory_path))
        if filename.startswith("ace_hydrins_")
    ]

Count the number of rows in each file, ignoring the header and output with the date and number of rows

In [None]:
def check_rows_in_file(list_data_files):
    """Print the number of data rows in each csv file and the overall total.

    The first row of every file is treated as the header and is not counted.
    For each file the date part of its name (the text between the last
    underscore and the first following dot) is printed with the row count.

    Args:
        list_data_files: Paths of the csv files to count.

    Returns:
        None.
    """
    grand_total = 0

    for filepath in list_data_files:
        # e.g. ace_hydrins_20170107.csv -> 20170107
        name_tail = os.path.basename(filepath).rsplit('_', 1)[-1]
        filedate = name_tail.partition('.')[0]

        with open(filepath, 'r') as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # skip the header row

            rows_in_file = sum(1 for _ in reader)

            print(filedate, " ", rows_in_file)

        grand_total += rows_in_file

    print("Total number of rows in files: ", grand_total)

In [None]:
# Count the data rows in every daily output file so the totals can be compared
# with the per-date counts printed when the files were written.
list_data_files_to_check = get_input_files(data_folder)

check_rows_in_file(list_data_files_to_check)