# Ocean Climate Project
## Extracting Headers
Giulia Bronzi  
June 2020

In [302]:
import datetime
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [303]:
#the code you provided (reworked to build the list in Python instead of via `ls`)
# Generate the list of ComfortCove .rpf files.
infiles = 'comfortcove.list'
# glob expands the pattern without spawning a shell, so paths that contain
# spaces stay intact and the cell no longer depends on a Unix `ls` command.
# sorted() keeps the order deterministic from run to run.
rpf_paths = sorted(glob.glob(os.path.expanduser('~/Desktop/ComfortCove/*.rpf')))
# Still write the listing to `infiles`, one path per line, as the `ls >` redirect did.
with open(infiles, 'w') as listing:
    listing.writelines(path + '\n' for path in rpf_paths)
# 1-D array of path strings, also when zero or one file matched.
filelist = np.array(rpf_paths, dtype=str)

In [304]:
#the code you provided
# Read every file in `filelist` into its own single-column temperature frame.
dfs = []  # one DataFrame per file, in the same order as filelist
for fname in filelist:
    # header=15: row 15 (0-based) supplies the column names and everything above
    # it is skipped. Columns 0 and 1 are combined into one parsed 'datetime' column.
    # The separator is a raw string so that `\s` is not treated as a (invalid)
    # Python escape sequence.
    # NOTE(review): dict-form parse_dates is deprecated in recent pandas releases;
    # switch to pd.to_datetime on the two columns once the file layout is confirmed.
    df = pd.read_csv(fname, sep=r'\s+', parse_dates={'datetime': [0, 1]}, header=15)
    df = df.set_index('datetime')
    df.columns = ['temperature']
    # 9999.99 entries are turned into NaN. np.nan is used because the np.NaN
    # alias no longer exists in NumPy 2.x.
    df = df.replace(9999.99, np.nan)
    dfs.append(df)

In [305]:
#the code you provided
# Stack the per-file frames row-wise, then order the rows by their datetime index.
# NOTE: duplicated rows/timestamps across files are NOT handled here -- this
# still needs to be checked.
df_all = pd.concat(dfs, axis=0).sort_index()

### Extracting data from file headers

In [306]:
dfsHeader = []

# Builds a list with one DataFrame per file; each frame holds that file's
# header entries as two columns, "Title" and "Value".
for fname in filelist:
    # The first line of the file is used as the column name (expected to be
    # "HEADER"); the following 14 lines are read as the header entries.
    rawHeader = pd.read_csv(fname, nrows=14)
    # Split each entry on the FIRST "=" only (n=1), so a value that itself
    # contains "=" is kept whole instead of being cut at its second "=".
    headerValue = rawHeader["HEADER"].str.split("=", n=1, expand=True)
    df = pd.DataFrame({"Title": headerValue[0], "Value": headerValue[1]})
    dfsHeader.append(df)

In [307]:
dfsHeader[:3] # preview: the parsed header frames of the first three files

[                   Title             Value
 0                STATION               118
 1              SITE_NAME      COMFORT COVE
 2             START_DATE        1994-08-10
 3             START_TIME          17:00:00
 4               END_DATE        1994-12-01
 5               END_TIME          17:00:00
 6               LATITUDE             49.41
 7              LONGITUDE             54.83
 8              INST_TYPE            Hugrun
 9          SERIAL_NUMBER            008972
 10           WATER_DEPTH               009
 11            INST_DEPTH               009
 12     SAMPLING_INTERVAL               060
 13             FILE_NAME  1189405_utf8.PRO,
                    Title             Value
 0                STATION               118
 1              SITE_NAME      COMFORT COVE
 2             START_DATE        2008-05-22
 3             START_TIME          22:00:00
 4               END_DATE        2008-11-02
 5               END_TIME          13:00:00
 6               LATITUDE      

In [308]:
# One list per header field; entry i of every list comes from dfsHeader[i].
station, siteName, startDate, startTime = [], [], [], []
endDate, endTime, latitude, longitude = [], [], [], []
instType, serialNumber, waterDepth = [], [], []
instDepth, samplingInterval, fileName = [], [], []

# The lists in the order of the header rows they are filled from (rows 0-13).
fieldLists = (
    station, siteName, startDate, startTime, endDate, endTime,
    latitude, longitude, instType, serialNumber, waterDepth,
    instDepth, samplingInterval, fileName,
)

count = 0

# For each file's header frame, copy the "Value" found in row k into the
# k-th list above. `count` ends up as the number of headers processed.
for file in dfsHeader:
    values = file["Value"]
    for row, target in enumerate(fieldLists):
        target.append(values[row])
    count += 1

In [309]:
# Assemble the per-field lists into a single DataFrame: one column per header
# field, in the order listed below.
columnNames = [
    'Stations', 'Site Name', 'Start Date', 'Start Time', 'End Date',
    'End Time', 'Latitude', 'Longitude', 'Inst Type', 'Serial Number',
    'Water Depth', 'Inst Depth', 'Sampling Interval', 'File Name',
]
columnData = [
    station, siteName, startDate, startTime, endDate,
    endTime, latitude, longitude, instType, serialNumber,
    waterDepth, instDepth, samplingInterval, fileName,
]
# Pair each column label with its list; insertion order gives the column order.
headers = dict(zip(columnNames, columnData))

headersdf = pd.DataFrame(headers)

In [310]:
# Display the combined header table built from the per-field lists:
#   - each column is one component of the file header
#   - each row is one individual input file
headersdf

Unnamed: 0,Stations,Site Name,Start Date,Start Time,End Date,End Time,Latitude,Longitude,Inst Type,Serial Number,Water Depth,Inst Depth,Sampling Interval,File Name
0,118.0,COMFORT COVE,1994-08-10,17:00:00,1994-12-01,17:00:00,49.41,54.83,Hugrun,008972,9.0,9.0,060,1189405_utf8.PRO
1,118.0,COMFORT COVE,2008-05-22,22:00:00,2008-11-02,13:00:00,49.41,54.83,Hugrun,007934,9.0,9.0,060,1180801_utf8.PRO
2,118.0,COMFORT COVE NOTRE DAME BAY,1998-05-20,15:00:00,1998-09-09,13:00:00,49.41,54.83,Hugrun,008955,9.0,9.0,060,1189802_utf8.PRO
3,118.0,COMFORT COVE NOTRE DAME BAY,1995-11-23,16:00:00,1996-04-24,14:00:00,49.41,54.83,Hugrun,007939,9.0,9.0,060,1189601_utf8.PRO
4,118.0,,1992-11-25,15:00:00.00,1993-05-06,14:00:00.00,49.41,54.83,SEAMONUTR-B,007818,9.0,9.0,3600.0000,MTR_1992_118_007818_009_utf8.pipe
5,118.0,,1989-09-26,13:00:00.00,1989-11-25,14:00:00.00,49.41,54.83,SEAMONUTR-B,004361,9.0,9.0,3600.0000,MTR_1989_118_004361_009_utf8.pipe
6,,Comfort Cove,2005-05-18,12:00:00,2005-11-21,13:28:39,,,Minilog-T,2186,,,01:00:00,Asc2186_utf8.006
7,118.0,COMFORT COVE NOTRE DAME BAY,1996-11-29,16:00:00,1997-05-08,14:00:00,49.41,54.83,Hugrun,005475,9.0,9.0,060,1189702_utf8.PRO
8,118.0,,1993-08-27,15:00:00.00,1993-12-02,15:00:00.00,49.41,54.83,SEAMONUTR-B,008962,9.0,9.0,3600.0000,MTR_1993_118_008962_009_utf8.pipe
9,118.0,COMFORT COVE NOTRE DAME BAY,1995-06-01,15:00:00,1995-09-21,13:00:00,49.41,54.83,Hugrun,008967,9.0,9.0,060,1189504_utf8.PRO
