<h1>Surfcast.ca</h1>
<h3>A Goodfellow Analytics Creation</h3>
<h5>In partnership with Griffin Global</h5>

In [1]:
# Reset Notebook
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
# Import python packages
from bs4 import BeautifulSoup
from urllib import request
import requests
from urllib.request import urlopen
from datetime import datetime, timedelta
from pytz import timezone
import pytz
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dateutil import parser
from dateutil import tz
import scipy.interpolate
import os
from sqlalchemy import create_engine # database connection

# Matplotlib inline
%matplotlib inline

# Hide ipython notebook warnings
import warnings
warnings.filterwarnings('ignore')

<h1>Step 1</h1>
<h3>Get HTML Database as List of Files</h3>

In [3]:
# File types of interest [wave, wind, surface current, surface temperature]
#extension_list = ['wav', 'wnd', 'cur', 'swt', 'ice', 'o', 'e', 's', 'm', 'h']
extensionList = ['wav', 'wnd','o', 's']

In [4]:
# Define NOAA database class
class NoaaDB:
    
    
    """
    Class: NoaaDB
        - This class converts the NCAST|FCAST FTP html database list into a pandas DataFrame
        
        - The gridded fields filename format is:

          LYYYYDDDHH.N.EXT

          L    = lake letter (s=Superior, m=Michigan, h=Huron, e=Erie, o=Ontario)
          YYYY = year at start of simulation (GMT)
          DDD  = Day Of Year at start of simulation (GMT)
          HH   = hr at start of simulation (GMT)
          N    = Site Number
    """
    
    
    
    
    # Set data URLs
    dataURL      = 'http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridded_fields/'
    ncastDataURL = dataURL + 'NCAST/'
    fcastDataURL = dataURL + 'FCAST/'
    mapDataURL   = dataURL + 'map_files/'
    
    # Set NOAA File attributes
    attributes = {
        'wave':        ['grid_number', 'wave_height', 'wave_direction', 'wave_period'],  
        'wind':        ['grid_number', 'wind_speed', 'wind_direction'],
        'temperature': ['grid_number', 'surface_temperature'],
        'current':     ['grid_number', 'current_speed', 'current_direction'],
        'ice':         ['grid_number', 'ice_concentration', 'ice_thickness', 'ice_speed', 'ice_direction']
    }
    
    
    
    
    def __init__(self, extensionList):
        
        # Set Instance attributes
        self.extensionList = extensionList       # List of file type extensions   
        
        # Current UTC time as GMT
        self.currentDatetimeGMT = datetime.utcnow().replace(tzinfo=tz.gettz('GMT'))
        
        # Database dataframe
        self.DB = pd.DataFrame(index=[], columns=['fileName', 'fileExtension', 
                                                  'fileType', 'lake', 'fileDatetime', 
                                                  'currentDatetime','forecastType', 'fileURL']) 
        
        
        
        
    @staticmethod
    def getFileType(fileExtension):
        
        if fileExtension == 'wav':           # Wave
            return 'WAVES'
        elif fileExtension == 'wnd':         # Wind
            return 'WINDS'
        elif fileExtension == 'cur':         # Surface Current
            return 'SURFACE CURRENTS'
        elif fileExtension == 'swt':         # Surface Temperature
            return 'SURFACE TEMPS'
        elif fileExtension == 'ice':         # Ice Conditions
            return 'ICE PARAMS'
        
        
        
    
    @staticmethod
    def getLake(fileName):
        
        if fileName[0] == 'e':       # Lake Erie
            return 'erie'
        elif fileName[0] == 'h':     # Lake Huron
            return 'huron'
        elif fileName[0] == 'o':     # Lake Ontario
            return 'ontario'
        elif fileName[0] == 's':     # Lake Superior
            return 'superior'
        elif fileName[0] == 'm':     # Lake Michigan
            return 'michigan'  
        
        
        
        
    def getDB(self, DBType):
        
        """
        Get NOAA Database 
        """
        
        # Get DataFrame row count
        rows = self.DB.shape[0]-1
        
        # Set Database URL
        if DBType == 'NCAST':
            url = NoaaDB.ncastDataURL
        elif DBType == 'FCAST':
            url = NoaaDB.fcastDataURL
        
        # Get HTML from database page
        html = requests.get(url)
        
        # Create BeautifulSoup object
        htmlObj = BeautifulSoup(html.content)
        
        # Set database list as dataframe 
        for link in htmlObj.findAll('a', href=True):
            if (
                link.contents[0].split('.')[-1] in self.extensionList and 
                link.contents[0][0] in self.extensionList
               ):
                
                rows += 1  # row count                                                 
                
                fileName      = link.contents[0]                 # file name
                fileExtension = link.contents[0].split('.')[-1]  # file extension
                
                # file datetime as GMT
                fileDatetime = datetime.strptime(link.contents[0].split('.')[0]
                                                 [1:len(link.contents[0].split('.')[0])], "%Y%j%H") 
                fileDatetime = fileDatetime.replace(tzinfo=tz.gettz('GMT'))
                
                # Set file type
                fileType = NoaaDB.getFileType(fileExtension)
                
                # Set great lake
                lake = NoaaDB.getLake(fileName)          
                
                # save to dataframe
                self.DB.loc[rows] = [fileName, fileExtension, fileType, 
                                     lake, fileDatetime, self.currentDatetimeGMT, 
                                     DBType, url]   
                
                
                
                
    def getNewestEntries(self):
        
        """
        Get Newest NCAST and FCAST Files in database and save as DataFrame
        """
        
        # NCAST most recent upload time 
        maxtime = self.DB[(self.DB.forecastType == 'NCAST')]['fileDatetime'].max()  

        # Filter DB by NCAST and maxtime
        self.newestEntries = self.DB[(self.DB.forecastType == 'NCAST') &
                                     (self.DB.fileDatetime == maxtime)].reset_index(drop=True)

        # FCAST most recent upload time 
        maxtime = self.DB[(self.DB.forecastType == 'FCAST')]['fileDatetime'].max()    

        # Filter DB by FCAST and maxtime
        self.newestEntries = self.newestEntries.append(self.DB[(self.DB.forecastType == 'FCAST') & 
                                                               (self.DB.fileDatetime == maxtime)]).reset_index(drop=True)
        
        
        
        
    @staticmethod
    def getHeaderValue(dataString):
        
        """
        Define function to get and update text file header column
        """

        global headerString

        if 'dat' in dataString and dataString != headerString:
            headerString = dataString
            return headerString
        else:
            return headerString
        
        
        
        
    @staticmethod
    def getMapFile(mapString, lake):
        
        """
        Define function to format map string
        """

        map = mapString.split('/')[-1]
        map = map.split('.')[0] + '.' + 'map'

        if lake == 'superior':
            map = 'superior' + map.split('sup')[1]

        return map

    
    
    
    @staticmethod
    def getAttributeList(fileType):
        
        """
        Define function to get list of attributes based on filetype
        """

        if  fileType == 'WAVES':
            return NoaaDB.attributes['wave']
        elif fileType == 'WINDS':
            return NoaaDB.attributes['wind']
        elif fileType == 'SURFACE TEMPS':
            return NoaaDB.attributes['temperature']
        elif fileType == 'SURFACE CURRENTS':
            return NoaaDB.attributes['current']
        elif fileType == 'ICE PARAMS':
            return NoaaDB.attributes['ice']
        
        
        
        
    def setupGriddedFieldsData(self):
        
        """
        Setup Gridded Fields Data DataFrame
        """
        
        # Get list of unique file types
        uniqueFileTypes = self.newestEntries.fileType.unique()
        
        # Setup Gridded Fields Data DataFrame
        griddedFieldsData = pd.DataFrame(columns=['year', 'day', 'hour', 'datetime',                  
                                                   'grid_number', 'latitude', 'longitude', 
                                                   'map', 'lake'])   
        
        if any('WAVES' in s for s in uniqueFileTypes):                             # wave
            for col in NoaaDB.attributes['wave']:
                griddedFieldsData[col] = pd.Series(index=griddedFieldsData.index) 
                
        if any('WINDS' in s for s in uniqueFileTypes):                             # wind              
            for col in NoaaDB.attributes['wind']:
                griddedFieldsData[col] = pd.Series(index=griddedFieldsData.index)
                
        if any('SURFACE TEMPS' in s for s in uniqueFileTypes):                     # temperature
            for col in NoaaDB.attributes['temperature']:
                griddedFieldsData[col] = pd.Series(index=griddedFieldsData.index)
                
        if any('SURFACE CURRENTS' in s for s in uniqueFileTypes):                  # current
            for col in NoaaDB.attributes['current']:
                griddedFieldsData[col] = pd.Series(index=griddedFieldsData.index)
                
        if any('ICE PARAMS' in s for s in uniqueFileTypes):                        # ice
            for col in NoaaDB.attributes['ice']:
                griddedFieldsData[col] = pd.Series(index=griddedFieldsData.index)
                
        return griddedFieldsData
    
    

    
    @staticmethod
    def getTextFileData(row):

        # Get list of attributes based on filetype
        print(row.ix[0, 'fileType'])
        cols = NoaaDB.getAttributeList(row.ix[0, 'fileType'])

        # Download lake attribute text file and save as DataFrame
        dfFile = pd.read_table(
            row.ix[0, 'fileURL'] + row.ix[0, 'fileName'], 
            header=None,
            names=['data']
        )

        # Set global variable text file header
        global headerString
        headerString = dfFile.ix[0, 'data']

        # Get text file header as new DataFrame column   
        dfFile['header'] = dfFile['data'].map(lambda x: NoaaDB.getHeaderValue(x))

        # Add Lake column
        dfFile['lake'] = row.ix[0, 'lake']

        # Add CAST Type
        dfFile['forecast_type'] = row.ix[0, 'forecastType']

        # Remove header rows
        dfFile['data'] = dfFile['data'].map(lambda x: np.nan if 'dat' in x else x)
        dfFile = dfFile.dropna()

        # Extract date and map information from header and set as DataFrame columns
        dfFile[['year', 'day', 'hour', 'map']] = dfFile['header'].str.split(return_type='frame').ix[:, 0:3]

        # Parse attribute column as set as DataFrame columns
        dfFile[cols] = dfFile['data'].str.split(return_type='frame')

        # Formate map string column
        dfFile['map'] = dfFile['map'].map(lambda x: NoaaDB.getMapFile(x, row.ix[0, 'lake']))

        # Add Datetime Object
        dfFile['datetime'] = dfFile.apply(lambda x: 
                                          datetime.strptime(x['year'] + x['day'] + x['hour'], "%Y%j%H"), 
                                          axis=1)

        # Drop useless columns 
        dfFile = dfFile.drop('data', axis=1).drop('header', axis=1)
        
        # Return text file DataFrame
        return dfFile
                
                
                
                
    def getAllGriddedFieldsData(self):
        
        """
        Get gridded fields data
        """
   
        # Setup Gridded Fields Data DataFrame
        griddedFieldsData = self.setupGriddedFieldsData()
                
        # Loop through lakes
        for lake in self.newestEntries.lake.unique():
            
            # Set attribute counter
            attributeCount = 0
            
            # Loop through atributes
            for fileType in self.newestEntries.fileType.unique():
                
                # Update attribute counter
                attributeCount += 1
                
                # Get NCAST file data
                dfFileNCAST = NoaaDB.getTextFileData(self.newestEntries[
                        (self.newestEntries['lake'] == lake) &
                        (self.newestEntries['fileType'] == fileType) &
                        (self.newestEntries['forecastType'] == 'NCAST')]).reset_index(drop=True)
                
                # Get FCAST file data
                dfFileFCAST = NoaaDB.getTextFileData(self.newestEntries[
                        (self.newestEntries['lake'] == lake) &
                        (self.newestEntries['fileType'] == fileType) &
                        (self.newestEntries['forecastType'] == 'FCAST')]).reset_index(drop=True)
                
                # Concatenate NCAST and FCAST DataFrames
                dfFile = dfFileNCAST
                dfFile = dfFile.append(dfFileFCAST[dfFileFCAST['datetime'] > dfFileNCAST['datetime'].max()])

                # Merge lake specific attribute DataFrames
                if attributeCount == 1:
                    dfLake = dfFile
                else:
                    dfLake = pd.merge(dfLake, dfFile[cols + ['datetime']])

            # Append lake DataFrames
            griddedFieldsData = griddedFieldsData.append(dfLake, ignore_index=True)

        return griddedFieldsData
    

In [5]:
# Create NoaaDB object
noaaDB = NoaaDB(extensionList)

In [6]:
# Get NCAST and FCAST Database
noaaDB.getDB('NCAST') 
noaaDB.getDB('FCAST') 

In [7]:
# Show Dataframe of NoaaDB files 
noaaDB.DB.head()

Unnamed: 0,fileName,fileExtension,fileType,lake,fileDatetime,currentDatetime,forecastType,fileURL
0,o201615812.0.wav,wav,WAVES,ontario,2016-06-06 12:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
1,o201615812.0.wnd,wnd,WINDS,ontario,2016-06-06 12:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
2,o201615818.0.wav,wav,WAVES,ontario,2016-06-06 18:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
3,o201615818.0.wnd,wnd,WINDS,ontario,2016-06-06 18:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
4,o201615900.0.wav,wav,WAVES,ontario,2016-06-07 00:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...


In [8]:
# Show Dataframe of NoaaDB files 
noaaDB.DB.tail()

Unnamed: 0,fileName,fileExtension,fileType,lake,fileDatetime,currentDatetime,forecastType,fileURL
177,s201616512.0.wnd,wnd,WINDS,superior,2016-06-13 12:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
178,s201616600.0.wav,wav,WAVES,superior,2016-06-14 00:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
179,s201616600.0.wnd,wnd,WINDS,superior,2016-06-14 00:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
180,s201616612.0.wav,wav,WAVES,superior,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
181,s201616612.0.wnd,wnd,WINDS,superior,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...


In [9]:
# Get most recently uploaded data
noaaDB.getNewestEntries()

In [10]:
# Show Dataframe of newest NoaaDB files 
noaaDB.newestEntries.head()

Unnamed: 0,fileName,fileExtension,fileType,lake,fileDatetime,currentDatetime,forecastType,fileURL
0,o201616612.0.wav,wav,WAVES,ontario,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
1,o201616612.0.wnd,wnd,WINDS,ontario,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
2,s201616612.0.wav,wav,WAVES,superior,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
3,s201616612.0.wnd,wnd,WINDS,superior,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
4,o201616612.0.wav,wav,WAVES,ontario,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...


In [11]:
# Show Dataframe of newest NoaaDB files 
noaaDB.newestEntries.tail()

Unnamed: 0,fileName,fileExtension,fileType,lake,fileDatetime,currentDatetime,forecastType,fileURL
3,s201616612.0.wnd,wnd,WINDS,superior,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,NCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
4,o201616612.0.wav,wav,WAVES,ontario,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
5,o201616612.0.wnd,wnd,WINDS,ontario,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
6,s201616612.0.wav,wav,WAVES,superior,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
7,s201616612.0.wnd,wnd,WINDS,superior,2016-06-14 12:00:00,2016-06-14 23:59:44.132585,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...


In [12]:
# Get Gridded Fields Data
griddedFieldsData = noaaDB.getAllGriddedFieldsData()

# View DataFrame
griddedFieldsData.head()

WAVES


KeyError: 0

<h1>Step 2</h1>
<h3>Find Most Recently Uploaded Files</h3>

In [18]:
class NoaaData():
    
    """
    NOAA database file class
    """
    
    def __init__(self, noaa_files, url_map):
        
        """
        Initialize object
        """
        
        # Set object attributes
        self.noaa_files = noaa_files              # user input NoaaDB object (Pandas Dataframe of all files in Noaa DB)
        self.df_ncast = {}                        # downloaded NCAST text files as DataFrame
        self.df_fcast = {}                        # downloaded FCAST text files as DataFrame
        self.df = {}                              # conbined NCAST and FCAST DataFrames of most recent 120 hr forecast
        self.url_map = url_map                    # url containing map files
        
        # Set NOAA File attributes
        self.attributes = {
            'wave':        ['grid_number', 'wave_height', 'wave_direction', 'wave_period'],  
            'wind':        ['grid_number', 'wind_speed', 'wind_direction'],
            'temperature': ['grid_number', 'surface_temperature'],
            'current':     ['grid_number', 'current_speed', 'current_direction'],
            'ice':         ['grid_number', 'ice_concentration', 'ice_thickness', 'ice_speed', 'ice_direction']
        }

        # Set map path
        self.map_path = r'C:\Users\Sebastian\Projects\Websites\Surfcast\GetData\GridFiles'
        
        # Get Newest NCAST Files in database and save as DataFrame
        maxtime = noaa_files.df[(noaa_files.df.forecast_type == 'NCAST')]['file_datetime'].max()
        self.newest_files_ncast = noaa_files.df[(noaa_files.df.forecast_type == 'NCAST') &
                                                (noaa_files.df.file_datetime == maxtime)].reset_index(drop=True)
        
        # Get Newest FCAST Files in database and save as DataFrame
        maxtime = noaa_files.df[(noaa_files.df.forecast_type == 'FCAST')]['file_datetime'].max()
        self.newest_files_fcast = noaa_files.df[(noaa_files.df.forecast_type == 'FCAST') & 
                                                (noaa_files.df.file_datetime == maxtime)].reset_index(drop=True)

        
        
        
    def df_setup(self):
        
        """
        Set up empty DataFrames to hold text file data
        """
 
        # Get list of unique NCAST file types and lakes
        filetype_ncast = self.newest_files_ncast.filetype.unique()
        
        # Setup NCAST DataFrame
        self.df_ncast  = pd.DataFrame(columns=['year', 'day', 'hour', 'datetime',                  
                                               'grid_number', 'latitude', 'longitude', 
                                               'map', 'lake'])   
        
        if any('WAVES' in s for s in filetype_ncast):                         # wave
            for col in self.attributes['wave']:
                self.df_ncast[col] = pd.Series(index=self.df_ncast.index) 
                
        if any('WINDS' in s for s in filetype_ncast):                         # wind              
            for col in self.attributes['wind']:
                self.df_ncast[col] = pd.Series(index=self.df_ncast.index)
                
        if any('SURFACE TEMPS' in s for s in filetype_ncast):                 # temperature
            for col in self.attributes['temperature']:
                self.df_ncast[col] = pd.Series(index=self.df_ncast.index)
                
        if any('SURFACE CURRENTS' in s for s in filetype_ncast):              # current
            for col in self.attributes['current']:
                self.df_ncast[col] = pd.Series(index=self.df_ncast.index)
                
        if any('ICE PARAMS' in s for s in filetype_ncast):                    # ice
            for col in self.attributes['ice']:
                self.df_ncast[col] = pd.Series(index=self.df_ncast.index)

        # Get list of unique FCAST file types and lakes
        filetype_fcast = self.newest_files_fcast.filetype.unique()

        # Setup FCAST DataFrame
        self.df_fcast  = pd.DataFrame(columns=['year', 'day', 'hour', 'datetime',              
                                               'grid_number', 'latitude', 'longitude', 
                                               'map', 'lake'])   
        
        if any('WAVES' in s for s in filetype_fcast):                         # wave
            for col in self.attributes['wave']:
                self.df_fcast[col] = pd.Series(index=self.df_fcast.index) 
                
        if any('WINDS' in s for s in filetype_fcast):                         # wind              
            for col in self.attributes['wind']:
                self.df_fcast[col] = pd.Series(index=self.df_fcast.index)
                
        if any('SURFACE TEMPS' in s for s in filetype_fcast):                 # temperature
            for col in self.attributes['temperature']:
                self.df_fcast[col] = pd.Series(index=self.df_fcast.index)
                
        if any('SURFACE CURRENTS' in s for s in filetype_fcast):              # current
            for col in self.attributes['current']:
                self.df_fcast[col] = pd.Series(index=self.df_fcast.index)
                
        if any('ICE PARAMS' in s for s in filetype_fcast):                    # ice
            for col in self.attributes['ice']:
                self.df_fcast[col] = pd.Series(index=self.df_fcast.index)

                
   

    @staticmethod
    def getHeaderValue(val):
        
        """
        Define function to get and update text file header column
        """

        global headerString

        if 'dat' in val and val != headerString:
            headerString = val
            return headerString
        else:
            return headerString
        
        
        
        
    @staticmethod
    def getMapFile(val, lake):
        
        """
        Define function to format map string
        """

        map = val.split('/')[-1]
        map = map.split('.')[0] + '.' + 'map'

        if lake == 'superior':
            map = 'superior' + map.split('sup')[1]

        return map

    
    
    
    @staticmethod
    def getAttributeList(fileType, attributes):
        
        """
        Define function to get list of attributes based on filetype
        """

        if  fileType == 'WAVES':
            return attributes['wave']
        elif fileType == 'WINDS':
            return attributes['wind']
        elif fileType == 'SURFACE TEMPS':
            return attributes['temperature']
        elif fileType == 'SURFACE CURRENTS':
            return attributes['current']
        elif fileType == 'ICE PARAMS':
            return attributes['ice']
        
        
                
                        
    def df_fill(self):
        
        """
        Fill NCAST and FCAST DataFrames with text file data
        """
        
        # -------------------------------------------------------------------------------------------------------------------- #
        # NCAST 
        # -------------------------------------------------------------------------------------------------------------------- #
        
        # Loop through lakes
        for lake in self.newest_files_ncast.lake.unique():

            # Get DataFrame of attribute files to downhole
            df_lake = self.newest_files_ncast[(self.newest_files_ncast.lake == lake)]
            
            # Set attribute counter
            attributeCount = 0
            
            # Loop through atributes
            for df_index in df_lake.index:
                
                # Update attribute counter
                attributeCount += 1
                
                # Get list of attributes based on filetype
                cols = self.getAttributeList(self.newest_files_ncast.filetype[df_index], self.attributes)

                # Download lake attribute text file and save as DataFrame
                dfFile = pd.read_table(
                    self.newest_files_ncast.file_url[df_index] + self.newest_files_ncast.filename[df_index], 
                    header=None,
                    names=['data']
                )

                # Set global variable text file header
                global headerString
                headerString = dfFile.ix[0, 'data']
       
                # Get text file header as new DataFrame column   
                dfFile['header'] = dfFile['data'].map(lambda x: self.getHeaderValue(x))

                # Add Lake column
                dfFile['lake'] = lake
                
                # Add CAST Type
                dfFile['forecast_type'] = 'NCAST'

                # Remove header rows
                dfFile['data'] = dfFile['data'].map(lambda x: np.nan if 'dat' in x else x)
                dfFile = dfFile.dropna()

                # Extract date and map information from header and set as DataFrame columns
                dfFile[['year', 'day', 'hour', 'map']] = dfFile['header'].str.split(return_type='frame').ix[:, 0:3]

                # Parse attribute column as set as DataFrame columns
                dfFile[cols] = dfFile['data'].str.split(return_type='frame')

                # Formate map string column
                dfFile['map'] = dfFile['map'].map(lambda x: self.getMapFile(x, lake))
                
                # Add Datetime Object
                dfFile['datetime'] = dfFile.apply(lambda x: 
                                                  datetime.strptime(x['year'] + x['day'] + x['hour'], "%Y%j%H"), 
                                                  axis=1)

                # Drop useless columns 
                dfFile = dfFile.drop('data', axis=1).drop('header', axis=1)
                
                # Merge lake specific attribute DataFrames
                if attributeCount == 1:
                    dfLake = dfFile
                else:
                    dfLake = pd.merge(dfLake, dfFile[cols + ['datetime']])
                
            # Get map grid locations
            mapFile = pd.read_table(self.url_map + dfFile.ix[dfFile.index[0], 'map'], header=None, 
                                    sep=r"\s*", names=['sequence number', 'fortran column','fortran row',
                                                       'latitude', 'longitude', 'depth'])
                
            # Add latitude and longitude
            multiple = int(dfLake.shape[0] / mapFile.shape[0])

            dfLake['latitude'] =  pd.concat([mapFile] * multiple, ignore_index=True)['latitude'] 
            dfLake['longitude'] =  pd.concat([mapFile] * multiple, ignore_index=True)['longitude']

            # Append lake DataFrames
            self.df_ncast = self.df_ncast.append(dfLake, ignore_index=True)
            
        # -------------------------------------------------------------------------------------------------------------------- #
        # FCAST 
        # -------------------------------------------------------------------------------------------------------------------- #
        
        # Loop through lakes
        for lake in self.newest_files_fcast.lake.unique():
            
            # Get DataFrame of attribute files to downhole
            df_lake = self.newest_files_fcast[(self.newest_files_fcast.lake == lake)]

            # Set attribute counter
            attributeCount = 0

            # Loop through atributes
            for df_index in df_lake.index:

                # Update attribute counter
                attributeCount += 1

                # Get list of attributes based on filetype
                cols = self.getAttributeList(self.newest_files_fcast.filetype[df_index], self.attributes)

                # Download lake attribute text file and save as DataFrame
                dfFile = pd.read_table(
                    self.newest_files_fcast.file_url[df_index] + self.newest_files_fcast.filename[df_index], 
                    header=None,
                    names=['data']
                )

                # Set global variable text file header
                global headerString
                headerString = dfFile.ix[0, 'data']

                # Get text file header as new DataFrame column   
                dfFile['header'] = dfFile['data'].map(lambda x: self.getHeaderValue(x))

                # Add Lake column
                dfFile['lake'] = lake
                
                # Add CAST Type
                dfFile['forecast_type'] = 'FCAST'

                # Remove header rows
                dfFile['data'] = dfFile['data'].map(lambda x: np.nan if 'dat' in x else x)
                dfFile = dfFile.dropna()

                # Extract date and map information from header and set as DataFrame columns
                dfFile[['year', 'day', 'hour', 'map']] = dfFile['header'].str.split(return_type='frame').ix[:, 0:3]

                # Parse attribute column as set as DataFrame columns
                dfFile[cols] = dfFile['data'].str.split(return_type='frame')

                # Formate map string column
                dfFile['map'] = dfFile['map'].map(lambda x: self.getMapFile(x, lake))

                # Add Datetime Object
                dfFile['datetime'] = dfFile.apply(lambda x: 
                                                  datetime.strptime(x['year'] + x['day'] + x['hour'], "%Y%j%H"), 
                                                  axis=1)

                # Drop useless columns 
                dfFile = dfFile.drop('data', axis=1).drop('header', axis=1)

                # Merge lake specific attribute DataFrames
                if attributeCount == 1:
                    dfLake = dfFile
                else:
                    dfLake = pd.merge(dfLake, dfFile[cols + ['datetime']])
                
            # Get map grid locations
            mapFile = pd.read_table(self.url_map + dfFile.ix[dfFile.index[0], 'map'], header=None, 
                                    sep=r"\s*", names=['sequence number', 'fortran column','fortran row',
                                                       'latitude', 'longitude', 'depth'])
            
            # Add latitude and longitude
            multiple = int(dfLake.shape[0] / mapFile.shape[0])

            dfLake['latitude'] =  pd.concat([mapFile] * multiple, ignore_index=True)['latitude'] 
            dfLake['longitude'] =  pd.concat([mapFile] * multiple, ignore_index=True)['longitude']
            
            # Append lake DataFrames
            self.df_fcast = self.df_fcast.append(dfLake, ignore_index=True)
            
        # -------------------------------------------------------------------------------------------------------------------- #
        # Merge NCAST and FCAST 
        # -------------------------------------------------------------------------------------------------------------------- #
        
        # Add NCAST
        self.df = self.df_ncast 
        
        # Add FCAST where time > maximum NCAST time
        self.df = self.df.append(self.df_fcast[self.df_fcast['datetime'] > self.df_ncast['datetime'].max()])

In [19]:
# Get all newest files
noaaData = NoaaData(noaa_files, url_map)

In [20]:
# Show newest FCAST files
noaaData.newest_files_fcast.head()

Unnamed: 0,filename,file_extension,filetype,lake,file_datetime,current_datetime,forecast_type,file_url
0,e201616000.0.cur,cur,SURFACE CURRENTS,erie,2016-06-08,2016-06-08 01:24:33.446723,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
1,e201616000.0.swt,swt,SURFACE TEMPS,erie,2016-06-08,2016-06-08 01:24:33.446723,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
2,e201616000.0.wav,wav,WAVES,erie,2016-06-08,2016-06-08 01:24:33.446723,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
3,e201616000.0.wnd,wnd,WINDS,erie,2016-06-08,2016-06-08 01:24:33.446723,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...
4,h201616000.0.cur,cur,SURFACE CURRENTS,huron,2016-06-08,2016-06-08 01:24:33.446723,FCAST,http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridde...


<h1>Step 3</h1>
<h3>Save Data From Newest Files As DataFrame</h3>

In [23]:
# Get newest files
noaaData.df_setup()

# View DataFrame
noaaData.df_ncast.head()

Unnamed: 0,year,day,hour,datetime,grid_number,latitude,longitude,map,lake,wave_height,wave_direction,wave_period,wind_speed,wind_direction,surface_temperature,current_speed,current_direction


In [24]:
# Get data
noaaData.df_fill()

KeyboardInterrupt: 

In [None]:
# View NCAST DataFrame
noaaData.df_ncast.tail()

In [None]:
# View FCAST DataFrame
noaaData.df_fcast.tail()

In [None]:
# View merged DataFrame
noaaData.df.tail()

In [None]:
# Save DataFrame to CSV
# noaaData.df.to_csv('C:\\Users\\sgoodfellow\\Documents\\Sebastian\\Projects\\Websites\\Surfcast\\GetData\\Test.csv')

<h1>Step 4</h1>
<h3>Load Surf Spots</h3>
<h5>Thanks Grif!</h5>

In [None]:
# Set surf spots file path and file name
path = r'C:\Users\sgoodfellow\Documents\Sebastian\Projects\Websites\Surfcast\GetData\SurfSpots'
file = 'SurfSpots.csv'

In [None]:
# Load surf spot data as DataFrame
surf_spots = pd.read_csv(os.path.join(path, file))

In [None]:
# Show surf spot data
surf_spots.sort('lake')

<h1>Step 5</h1>
<h3>Plot Raw Data</h3>

In [None]:
# Plot grid points
plt.figure(figsize=(15, 10))

for lake in noaaData.df_ncast.lake.unique():
    
    df = noaaData.df_ncast[(noaaData.df_ncast.lake == lake) & 
                               (noaaData.df_ncast.datetime == noaaData.df_ncast.datetime.unique()[0])] 
    
    plt.plot(-df.longitude, df.latitude, marker='.', linestyle='none', ms=2)

plt.plot(surf_spots.longitude, surf_spots.latitude, color='k', marker='o', linestyle='none', ms=5)    
    
plt.tick_params(labelsize=14)

plt.xlabel('Longitude', fontsize=20)
plt.ylabel('Latitude', fontsize=20)

plt.grid('on')

In [None]:
# Plot grid points as 3D scatter
attribute = 'wave_height'

plt.figure(figsize=(10, 6))
    
df = noaaData.df_ncast[(noaaData.df_ncast.datetime == noaaData.df_ncast.datetime.unique()[0])] 

sc = plt.scatter(-df.longitude, df.latitude, c = df[attribute], edgecolors='none')

cb = plt.colorbar(sc)
cb.set_label(attribute.replace('_', ' ').title(), fontsize=20)
plt.tick_params(labelsize=14)

plt.xlabel('Longitude', fontsize=20)
plt.ylabel('Latitude', fontsize=20)

plt.grid('on')

<h1>Step 6</h1>
<h3>Create SQL Database</h3>

In [None]:
# Initializes database with filename 311_8M.db in current directory
#surfcast_sql_db = create_engine(r'sqlite:///C:\Users\Sebastian\Projects\Websites\Surfcast\GetData\Surfcast.db')

<h1>Step 7</h1>
<h3>Update SQL Database</h3>

In [None]:
noaa_files.df.to_sql('data', surfcast_sql_db, if_exists='append')

In [None]:
df = pd.read_sql_query('SELECT lake FROM data', surfcast_sql_db)
df.head()

In [None]:
data  = pd.DataFrame(columns=['year', 'day', 'hour', 'datetime', 'grid_number', 'latitude', 'longitude', 'map', 'lake',
                              'grid_number', 'wave_height', 'wave_direction', 'wave_period',
                              'wind_speed', 'wind_direction',
                              'surface_temperature',
                              'currect_speed', 'current_direction',
                              'ice_concentration', 'ice_thickness', 'ice_speed', 'ice_direction']) 

data.head()

In [None]:
# NOAA attributes
wave =        ['grid_number', 'wave_height', 'wave_direction', 'wave_period']
wind =        ['grid_number', 'wind_speed', 'wind_direction']
temperature = ['grid_number', 'surface_temperature']
current =     ['grid_number', 'currect_speed', 'current_direction']
ice =         ['grid_number', 'ice_concentration', 'ice_thickness', 'ice_speed', 'ice_direction']

lake = 'superisor'

# Load text load
df = pd.read_table('http://www.glerl.noaa.gov/ftp/EMF/glcfs/gridded_fields/FCAST/h201615012.0.wav', header=None)

# Name text column
df.columns = ['data']

# Set global variable text file header
headerString = df.ix[0, 'data']

# Define function to get text file header column
def getHeaderValue(val):
    
    global headerString
    
    if 'dat' in val and val != headerString:
        headerString = val
        return headerString
    else:
        return headerString

# Get text file header column    
df['header'] = df['data'].map(lambda x: getHeaderValue(x))

# Add Lake column
df['lake'] = lake

# Remove header row
df['data'] = df['data'].map(lambda x: np.nan if 'dat' in x else x)
df = df.dropna()

df['header'].str.split(return_type='frame')

# Get Year column
df[['year', 'day', 'hour', 'map', 'type', 'point']] = df['header'].str.split(return_type='frame')

# Get wave data
df[['grid_number', 'wave_height', 'wave_direction', 'wave_period']] = df['data'].str.split(return_type='frame')

# Define function to format file
def getMapFile(val, lake):
    
    map = val.split('/')[-1]
    map = map.split('.')[0] + '.' + 'map'
    
    if lake == 'superior':
        map = 'superior' + map.split('sup')[1]
        
    return map

# format file
df['map'] = df['map'].map(lambda x: getMapFile(x, lake))

# Add Datetime Object
df['datetime'] = df.apply(lambda x: datetime.strptime(x['year'] + x['day'] + x['hour'], "%Y%j%H"), axis=1)

# Drop 
df = df.drop('data', axis=1).drop('header', axis=1)

df.head()

In [None]:
yup = df['header'].str.split(return_type='frame')
yup.head()

In [None]:
yup[[4, 5]].apply(lambda x : x[0] + ' ' + x[1], axis=1)

In [None]:
yup[[4, 5]]

In [None]:
pd.concat([data, df], join='outer', axis = 1)

data.head()

In [None]:
wave =        ['grid_number', 'wave_height', 'wave_direction', 'wave_period']

In [None]:
wave + ['hi']

In [None]:
wave

In [None]:
dun = {
    'wave': ['wave_height', 'wave_direction', 'wave_period'],  
    'wind': ['wind_speed', 'wind_direction'],
    'temperature': ['surface_temperature'],
    'current': ['currect_speed', 'current_direction'],
    'ice': ['ice_concentration', 'ice_thickness', 'ice_speed', 'ice_direction']
}

In [None]:
dun['wave']

In [None]:
mapTest = pd.read_table(url_map + 'erie2km.map', header=None, sep=r"\s*", names=['sequence number',
                                                                                 'fortran column',
                                                                                 'fortran row',
                                                                                 'latitude',
                                                                                 'longitude',
                                                                                 'depth'])
mapTest.head()