In [1]:
"""
Ipython notebook to test/document function that loads
the PRMS statvar output file to a pandas dataframe object
Author: John Volk, March 2015
Python 2.7.7
Pandas 0.16.2
"""
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import os
#%matplotlib inline  ## was having trouble with inline plots with pandas- not sure why
#%pylab inline

### Function to load data from a PRMS statistical output variable file into a pandas dataframe
---
* The statistical output variable file from PRMS (also known as the statvar file) holds time series output
* Examples of data within this file include simulated temperature at one location in the model or basin area weighted evaporation
 - and many others, listed in table in PRMS manual
------
#### Loading time series data into Pandas is useful because Pandas was built to handle and manipulate tabular and time series data
* Pandas objects can conveniently be indexed by dates, and tabular operations such as pivot tables, sort, and groupby are readily available
* From Pandas the data can also be easily converted to other formats such as csv or an excel file

In [2]:
def load_PRMSstatvar(statvarfile):
    """
    Load a PRMS statvar (statistical variable output) file into a pandas dataframe.

    The file is read as: a first line holding the integer number of output
    variables, then one line per variable name, then whitespace delimited data
    rows whose first seven fields are a row index, year, month, day, hh, mm, sec.

    INPUT: statvarfile = statvar file path (string)
    OUTPUT: df = Pandas dataframe of PRMS variables date indexed from statvarfile,
            values of -999 (the PRMS missing data representation) are read as NaN
    RAISES: ValueError if the header is malformed/truncated or the number of
            data columns does not match the header
    """
    ## leading fields of every data row, they come before the variable values
    time_columns = ['index','year','month','day','hh','mm','sec']
    #### read the header: variable count followed by that many variable names
    with open(statvarfile, 'r') as inf:
        first_line = inf.readline()
        try:
            nstatvars = int(first_line) ## first line is always int number of variables
        except ValueError:
            raise ValueError('first line of statvar file must be the number of '
                             'variables, got {0!r}'.format(first_line))
        variable_names = []
        for _ in range(nstatvars):
            line = inf.readline()
            if not line: ## readline returns '' only at end of file
                raise ValueError('statvar file ended before all {0} variable '
                                 'names were read'.format(nstatvars))
            variable_names.append(line.rstrip())
    column_list = time_columns + variable_names
    ### Arguments for read_csv function
    ## sep=r'\s+' splits on runs of whitespace, so a trailing blank at the end of a
    ## row no longer creates an extra column; header=None because the data rows have
    ## no header line; skiprows is plus 1 for the first (variable count) line
    df = pd.read_csv(statvarfile, sep=r'\s+', header=None,
                     skiprows=nstatvars + 1, na_values=[-999])
    if df.shape[1] != len(column_list):
        raise ValueError('statvar data rows have {0} columns but the header '
                         'describes {1}'.format(df.shape[1], len(column_list)))
    df.columns = column_list  ## use statvar list as the column names of the df
    ## build yyyymmdd integers and parse them, hh/mm/sec are not used for the index
    yyyymmdd = df['year']*10000 + df['month']*100 + df['day']
    df.index = pd.DatetimeIndex(pd.to_datetime(yyyymmdd, format='%Y%m%d'))
    df = df.drop(time_columns, axis=1) ## keep only the statistical variables
    df.columns.name = 'statistical variables'
    df.index.name = 'date'
    return df

### Call the function using an example statvar file, which returns a Pandas dataframe

In [3]:
## the statvar file name is user defined in the PRMS control file, so it is given explicitly here
inf = os.path.join(os.getcwd(), os.pardir, 'models/acf/statvar')
df = load_PRMSstatvar(inf)
## preview the first five rows of the dataframe
df.head()

statistical variables,basin_cfs 1,basin_potet 1,orad 1,runoff 57
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1984-07-01,864.475023,0.087736,201.736343,19800
1984-07-02,3665.486368,0.068127,201.237671,21900
1984-07-03,4885.737093,0.104705,322.316528,23100
1984-07-04,7464.010971,0.158801,302.056671,24600
1984-07-05,13864.822171,0.179917,281.067871,26500


### Date indexing and slicing is now easily done in Pandas
* Example below will show date slicing using strings that pandas interprets, e.g. "10/01/1984"
* Then plot the time series of the statvar variables: simulated streamflow (basin_cfs) and measured streamflow (runoff) 

In [4]:
## get measured and simulated streamflow on a specific date
## use .loc with an unambiguous ISO date: looking up rows by a date string with plain []
## was deprecated in pandas 1.2 and removed in 2.0; slicing from the date to itself
## (instead of passing a single label) keeps the result a one row dataframe
df.loc["1984-10-01":"1984-10-01", ['basin_cfs 1','runoff 57']]

statistical variables,basin_cfs 1,runoff 57
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1984-10-01,7637.170504,11100


In [10]:
## pick the two streamflow variables, then keep only the rows of one water year
plot_data = df[['basin_cfs 1','runoff 57']]
plot_data = plot_data["10/01/1984":"09/30/1985"]
## pandas plots the index on the x axis and one labelled line per column,
## the returned axes object is used to set the y label
plot_data.plot().set_ylabel('streamflow cfs')  ## could also use matplotlib or other plot package
plt.show()

In [16]:
## use pandas function to get summary statistics for the 1985 water year
## plot_data is the two column streamflow slice ("10/01/1984" to "09/30/1985") made in the plotting cell
plot_data.describe()

Unnamed: 0,basin_cfs 1,runoff 57
count,365.0,365.0
mean,13538.17513,17724.657534
std,8235.247999,8772.001833
min,5400.609965,9480.0
25%,8013.978742,11800.0
50%,11762.005019,14100.0
75%,15590.54178,21800.0
max,61061.663343,51500.0


### Last example: a timing function for testing the average speed of the function
* Useful when developing to compare relative speed of different versions of code doing the same thing
* Feel free to use this function or use the magic command %timeit 

In [9]:
import time
def time_it(f, *args):
    """
    Time a single call of f(*args).
    INPUT: f = callable to time, *args = positional arguments passed on to f
    OUTPUT: elapsed time in seconds (float) as measured by timeit.default_timer,
            the return value of f is discarded
    """
    ## time.clock was removed in Python 3.8; timeit.default_timer exists on Python 2 and 3
    from timeit import default_timer
    start = default_timer()
    f(*args)
    return default_timer() - start
## run the loader n times and report the mean run time
n = 50
t = 0
for i in range(n):
    t += time_it(load_PRMSstatvar, inf)
## print with parentheses and a single argument gives the same output on Python 2 and Python 3
print('The average time it took to run the function over {n} runs was {avg} seconds'.format(n=n,avg=t/n))

The average time it took to run the function over 50 runs was 0.00433996 seconds
