In [50]:
import pandas as pd
import numpy as np
import os
from itertools import product
import datetime

In [33]:
def drop_flags(obj):
    """Return True when a measurement flag is the blank marker ' '.

    A blank flag indicates no problem with the data, so despite the name
    this predicate is True for rows that should be KEPT.

    Parameters
    ----------
    obj : any
        Raw flag value; it is converted with ``str`` before comparison.

    Returns
    -------
    bool
        True only if the string form of ``obj`` is exactly one space.
    """
    flag_text = str(obj)
    return flag_text == ' '

## Process one at a time for memory concerns
raw_dir = '../data/precip_raw/'
agg_dir = '../data/precip_agg/'
station_day_keys = ['STATION', 'LATITUDE', 'LONGITUDE', 'year', 'month', 'day']

files = os.listdir('../data/precip_raw')
for file in files:
    df = pd.read_csv(raw_dir + file)

    # Parse the DATE field and split it into its calendar components.
    date_strings = [str(stamp) for stamp in df['DATE']]
    df['pdate'] = pd.to_datetime(date_strings, format = '%Y%m%d %H:%M')
    for part in ('year', 'month', 'day'):
        df[part] = getattr(df['pdate'].dt, part)

    # Keep only rows whose measurement flag is blank (drop_flags is True
    # for rows to keep).
    unflagged = [drop_flags(flag) for flag in df['Measurement Flag']]
    df = df[unflagged]

    # Restrict to the station identity, precipitation value and date parts.
    df = df[['STATION', 'LATITUDE', 'LONGITUDE', 'HPCP', 'year', 'month', 'day']]

    # Total precipitation per station-day, written under the same file name
    # as the raw input.
    agg = df.groupby(station_day_keys).sum()
    agg.to_csv(agg_dir + file)


#df.head()
#df.shape

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [36]:
# Load aggregates.
files = os.listdir('../data/precip_agg')

# Read every per-file aggregate first and concatenate once. Growing a frame
# with pd.concat inside the loop re-copies all earlier rows on each pass.
frames = [pd.read_csv("../data/precip_agg/" + file) for file in files]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no files. Row indices are kept per-file (no ignore_index),
# as before.
df = pd.concat(frames) if frames else pd.DataFrame()

df.shape


(195843, 7)

In [41]:
## get distinct stations
# One row per STATION id; the first LATITUDE/LONGITUDE seen for it is kept.
station_columns = ['STATION', 'LATITUDE', 'LONGITUDE']
stations = df.loc[:, station_columns].drop_duplicates(subset='STATION')

## export for FIPS codes to be attached
stations.to_csv('../data/station_coords.csv')

(263, 3)


In [47]:
# Full station x day grid: every station paired with every calendar day
# from 1992-01-01 through 2014-01-01 (both endpoints included).
dates = pd.date_range(start='1992-01-01', end='2014-01-01')
prod = product(stations['STATION'], dates)
grid_rows = list(map(list, prod))
timeframe = pd.DataFrame(grid_rows, columns=['STATION', 'date'])

In [53]:
#timeframe.head()
#df.head()


# Build the calendar date of each station-day row from its year/month/day
# columns. pd.to_datetime assembles datetimes directly from a frame holding
# these columns, which avoids a per-row Python call through np.vectorize.
# .values assigns positionally, so the non-unique row index of df (it comes
# from a concat without ignore_index) is never used for alignment.
df['date'] = pd.to_datetime(df[['year', 'month', 'day']]).values

In [62]:
# Outer-join the observed station-days onto the full station x day grid so
# every (STATION, date) pair from either side is present, ordered by station
# and date.
compleat = (
    pd.merge(df, timeframe, how='outer', on=['STATION', 'date'])
    .sort_values(['STATION', 'date'])
)

# Station-days without an observation have no HPCP after the join; they are
# counted as zero precipitation.
compleat['HPCP'] = compleat['HPCP'].fillna(0)
compleat = compleat[['STATION', 'date', 'HPCP']]

In [66]:
#compleat.head()

# Station -> FIPS lookup; only the station id and its STCT_FIPS code are kept.
fips_lookup_columns = ['STATION', 'STCT_FIPS']
geo_stations = pd.read_csv('../data/stations_block.csv').loc[:, fips_lookup_columns]
geo_stations.head()

Unnamed: 0,STATION,STCT_FIPS
0,COOP:040014,6037
1,COOP:040161,6049
2,COOP:040212,6055
3,COOP:040322,6053
4,COOP:040368,6055


In [76]:
## Merge with FIPS codes and aggregate to FIPS-day

byfips = pd.merge(compleat, geo_stations, on = 'STATION', how = 'left')

# consider both sum and mean of all stations in FIPS
# Select HPCP before aggregating so the string STATION column is never summed
# or averaged. Newer pandas no longer drops non-numeric columns silently in
# groupby reductions, so aggregating the whole frame is not portable.
# Grouping once also avoids building the same groups twice.
hpcp_by_fips_day = byfips.groupby(['STCT_FIPS', 'date'])['HPCP']
agg_series = hpcp_by_fips_day.sum().to_frame('station_sum')
agg_series['station_mean'] = hpcp_by_fips_day.mean()

agg_series.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,station_sum,station_mean
STCT_FIPS,date,Unnamed: 2_level_1,Unnamed: 3_level_1
6001,1992-01-01,0.0,0.0
6001,1992-01-02,0.0,0.0
6001,1992-01-03,0.0,0.0
6001,1992-01-04,0.0,0.0
6001,1992-01-05,0.0,0.0


In [77]:
## Create rolling time series

# Roll within each FIPS code. agg_series stacks all FIPS codes in one frame
# indexed by (STCT_FIPS, date), so a plain .rolling(30) would let the first
# 29 rows of each FIPS code pick up the last rows of the previous one.
# The window counts rows, so it equals 30 days only if each FIPS code has one
# row per consecutive day, in date order.
# NOTE(review): confirm the index is sorted by date within each FIPS code.
def _past30_total(series):
    """Sum of the current and previous 29 rows; NaN until 30 rows exist."""
    return series.rolling(30).sum()

per_fips = agg_series.groupby(level='STCT_FIPS')
agg_series['past30_ss_sum'] = per_fips['station_sum'].transform(_past30_total)
agg_series['past30_sm_sum'] = per_fips['station_mean'].transform(_past30_total)

# Move 'date' out of the index into a column; STCT_FIPS stays as the index.
agg_series = agg_series.reset_index('date')

agg_series['year'] = agg_series['date'].dt.year
agg_series['month'] = agg_series['date'].dt.month
agg_series['day'] = agg_series['date'].dt.day

agg_series.head()


Unnamed: 0_level_0,date,station_sum,station_mean,past30_ss_sum,past30_sm_sum,year,month,day
STCT_FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6001,1992-01-01,0.0,0.0,,,1992,1,1
6001,1992-01-02,0.0,0.0,,,1992,1,2
6001,1992-01-03,0.0,0.0,,,1992,1,3
6001,1992-01-04,0.0,0.0,,,1992,1,4
6001,1992-01-05,0.0,0.0,,,1992,1,5


In [78]:
# Write the FIPS-day series (with its index) to disk.
series_csv_path = '../data/precip_agg_series.csv'
agg_series.to_csv(series_csv_path)