In [1]:
import pandas as pd
import numpy as np
import os
from itertools import product
import datetime

## Import the Precipitation Data
- Import all raw CSVs from `../data/precip_raw/` (download link: TODO — the original link target is empty)
- Check for missing data
- Drop flagged rows and keep useful columns
- Aggregate total precipitation by station and day, then export

In [2]:
## A measurement flag consisting of a single space
## means there is no problem with the data point
def drop_flags(obj):
    """Tell whether a measurement flag marks a clean (unflagged) row.

    Despite the name, the result is True for rows that should be KEPT:
    the flag is converted to text and compared against a single space.

    Parameters
    ----------
    obj : any
        Raw flag value; it is passed through ``str`` before comparing.

    Returns
    -------
    bool
        True only when ``str(obj)`` is exactly ``' '``.
    """
    flag_text = str(obj)
    is_blank = (flag_text == ' ')
    return is_blank

## Process one file at a time for memory concerns
raw_dir = '../data/precip_raw'
agg_dir = '../data/precip_agg'

# Create the output folder up front so the first export cannot fail on a
# missing directory.
os.makedirs(agg_dir, exist_ok=True)

# os.listdir returns every directory entry (e.g. .DS_Store or
# .ipynb_checkpoints), so keep only CSVs; sorting makes the run order
# reproducible across machines.
files = sorted(f for f in os.listdir(raw_dir) if f.lower().endswith('.csv'))
for file in files:
    df = pd.read_csv(os.path.join(raw_dir, file))

    # convert/extract date information; DATE is parsed from its string
    # form using the 'YYYYMMDD HH:MM' layout
    df['pdate'] = pd.to_datetime(df['DATE'].astype(str), format='%Y%m%d %H:%M')
    df['year'] = df['pdate'].dt.year
    df['month'] = df['pdate'].dt.month
    df['day'] = df['pdate'].dt.day

    # drop flagged rows: same rule as drop_flags (string form of the flag is
    # exactly one space), applied to the whole column at once
    unflagged = df['Measurement Flag'].astype(str) == ' '
    df = df[unflagged]

    # keep useful columns
    df = df[['STATION', 'LATITUDE', 'LONGITUDE', 'HPCP', 'year', 'month', 'day']]

    # aggregate to station-day total precipitation; after the column
    # selection HPCP is the only non-key column, so it is the only one summed
    # NOTE(review): rows whose LATITUDE/LONGITUDE are missing are dropped by
    # groupby's default NaN-key handling - confirm this is intended.
    agg = df.groupby(['STATION', 'LATITUDE', 'LONGITUDE', 'year', 'month', 'day']).sum()

    # output under the same file name as the raw input
    agg.to_csv(os.path.join(agg_dir, file))

  interactivity=interactivity, compiler=compiler, result=result)


## Clean the Aggregate Data
- Concatenate aggregate files to single dataframe
- Export unique Station locations to CSV

In [3]:
# Load aggregates.
agg_dir = '../data/precip_agg'

# Only read CSVs (os.listdir also returns stray entries such as
# .ipynb_checkpoints) and sort so the concatenation order is reproducible.
files = sorted(f for f in os.listdir(agg_dir) if f.lower().endswith('.csv'))

# Read everything first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration.
frames = [pd.read_csv(os.path.join(agg_dir, file)) for file in files]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder holds no CSVs. The index is left as the per-file row numbers.
df = pd.concat(frames) if frames else pd.DataFrame()
df.shape


(195843, 7)

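## Clean the Aggregate Data
- Extract the distinct stations (first row per STATION) with their coordinates
- Export the station coordinates to CSV
- Set the datetime column and fill station-days without measurements with zero precipitation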

In [4]:
## One row per station: the first occurrence of each STATION id,
## together with the coordinates recorded on that row
station_cols = ['STATION', 'LATITUDE', 'LONGITUDE']
stations = df.loc[:, station_cols].drop_duplicates(subset='STATION')

## Write the station list out so FIPS codes can be attached to it
stations.to_csv('../data/station_coords.csv')

In [5]:
# Daily calendar covering the study period (both endpoints are included).
dates = pd.date_range('1992-01-01', '2014-01-01')

# Every (station, day) combination, in the same station-major order that
# itertools.product would produce. Building it through a MultiIndex keeps the
# work inside pandas instead of materialising millions of Python lists.
timeframe = (
    pd.MultiIndex
      .from_product([stations['STATION'], dates], names=['STATION', 'date'])
      .to_frame(index=False)
)

In [6]:
# Assemble the date from the year/month/day columns in one vectorised call.
# (np.vectorize(datetime.datetime) loops in Python and raises on an empty
# frame because it cannot infer an output type from zero inputs.)
# .values assigns positionally, so the repeated index labels left over from
# concatenating the aggregate files cannot interfere with alignment.
df['date'] = pd.to_datetime(df[['year', 'month', 'day']]).values

In [7]:
# Outer-join the measurements onto the station x date grid so that every
# station-day appears, order the rows by station then date, keep the three
# columns needed downstream, and treat station-days with no measurement as
# zero precipitation.
compleat = (
    df.merge(timeframe, how='outer', on=['STATION', 'date'])
      .sort_values(by=['STATION', 'date'])
      .loc[:, ['STATION', 'date', 'HPCP']]
      .fillna({'HPCP': 0})
)

## Assign FIPS Codes
- Import CSV of station FIPS codes
- Merge to add FIPS codes
- Aggregate sum and mean by FIPS and day

In [8]:
# Station -> FIPS lookup; only the station id and its FIPS code are kept.
fips_lookup_path = '../data/stations_block.csv'
geo_stations = pd.read_csv(fips_lookup_path).loc[:, ['STATION', 'STCT_FIPS']]
geo_stations.head()

Unnamed: 0,STATION,STCT_FIPS
0,COOP:040014,6037
1,COOP:040161,6049
2,COOP:040212,6055
3,COOP:040322,6053
4,COOP:040368,6055


In [24]:
## Merge with FIPS codes and aggregate to FIPS-day

# Left join keeps every station-day; stations without a FIPS match get a
# missing STCT_FIPS and drop out of the groupby below.
byfips = pd.merge(compleat, geo_stations, on = 'STATION', how = 'left')

# consider both sum and mean of all stations in FIPS
# HPCP is selected explicitly: aggregating the whole frame would also hit the
# string STATION column, which recent pandas concatenates in sum() and
# rejects in mean(). A single agg call also avoids grouping twice.
agg_series = (
    byfips.groupby(['STCT_FIPS', 'date'])['HPCP']
          .agg(['sum', 'mean'])
          .rename(columns={'sum': 'station_sum', 'mean': 'station_mean'})
)

agg_series.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,station_sum,station_mean
STCT_FIPS,date,Unnamed: 2_level_1,Unnamed: 3_level_1
6115,2013-12-23,0.0,0.0
6115,2013-12-24,0.0,0.0
6115,2013-12-25,0.0,0.0
6115,2013-12-26,0.0,0.0
6115,2013-12-27,0.0,0.0
6115,2013-12-28,0.0,0.0
6115,2013-12-29,0.0,0.0
6115,2013-12-30,0.0,0.0
6115,2013-12-31,0.0,0.0
6115,2014-01-01,0.0,0.0


## Rolling Time Series
- Create 30-day rolling sum of precipitation by FIPS code
- Set individual year, month, day columns
- Export final precipitation dataset

In [25]:
# Length of the trailing window, in rows (one row per FIPS-day).
WINDOW_DAYS = 30

# Roll within each FIPS code. A plain .rolling() over the stacked series would
# let the first WINDOW_DAYS - 1 rows of every FIPS code absorb precipitation
# from the tail of the previous code; grouping first leaves those rows as NaN
# until a full window of that code's own data is available.
# NOTE(review): the window counts rows, not calendar days - this assumes each
# FIPS code has one row per consecutive day; confirm for dates outside the
# generated station x date grid.
fips_groups = agg_series.groupby(level='STCT_FIPS')
past30_ss = fips_groups['station_sum'].transform(lambda s: s.rolling(WINDOW_DAYS).sum())
past30_sm = fips_groups['station_mean'].transform(lambda s: s.rolling(WINDOW_DAYS).sum())
agg_series['past30_ss_sum'] = past30_ss
agg_series['past30_sm_sum'] = past30_sm

# Move 'date' out of the index into a regular column (STCT_FIPS stays as the
# index); reassigning instead of inplace=True keeps the step explicit.
agg_series = agg_series.reset_index('date')

agg_series['year'] = agg_series['date'].dt.year
agg_series['month'] = agg_series['date'].dt.month
agg_series['day'] = agg_series['date'].dt.day

agg_series.tail(10)

Unnamed: 0_level_0,date,station_sum,station_mean,past30_ss_sum,past30_sm_sum,year,month,day
STCT_FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6115,2013-12-23,0.0,0.0,1.57,0.3925,2013,12,23
6115,2013-12-24,0.0,0.0,1.57,0.3925,2013,12,24
6115,2013-12-25,0.0,0.0,1.57,0.3925,2013,12,25
6115,2013-12-26,0.0,0.0,1.57,0.3925,2013,12,26
6115,2013-12-27,0.0,0.0,1.57,0.3925,2013,12,27
6115,2013-12-28,0.0,0.0,1.57,0.3925,2013,12,28
6115,2013-12-29,0.0,0.0,1.57,0.3925,2013,12,29
6115,2013-12-30,0.0,0.0,1.57,0.3925,2013,12,30
6115,2013-12-31,0.0,0.0,1.47,0.3675,2013,12,31
6115,2014-01-01,0.0,0.0,1.47,0.3675,2014,1,1


In [78]:
# Export the final FIPS-day precipitation dataset (index is written too).
final_output_path = '../data/precip_agg_series.csv'
agg_series.to_csv(final_output_path)