In [238]:
%matplotlib inline
import pandas as pd
import datetime
import os
from matplotlib import pyplot as plt; plt.rcParams['figure.figsize'] = 15, 5

DATA_DIR = '../data/raw/'
#The files are given as fixed width files.
#See Documentation in ../data/
WY2_colspecs = [(0, 5), (5, 6), (6, 16), (16, 20), (20, 24),
                (24, 26), (26, 30), (30, 32), (32, 36), (36, 38),
                (38, 42), (42, 43), (43, 47), (47, 48), (48, 52),
                (52, 53), (53, 57), (57, 58), (58, 60), (60, 61),
                (61, 65), (65, 66), (66, 70), (70, 71), (71, 75),
                (75, 76), (76, 84), (84, 85), (85, 90), (90, 91),
                (91, 92), (92, 95), (95, 96), (96, 100), (100, 101),
                (101, 104), (104, 105), (105, 109), (109, 110),
                (110, 112), (112, 113), (113, 115), (115, 116),
                (116, 117), (117, 118)]

WY2_usefulcols = [(6, 16), (91, 95), (95, 96)] #Time, temperature, temp flag
col_names = ['Time', 'T', 'T_flag'] #Flags indicate missing or estimated data

In [86]:
#Hours are from 01...24, but we can only parse 00...23.
#so subtract 1 from the hour, then add back the time difference
#hour 24 will become 00 of the next day.
td_1hr = datetime.timedelta(hours = 1)
D = pd.read_fwf(DATA_DIR + '/SASKATCHEWAN/ReginaA_1953-2005/25005.WY2', colspecs = WY2_usefulcols,
                header = None, names = col_names, parse_dates = True, nrows = 100,
                date_parser = lambda d: pd.to_datetime(str(int(d) - 1), format = '%Y%m%d%H') + td_1hr)

In [87]:
D.head()

Unnamed: 0,Time,T,T_flag
0,1953010101,-106,
1,1953010102,-122,
2,1953010103,-128,
3,1953010104,-128,
4,1953010105,-122,


There is also a metadata file that contains lat long coordinates of stations, identified by their WBAN number.

Further, the time recordings at each station are given in "local standard time" (LST), and the mlong coordinate provides a means to convert to UTC (universal time): LST = UTC - MLONG/15, or UTC = LST + MLONG/15.  I will convert all of the times into UTC

In [222]:
col_names = ['Name',  'WBAN',   'lat',    'long',  'mlong', 'first_year', 'last_year']
loc_cols =  [(0,24), (24, 30), (45, 52), (52, 58), (59, 65),  (74, 76),     (77, 79)]

In [223]:
D = pd.read_fwf(DATA_DIR + 'locations.txt', colspecs = loc_cols, comment = '#',
                header = None, names = col_names)

In [224]:
D.head()

Unnamed: 0,Name,WBAN,lat,long,mlong,first_year,last_year
0,CALGARY INT'L. A,25110,51.1,114.02,105.0,53,5
1,COLD LAKE A,25129,54.42,110.28,105.0,54,5
2,CORONATION,25113,52.1,111.45,105.0,53,94
3,COWLEY A,CAN43,49.63,114.08,105.0,53,59
4,EDMONTON INT'L. A,25142,53.32,113.58,105.0,61,5


In [251]:
#The year is specified only with the last 2 digits
#but, data collection started in after 1950 and ended before 2050
def fix_year(yr : int):
    if yr > 50:
        yr += 1900
    else:
        yr += 2000
    return yr

In [226]:
D.loc[:, ['first_year', 'last_year']] = D.loc[:, ['first_year', 'last_year']].applymap(fix_year)

In [227]:
D.head()

Unnamed: 0,Name,WBAN,lat,long,mlong,first_year,last_year
0,CALGARY INT'L. A,25110,51.1,114.02,105.0,1953,2005
1,COLD LAKE A,25129,54.42,110.28,105.0,1954,2005
2,CORONATION,25113,52.1,111.45,105.0,1953,1994
3,COWLEY A,CAN43,49.63,114.08,105.0,1953,1959
4,EDMONTON INT'L. A,25142,53.32,113.58,105.0,1961,2005


In [252]:
def time_correction(mlong : float):
    '''The time delta to add to an LST time to yield a UTC time,
    given the prime meridian mlong in degrees.'''
    return datetime.timedelta(minutes = mlong / 15)

In [229]:
D['time_correction'] = D.loc[:, 'mlong'].apply(time_correction)

In [230]:
D.head()

Unnamed: 0,Name,WBAN,lat,long,mlong,first_year,last_year,time_correction
0,CALGARY INT'L. A,25110,51.1,114.02,105.0,1953,2005,00:07:00
1,COLD LAKE A,25129,54.42,110.28,105.0,1954,2005,00:07:00
2,CORONATION,25113,52.1,111.45,105.0,1953,1994,00:07:00
3,COWLEY A,CAN43,49.63,114.08,105.0,1953,1959,00:07:00
4,EDMONTON INT'L. A,25142,53.32,113.58,105.0,1961,2005,00:07:00


In [231]:
del D['mlong'] #No longer needed

In [232]:
D.head()

Unnamed: 0,Name,WBAN,lat,long,first_year,last_year,time_correction
0,CALGARY INT'L. A,25110,51.1,114.02,1953,2005,00:07:00
1,COLD LAKE A,25129,54.42,110.28,1954,2005,00:07:00
2,CORONATION,25113,52.1,111.45,1953,1994,00:07:00
3,COWLEY A,CAN43,49.63,114.08,1953,1959,00:07:00
4,EDMONTON INT'L. A,25142,53.32,113.58,1961,2005,00:07:00


In [253]:
def wban_fname(wban):
    '''Convert the WBAN string into the filename we need to look for'''
    for root, dirs, files in os.walk(DATA_DIR):
        for f in files:
            if f.endswith('WY2') and f.startswith(wban):
                return root + '/' + f
    return 404
%timeit wban_fname('xxx') #check time to traverse whole directory structure

2.98 ms ± 30.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [243]:
D['WBAN_file'] = D['WBAN'].apply(wban_fname)

In [244]:
D.head()

Unnamed: 0,Name,WBAN,lat,long,first_year,last_year,time_correction,WBAN_file
0,CALGARY INT'L. A,25110,51.1,114.02,1953,2005,00:07:00,../data/raw/ALBERTA/CalgaryInt'l.A_1953-2005/2...
1,COLD LAKE A,25129,54.42,110.28,1954,2005,00:07:00,../data/raw/ALBERTA/ColdLakeA_1954-2005/25129.WY2
2,CORONATION,25113,52.1,111.45,1953,1994,00:07:00,../data/raw/ALBERTA/Coronation_1953-1994/25113...
3,COWLEY A,CAN43,49.63,114.08,1953,1959,00:07:00,../data/raw/ALBERTA/CowleyA_1953-1959/CAN43.WY2
4,EDMONTON INT'L. A,25142,53.32,113.58,1961,2005,00:07:00,../data/raw/ALBERTA/EdmontonInt'l.A_1961-2005/...


In [245]:
D['WBAN_file'][0]

"../data/raw/ALBERTA/CalgaryInt'l.A_1953-2005/25110.WY2"

In [254]:
os.getcwd()

'/home/ubuntu/science/dwglasso_cweeds/notebooks'

In [255]:
os.c

b'/home/ubuntu/science/dwglasso_cweeds/notebooks'