In [None]:
import datetime
import re
import pandas as pd

In [18]:
# Column labels assigned, in order, to every sheet of the Edinburgh workbook.
# The six twilight columns follow a "<phase>_twilight_<start|end>" pattern, so
# they are generated instead of being spelled out one by one.
edinburgh_columns = [
    'date', 'sunrise', 'sunset', 'daylength', 'daylength_diff',
    *(
        f'{phase}_twilight_{edge}'
        for phase in ('astro', 'nautical', 'civil')
        for edge in ('start', 'end')
    ),
    'solar_noon_time', 'solar_noon_dist',
]

In [26]:
def drop_empty_rows(df, thresh=0.5):
    """Drop rows that have too few non-missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    thresh : float, default 0.5
        Fraction (0-1, not a percentage) of the columns that must be
        non-missing for a row to be kept.  The fraction is turned into a
        whole number of columns by truncation: ``int(thresh * n_columns)``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows with at least that many
        non-missing values.  Index labels of the kept rows are unchanged.
    """
    # ``DataFrame.dropna(thresh=N)`` KEEPS rows with >= N non-NA values, so
    # ``thresh`` is a minimum fill ratio rather than a maximum missing ratio.
    min_non_missing = int(thresh * len(df.columns))
    return df.dropna(thresh=min_non_missing)

In [None]:
def clean_columns(x):
    """Reduce a raw cell value to the characters used by times and numbers.

    The value is converted with ``str()``, the Unicode minus sign (U+2212) is
    replaced by an ASCII hyphen, and every character other than digits,
    ``(``, ``)``, ``.``, ``+``, ``-`` and ``:`` is removed.

    Parameters
    ----------
    x : object
        Any cell value; non-strings are stringified first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    x = str(x).replace('\u2212', '-')
    # Raw string + escaped hyphen: an unescaped "+-:" inside a character class
    # is a *range* (U+002B..U+003A) that would also let "," and "/" through.
    return re.sub(r"[^().+\-:\d]", '', x)

In [77]:
# Load every worksheet of the workbook as a {sheet name: DataFrame} mapping.
sheets = pd.read_excel('data/Edinburgh-daytime.xlsx', sheet_name=None)

all_sheets = []
for sheet_name, raw_sheet in sheets.items():
    # The sheet name is parsed as two year digits followed by the month number.
    year = 2000 + int(sheet_name[:2])
    month = int(sheet_name[2:])
    # Work on a copy so the frames kept in `sheets` stay exactly as read from
    # disk (renaming/dropping in place would silently alter them).
    df = raw_sheet.copy()
    # change column names
    df.columns = edinburgh_columns
    # drop first two rows and any empties
    df = drop_empty_rows(df.drop(index=[0, 1])).reset_index(drop=True)
    # clean columns (everything except the day-of-month column)
    for col in df.columns.drop('date'):
        df[col] = df[col].map(clean_columns)
    # keep only the text before the first "(" in the clock-time columns
    for col in ['sunrise', 'sunset', 'solar_noon_time']:
        df[col] = df[col].str.split('(').str[0]
    # update date: the column holds the day of the month; int() guards against
    # the day having been read as a float or a numeric string
    df['date'] = [datetime.date(year, month, int(day)) for day in df['date']]
    all_sheets.append(df)
edinburgh_df = pd.concat(all_sheets, ignore_index=True)
edinburgh_df

Unnamed: 0,date,sunrise,sunset,daylength,daylength_diff,astro_twilight_start,astro_twilight_end,nautical_twilight_start,nautical_twilight_end,civil_twilight_start,civil_twilight_end,solar_noon_time,solar_noon_dist
0,2011-12-01,08:18,15:44,07:25:45,-2:37,06:00:00,18:02:00,06:45:00,17:17:00,07:33:00,16:29:00,12:01,147.52
1,2011-12-02,08:20,15:43,07:23:13,-2:31,06:02:00,18:01:00,06:46:00,17:16:00,07:34:00,16:28:00,12:02,147.495
2,2011-12-03,08:21,15:42,07:20:48,-2:25,06:03:00,18:01:00,06:48:00,17:16:00,07:36:00,16:28:00,12:02,147.471
3,2011-12-04,08:23,15:41,07:18:29,-2:18,06:04:00,18:00:00,06:49:00,17:15:00,07:37:00,16:27:00,12:02,147.447
4,2011-12-05,08:24,15:41,07:16:17,-2:12,06:05:00,18:00:00,06:50:00,17:15:00,07:39:00,16:27:00,12:03,147.425
...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,2013-01-27,08:16,16:35,08:19:28,+3:48,06:05:00,18:46:00,06:48:00,18:03:00,07:34:00,17:17:00,12:25,147.317
424,2013-01-28,08:14,16:37,08:23:19,+3:51,06:03:00,18:48:00,06:47:00,18:04:00,07:32:00,17:19:00,12:25,147.336
425,2013-01-29,08:12,16:39,08:27:14,+3:54,06:02:00,18:50:00,06:45:00,18:06:00,07:31:00,17:21:00,12:25,147.355
426,2013-01-30,08:10,16:42,08:31:11,+3:57,06:00:00,18:52:00,06:44:00,18:08:00,07:29:00,17:23:00,12:26,147.375


In [78]:
# Load every worksheet of the Strathspey workbook, keyed by sheet name.
sheets = pd.read_excel('data/Strathspey-weather.xlsx', sheet_name=None)

# TODO: no cleaning is applied yet -- the sheets are concatenated exactly as
# read.  `month` and `year` are parsed from the sheet name but not used so far.
all_sheets = []
for k, df in sheets.items():
    month, year = int(k[2:]), 2000 + int(k[:2])
    all_sheets.append(df)
strathspey_df = pd.concat(all_sheets, ignore_index=True)
strathspey_df

Unnamed: 0,DEC.,DAILY STATISTICS,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,FEB.,MAR.,APR.,MAY,JUN.,JUL.,AUG.,SEP.,OCT.,NOV.
0,2011,TEMPERATURE,,,RAIN,PRESSURE,,WIND,,,...,,,,,,,,,,
1,,Mean,Min,Max.,mm,Mb.,Mb,Mean,Max.,Dom.,...,,,,,,,,,,
2,,°C,°C,°C,,,,mph,mph,dir'n.,...,,,,,,,,,,
3,DATE,0001-2359,24 hrs. to,24 hrs. from,24 hrs. from 0900 UTC,900,2100,0001-2359,0001-2359,0001-2359,...,,,,,,,,,,
4,,UTC,0900 UTC,0900 UTC,,UTC,UTC,UTC,UTC,UTC,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,,2.8,0.4,4.7,5.9,973,982,9.2,31,SSW,...,,,,,,,,,,
506,,2.7,0.6,6,0.3,988,974,9.9,33,SSW,...,,,,,,,,,,
507,,6.1,1.5,7.7,0.8,989,975,11.9,55,SSW,...,,,,,,,,,,
508,,4.8,4.4,6.2,4.6,981,997,18.6,54,SSW,...,,,,,,,,,,


In [79]:
# Preview the first ten rows of `df`.
# NOTE(review): `df` is not defined in this cell; it is whatever an earlier
# cell last bound to that name -- presumably the final sheet from the
# Strathspey loop. Confirm on a fresh kernel run.
df.head(10)

Unnamed: 0,JAN.,DAILY STATISTICS,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,2013,TEMPERATURE,,,RAIN,PRESSURE,,WIND,,,SUN
1,,Mean,Min,Max.,mm,Mb.,Mb,Mean,Max.,Dom.,Hours
2,,°C,°C,°C,,,,mph,mph,dir'n.,
3,DATE,0001-2359,24 hrs. to,24 hrs. from,24 hrs. from 0900 UTC,900,2100,0001-2359,0001-2359,0001-2359,0001-2359
4,,UTC,0900 UTC,0900 UTC,,UTC,UTC,UTC,UTC,UTC,UTC
5,1,2.6,0.2,5,8.8,994,1011,5.7,23,SSW,0.31
6,2,6.1,1.5,10.9,0.1,1010,1012,6.8,29,SW,2.94
7,3,8.9,3.9,10.4,0,1021,1024,10.6,32,SSW,1.98
8,4,8.5,7.2,10.4,0,1028,1029,8.5,32,SSW,0
9,5,5.8,1.4,10.2,0.1,1026,1024,3.8,23,SSW,0
