In [1]:
#!/usr/bin/python3

import re
import json
import requests
from collections import namedtuple
from bs4 import BeautifulSoup as bs

In [21]:
# URLs on the climate.weather.gc.ca site that the scraping helpers below use.
# BASE_URL is the site root; all_pages prepends it to the pagination hrefs.
BASE_URL = "http://climate.weather.gc.ca"
SEARCH_URL = "http://climate.weather.gc.ca/historical_data/search_historic_data_e.html"
# Page requested once per station, with the station's hidden form fields as
# query parameters.
DAILY_DATA_URL = "http://climate.weather.gc.ca/climate_data/daily_data_e.html"
# Station search endpoint (takes its criteria as query parameters).
SEARCH_STATIONS_URL = "http://climate.weather.gc.ca/historical_data/search_historic_data_stations_e.html"
# Station search pre-filled for province AB, years 1840-2018, 100 rows per page.
ALBERTA_STATIONS_URL = "http://climate.weather.gc.ca/historical_data/search_historic_data_stations_e.html?searchType=stnProv&timeframe=1&lstProvince=AB&optLimit=yearRange&StartYear=1840&EndYear=2018&Year=2018&Month=5&Day=8&selRowPerPage=100"

In [29]:
def _find_next_href(pager):
    """Return the href of the page following the active one, or None.

    ``pager`` is the parsed pagination container.  The entry right after the
    ``<li class="active">`` item is preferred; if that yields nothing the
    ``<a rel="next">`` link is used.  None means there is no further page.
    """
    items = pager.ul.find_all("li")
    for index, item in enumerate(items):
        classes = item.get("class") or []
        if classes and classes[0] == "active":
            # The active entry may be the last one in the list (last page).
            if index + 1 < len(items):
                link = items[index + 1].a
                if link is not None and link.get("href"):
                    return link["href"]
            break

    # Fall back to the explicit "next" link.  find() returns a Tag, so the
    # href has to be read from it before it can be joined to BASE_URL.
    link = pager.find("a", {"rel": "next"})
    if link is not None and link.get("href"):
        return link["href"]
    return None


def all_pages(start_page, func):
    """Walk a paginated listing and collect ``func``'s result for every page.

    Starting at ``start_page`` each page is downloaded, passed to ``func`` and
    then searched for the link to the following page.  The walk ends when no
    following page is found, or when any step raises: the error is printed
    and whatever was collected so far is returned.

    Args:
        start_page: URL of the page to start on (any page of the listing).
        func: Callable that receives the raw content of one page (the
            ``.content`` of the ``requests`` response) and returns any value.

    Returns:
        A list holding the return value of ``func`` for each visited page,
        in visiting order.
    """
    list_of_info = []
    current_page = start_page

    while current_page:
        try:
            # Grab page contents and apply the function
            content = requests.get(current_page).content
            list_of_info.append(func(content))

            # Parse the page; the pagination links live in the last
            # "pull-left text-left" div.
            soup = bs(content, 'lxml')
            div = soup.findAll("div", {"class": "pull-left text-left"})[-1]

            # Report the current page number
            current_num = div.find("li", {"class": "active"}).a.string
            print("Page: " + current_num)

            next_href = _find_next_href(div)
        except Exception as e:
            # Best effort: keep what has been collected so far.
            print(e)
            break

        # hrefs on the page are relative to the site root; None ends the loop.
        current_page = BASE_URL + next_href if next_href else None

    return list_of_info

In [140]:
# TODO make library
class date_range:
    """Inclusive start/end date pair, e.g. parsed from ``"2001-03-15|2018-05-08"``.

    Both ends are kept as the original ``YYYY-MM-DD`` text and as integer
    year/month/day components.  Components that are still None are filled in
    from the text the first time they are asked for.
    """

    def __init__(self, string=None):
        """Create an empty range, or parse ``string`` (``"start|end"``) if given."""
        # Always define every attribute so later lookups cannot fail with
        # AttributeError, even if parsing raises.
        self.start_date_string = None
        self.end_date_string = None
        self.start_year = None
        self.start_month = None
        self.start_day = None
        self.end_year = None
        self.end_month = None
        self.end_day = None
        if string is not None:
            self.parse_string(string)

    @staticmethod
    def _split_date(text):
        """Turn ``"YYYY-MM-DD"`` into an ``(year, month, day)`` tuple of ints.

        Raises:
            ValueError: if ``text`` has fewer than three ``-`` separated
                fields or a field is not an integer.
        """
        parts = text.split('-')
        if len(parts) < 3:
            raise ValueError("expected a date as 'YYYY-MM-DD', got %r" % (text,))
        return int(parts[0]), int(parts[1]), int(parts[2])

    @staticmethod
    def _fill_missing(parts, text):
        """Return ``parts`` with None entries taken from ``text``.

        Returns None when something is missing and there is no text to take
        it from.
        """
        if None not in parts:
            return tuple(parts)
        if text is None:
            return None
        parsed = date_range._split_date(text)
        return tuple(have if have is not None else new
                     for have, new in zip(parts, parsed))

    def parse_string(self, string):
        """Set both ends from a ``"YYYY-MM-DD|YYYY-MM-DD"`` string.

        Raises:
            ValueError: if ``string`` does not contain two parsable dates
                (this includes the empty range ``"|"``).
        """
        halves = string.split('|')
        if len(halves) < 2:
            raise ValueError("expected 'start|end', got %r" % (string,))

        # Parse first so a malformed string leaves the object untouched.
        start = self._split_date(halves[0])
        end = self._split_date(halves[1])

        self.set_date_string(halves[0], halves[1])
        self.set_start_date(*start)
        self.set_end_date(*end)

    def set_date_string(self, start, end):
        """Store the textual form of both ends without parsing them."""
        self.start_date_string = start
        self.end_date_string = end

    def set_start_date(self, year, month, day):
        """Set the start components; values are converted with ``int``."""
        self.start_year = int(year)
        self.start_month = int(month)
        self.start_day = int(day)

    def set_end_date(self, year, month, day):
        """Set the end components; values are converted with ``int``."""
        self.end_year = int(year)
        self.end_month = int(month)
        self.end_day = int(day)

    def get_start_date(self):
        """Return ``(year, month, day)`` of the start, or None if unknown."""
        resolved = self._fill_missing(
            (self.start_year, self.start_month, self.start_day),
            self.start_date_string)
        if resolved is None:
            return None
        self.start_year, self.start_month, self.start_day = resolved
        return resolved

    def get_end_date(self):
        """Return ``(year, month, day)`` of the end, or None if unknown."""
        resolved = self._fill_missing(
            (self.end_year, self.end_month, self.end_day),
            self.end_date_string)
        if resolved is None:
            return None
        self.end_year, self.end_month, self.end_day = resolved
        return resolved

    def get_start_date_string(self):
        """Return the start as zero-padded ``YYYY-MM-DD``, or None if unknown."""
        date = self.get_start_date()
        if date is None:
            return None
        return "%i-%02d-%02d" % date

    def get_end_date_string(self):
        """Return the end as zero-padded ``YYYY-MM-DD``, or None if unknown."""
        date = self.get_end_date()
        if date is None:
            return None
        return "%i-%02d-%02d" % date

    def is_leap_year(self, year):
        """Return True if ``year`` is a leap year in the Gregorian calendar."""
        year = int(year)
        return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

    def list_days(self, year, month):
        """Return the days of ``month`` in ``year`` that fall inside the range.

        The start month begins at the start day and the end month stops at
        the end day (inclusive); any other month is returned in full.

        Raises:
            ValueError: if ``month`` is not in 1..12.
        """
        year = int(year)
        month = int(month)
        if not 1 <= month <= 12:
            raise ValueError("month must be in 1..12, got %r" % (month,))

        last = END_OF_MONTH[month]
        if month == 2 and self.is_leap_year(year):
            last = 29
        first = 1

        if (year, month) == (self.start_year, self.start_month):
            first = self.start_day
        if (year, month) == (self.end_year, self.end_month):
            last = min(last, self.end_day)
        return range(first, last + 1)

    def list_months(self, year):
        """Return the months of ``year`` that fall inside the range.

        The start year begins at the start month and the end year stops at
        the end month (inclusive); both limits apply when the range starts
        and ends in the same year.  Any other year yields all twelve months.
        """
        year = int(year)
        first = self.start_month if year == self.start_year else 1
        last = self.end_month if year == self.end_year else 12
        return range(first, last + 1)

    def list_years(self):
        """Return every year from the start year to the end year, inclusive."""
        return range(self.start_year, self.end_year+1)


# Days per month in a non-leap year, indexed by month number (index 0 unused).
END_OF_MONTH = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

In [141]:
class url_obj:
    """A URL together with its query parameters and an optional date range.

    ``page`` holds the response of the most recent ``get`` call and is None
    until ``get`` has been called.
    """

    def __init__(self, url, payload=None, rng=None):
        """Remember the request details; nothing is fetched yet."""
        self.url, self.payload, self.range = url, payload, rng
        self.page = None

    def get(self):
        """Request ``url`` with ``payload`` as query parameters and keep the response."""
        response = requests.get(self.url, params=self.payload)
        self.page = response

    def grab_data(self, func):
        """Hand the stored page to ``func`` and return whatever it returns."""
        extracted = func(self.page)
        return extracted

    def list_days(self, year, month):
        """Placeholder, not implemented yet: always returns None."""
        return None

    def list_months(self, year):
        """Placeholder, not implemented yet: always returns None."""
        return None

    def list_years(self):
        """Placeholder, not implemented yet: always returns None."""
        return None

In [142]:
class station(object):
    """Plain record describing one weather station.

    ``__slots__`` restricts instances to the listed fields.  A new instance
    has ``lat`` and ``lon`` set to 0.0 and every other field set to the
    empty string.
    """
    __slots__ = ('name', 'prov', 'lat', 'lon', 'elevation',
                 'station_id', 'start_date', 'end_date',
                 'climate_id', 'wmo_id', 'tc_id')

    def __init__(self):
        # Every field starts out blank ...
        for field in self.__slots__:
            setattr(self, field, '')
        # ... except the coordinates, which are numeric.
        self.lat = self.lon = 0.0

In [149]:
def fill_fourm(content, year, month, day):
    """Build the query parameters for a station's daily-data request.

    Every hidden ``<input>`` inside ``content`` is copied as ``name: value``.
    Inputs without a ``name`` are skipped and inputs without a ``value``
    contribute an empty string, which mirrors what a browser would submit.
    ``timeframe``, ``year``, ``month`` and ``day`` are then set, overriding
    any hidden input of the same name.

    Args:
        content: Parsed form element offering ``find_all`` (a Beautiful Soup
            tag).
        year, month, day: Date to request; stored as strings.

    Returns:
        dict mapping parameter names to string values.
    """
    fourm = {}
    for element in content.find_all("input", {"type": "hidden"}):
        name = element.attrs.get("name")
        if not name:
            # A control without a name is never submitted.
            continue
        fourm[str(name)] = element.attrs.get("value", "")

    # "2" is the timeframe value this scraper sends with DAILY_DATA_URL.
    fourm["timeframe"] = "2"
    fourm["year"] = str(year)
    fourm["month"] = str(month)
    fourm["day"] = str(day)
    return fourm

def get_dates(content):
    """Return the range of dates that can be requested for one station form.

    ``content`` is a Beautiful Soup element.  Its hidden inputs ``dlyRange``,
    ``hlyRange`` and ``mlyRange`` are tried in that order and the first one
    holding an actual range is used; a value of ``'|'`` means the range is
    empty.

    Returns:
        A ``date_range`` built from the first non-empty range.

    Raises:
        ValueError: if none of the three inputs is present with a non-empty
            range.
    """
    for field in ("dlyRange", "hlyRange", "mlyRange"):
        found = content.find("input", {"type": "hidden", "name": field})
        if found is None:
            continue
        dates = found.attrs.get("value", "|")
        if dates and dates != '|':
            return date_range(dates)

    raise ValueError("no usable dlyRange/hlyRange/mlyRange value in form")

def _dms_to_degrees(contents):
    """Convert ``[deg, <sep>, min, <sep>, sec, ...]`` node contents to decimal degrees."""
    return (float(contents[0])
            + float(contents[2]) / 60
            + float(contents[4]) / 3600)


def _labelled_text(soup, label):
    """Return the first child of the div with ``aria-labelledby=label``.

    Returns None when the div is missing or has no contents.
    """
    div = soup.find("div", {"aria-labelledby": label})
    if div is None or not div.contents:
        return None
    return div.contents[0]


def get_station_info(page_obj):
    """Extract a ``station`` record from a station's daily-data page.

    Args:
        page_obj: Response object exposing ``url`` (the requested URL,
            including its query string) and ``content`` (the page body).

    Returns:
        A ``station`` with id, name, province, coordinates, elevation and the
        climate/WMO/TC identifiers filled in.  An identifier that is absent
        from the page is set to None; ``start_date`` and ``end_date`` are
        left untouched.

    Raises:
        AttributeError, IndexError, ValueError: if the page lacks the name
            header or a parsable latitude/longitude.
    """
    from urllib.parse import parse_qs, urlsplit

    s = station()

    # Parse the query string properly so StationID is found wherever it
    # appears (a plain split on "&" misses it when it is the first parameter).
    query = parse_qs(urlsplit(page_obj.url).query)
    s.station_id = query.get("StationID", [''])[0]

    page = page_obj.content
    soup = bs(page, 'lxml')
    name_content = soup.find("p", {"class": "text-center table-header pdng-md mrgn-bttm-0"}).contents
    s.name = name_content[0]
    s.prov = name_content[2]

    print(s.name)

    lat = soup.find("div", {"aria-labelledby": "latitude"}).contents
    s.lat = _dms_to_degrees(lat)

    # NOTE(review): the page is assumed to give longitude as a positive
    # west angle (the original code already subtracted it from 360) - confirm.
    # The whole angle has to be subtracted; previously only the degrees were,
    # while minutes and seconds were added back on.
    lon = soup.find("div", {"aria-labelledby": "longitude"}).contents
    s.lon = 360 - _dms_to_degrees(lon)

    elevation = _labelled_text(soup, "elevation")
    if elevation is not None:
        s.elevation = elevation

    # Optional identifiers: None when the page does not list them.
    s.climate_id = _labelled_text(soup, "climateid")
    s.wmo_id = _labelled_text(soup, "wmoid")
    s.tc_id = _labelled_text(soup, "tcid")

    return s

def all_stations(page, func=get_station_info):
    """Fetch and describe every station listed on one search-result page.

    For each station form on ``page`` the available date range is read, the
    station's daily-data page for the last available date is requested, and
    ``func`` turns that response into a station record.

    Args:
        page: Content of a search-result page (anything Beautiful Soup can
            parse).
        func: Callable taking the ``requests`` response of the daily-data
            page and returning an object with ``start_date`` and
            ``end_date`` attributes.  Defaults to ``get_station_info``.

    Returns:
        List of the objects returned by ``func``, with ``start_date`` and
        ``end_date`` set to the station's first and last available date
        (``YYYY-MM-DD``).
    """
    soup = bs(page, 'lxml')

    # A list of all the stations on the page
    results = soup.find_all("form", {"id": re.compile("stnRequest[0-9]+-sm")})
    print('%i of results on this page\n' % len(results))

    list_of_stations = []
    for element in results:
        dates = get_dates(element)

        # Ask for the last day with data so the page is sure to exist.
        fourm = fill_fourm(element, *dates.get_end_date())
        page_obj = requests.get(DAILY_DATA_URL, params=fourm)

        s = func(page_obj)
        s.start_date = dates.get_start_date_string()
        # Previously left at its default although the value is known here.
        s.end_date = dates.get_end_date_string()
        list_of_stations.append(s)
    return list_of_stations
    
    
def search_station(name):
    """Search stations by name and return the records of the first result page.

    Args:
        name: Station name (or part of it) sent as ``txtStationName``.

    Returns:
        The list produced by ``all_stations`` for the returned page.
    """
    payload = {
        "searchType": "stnName",
        "timeframe": "1",
        "txtStationName": name,
        "optLimit": "yearRange",
        "StartYear": "1840",
        "EndYear": "2018",
        "Year": "2018",
        "Month": "4",
        "Day": "28",
        "selRowPerPage": "100",
    }
    content = requests.get(SEARCH_STATIONS_URL, params=payload).content
    # The page parser in this notebook is all_stations; the name called here
    # before (get_stations) is not defined anywhere in it.
    return all_stations(content)

In [None]:
# Walk every page of the Alberta station listing.  Each entry of the result is
# the list returned by all_stations for one page.  all_stations issues one
# extra request per station, so this cell is slow.
list_of_list_of_stations = all_pages(ALBERTA_STATIONS_URL, all_stations)

101 of results on this page

(AE) BOW SUMMIT
ABEE AGDM
ACADIA VALLEY
ACADIA VALLEY
ACADIA VALLEY CDA EPF
ACADIA VALLEY EXP ST
ACADIA VALLEY MACTAVISH
ACADIA VALLEY VANDYNE
ACME CDA EPF
ADAIR LO
ADAMS CREEK LO
ADEN
AIRDRIE
AKAMINA PASS
ALBERT HALL AGCM
ALDER FLATS LO
ALGAR LO
ALIX
ALIX
ALLIANCE
ALLIANCE AGCM
ALLIANCE GREENVIEW
ALLIANCE SOUTH
ALSASK
ALTAWAN
AMBER LO
AMISK
ANDREW
ANDREW AGDM
ANSELL LO
ANSELMO
ANTHRACITE
ANTLER HILL
ANZAC
ANZAC
ARDENVILLE
ARMADA EXP ST
ARMENA
ARNESON
ARROWWOOD
ASSUMPTION
ASSUMPTION
ATHABASCA
ATHABASCA 1
ATHABASCA 2
ATHABASCA 3
ATHABASCA AGCM
ATHABASCA EXP ST
ATHABASCA LANDING
ATHABASCA LO
ATIKAMEG
ATLEE
ATLEE AGCM
ATMORE
ATMORE AGDM
AURORA LO
AZURE
BAIRD LAKE
BALD MOUNTAIN LO
BALDY LO
BALLATER
BALLATER AGCM
BALM
BANANA BELT
BANFF
BANFF (AUT)
BANFF CR10
BANFF CS
BANFF SPRINGS
BAPTISTE LAKE
BARNWELL AGDM
BARONS AGCM
BARONS EXP ST
BARONS EXP ST 2
BARRHEAD
BARRHEAD CS
BARRIER LAKE
BASELINE LO
BASHAW
BASHAW
BASNETT
BASSANO AGCM
BASSANO DAM
BASSANO GEM
BASSET LO

GOOSEBERRY LAKE
GOOSEBERRY LAKE AGCM
GORDON LAKE LO
GRANDE CACHE
GRANDE CACHE AUTO
GRANDE CACHE MILNER
GRANDE CACHE RS
GRANDE CACHE S.T.P.
GRANDE LO
GRANDE PRAIRIE
GRANDE PRAIRIE A
GRANDE PRAIRIE A
Page: 6
101 of results on this page

GRANDE PRAIRIE A
GRANDE PRAIRIE CR21X
GRANUM JUMBO VALLEY
GRASSY LAKE
GRASSY LAKE
GRASSY LAKE RIVERS DEV
GRAVE FLATS LO
GROSMONT
GROTON
GROUARD
GROUARD
GROUND ZERO AFS
GROVEDALE RS
GULL LAKE GOLF COURSE
GUY
GWYNNE
HACKETT
HAILSTONE BUTTE LO
HALKIRK
HALKIRK AGCM
HAND HILLS AGCM
HANNA
HARDISTY
HARMATTAN
HASTINGS LAKE
HAWK HILLS AGCM
HAWK HILLS LO
HAY CAMP TOWER
HAY LAKES RS
HAY RIVER RS
HAYS
HEART LAKE LO
HEISLER 10S
HELDAR
HEMARUKA
HEMARUKA AGCM


In [66]:
with open()

3 of results on this page

<class 'bs4.element.ResultSet'>
<__main__.date_range object at 0x7f25e89d1d30>
<__main__.date_range object at 0x7f25e89d1d30>
<__main__.date_range object at 0x7f25e89d1d30>


In [69]:
def count_recent(data, year=2018):
    """Count the stations on one result page whose data ends in ``year``.

    Args:
        data: Content of a search-result page (anything Beautiful Soup can
            parse).
        year: Year the station's last available date must fall in.
            Defaults to 2018, the value that used to be hard-coded.

    Returns:
        Number of station forms on the page whose end year equals ``year``.
    """
    soup = bs(data, 'lxml')
    # A list of all the stations on the page
    results = soup.find_all("form", {"id": re.compile("stnRequest[0-9]+-sm")})

    wanted = str(year)
    return sum(
        1 for element in results
        if str(get_dates(element).get_end_date()[0]) == wanted
    )

# Run count_recent on every page of the station listing (lstProvince is left
# empty in this query).  `result` ends up with one count per page.
result = all_pages("http://climate.weather.gc.ca/historical_data/search_historic_data_stations_e.html?searchType=stnProv&timeframe=1&lstProvince=&optLimit=yearRange&StartYear=1840&EndYear=2018&Year=2018&Month=4&Day=28&selRowPerPage=100", count_recent)

Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
Page: 31
Page: 32
Page: 33
Page: 34
Page: 35
Page: 36
Page: 37
Page: 38
Page: 39
Page: 40
Page: 41
Page: 42
Page: 43
Page: 44
Page: 45
Page: 46
Page: 47
Page: 48
Page: 49
Page: 50
Page: 51
Page: 52
Page: 53
Page: 54
Page: 55
Page: 56
Page: 57
Page: 58
Page: 59
Page: 60
Page: 61
Page: 62
Page: 63
Page: 64
Page: 65
Page: 66
Page: 67
Page: 68
Page: 69
Page: 70
Page: 71
Page: 72
Page: 73
Page: 74
Page: 75
Page: 76
Page: 77
Page: 78
Page: 79
Page: 80
Page: 81
Page: 82
Page: 83
Page: 84
Page: 85
Page: 86
Page: 87
Page: 88


In [70]:
# Add up the per-page counts held in `result`.
summ = sum(int(page_count) for page_count in result)
print(summ)

1406
