In [2]:
from urllib.request import urlretrieve
import pandas as pd


This notebook allowed for the automated downloading of the data from the government of Canada's public database. 

In [3]:
weather_stations=pd.read_csv('stations_shp.csv')

In [4]:
weather_stations.head()

Unnamed: 0.1,Unnamed: 0,Station Name,Province,Latitude,Longitude,Elevation,Climate ID,WMO Identifier,TC Identifier,First Year,Last Year,HLY First Year,HLY Last Year,DLY First Year,DLY Last Year,MLY First Year,MLY Last Year,geometry
0,0,100 MILE HOUSE 6NE,BRITISH COLUMBIA,51.68,-121.22,928.0,1165793,,,1987,2023,,,1987.0,2023.0,1987.0,2007.0,POINT (-121.22 51.68)
1,1,ABEE AGDM,ALBERTA,54.28,-112.97,664.0,3010010,71285.0,XAF,1990,2024,1990.0,2024.0,2002.0,2024.0,2002.0,2007.0,POINT (-112.97 54.28)
2,2,ADDENBROKE ISLAND,BRITISH COLUMBIA,51.6,-127.86,21.3,1060080,,WCZ,1978,2024,1994.0,2001.0,1978.0,2024.0,1978.0,2007.0,POINT (-127.86 51.6)
3,3,AGASSIZ CDA,BRITISH COLUMBIA,49.24,-121.76,15.0,1100120,,,1889,2023,,,1889.0,2023.0,1889.0,2007.0,POINT (-121.76 49.24)
4,4,AGASSIZ RCS,BRITISH COLUMBIA,49.24,-121.76,19.3,1100119,71113.0,WZA,1988,2024,1994.0,2024.0,1988.0,2024.0,1988.0,2006.0,POINT (-121.76 49.24)


In [5]:
weather_stations.shape

(435, 18)

I'll be using the `weather_stations` table to loop through and download all the data needed from the following site:
https://dd.weather.gc.ca/climate/observations/daily/csv/


To do that cleanly, I will have to make a small adjustment to the way the provinces are currently displayed in the table. We will have to change the full name of the province to their respective 2 letter abbreviation. 

In [6]:
weather_stations['Province'].value_counts()

Province
QUEBEC                   146
BRITISH COLUMBIA         123
ONTARIO                   43
ALBERTA                   30
SASKATCHEWAN              28
MANITOBA                  22
NUNAVUT                   12
NEWFOUNDLAND              11
NOVA SCOTIA                6
NEW BRUNSWICK              5
YUKON TERRITORY            4
NORTHWEST TERRITORIES      3
PRINCE EDWARD ISLAND       2
Name: count, dtype: int64

In [10]:
weather_stations= weather_stations.replace({'Province':{'BRITISH COLUMBIA':'BC','ONTARIO':'ON','QUEBEC':'QC','SASKATCHEWAN':'SK','ALBERTA':'AB','MANITOBA':'MB','NORTHWEST TERRITORIES':'NW','NEWFOUNDLAND':'NL','YUKON TERRITORY':'YK','NUNAVUT':'NU','NEW BRUNSWICK':'NB','NOVA SCOTIA':'NS','PRINCE EDWARD ISLAND':'PE'}})

In [11]:
weather_stations_list=list(weather_stations['Province'].unique())

In [12]:
weather_stations_list

['BC', 'AB', 'NU', 'ON', 'QC', 'NL', 'NB', 'MB', 'SK', 'NS', 'NW', 'YK', 'PE']

In [8]:
weather_stations.head()

Unnamed: 0.1,Unnamed: 0,Station Name,Province,Latitude,Longitude,Elevation,Climate ID,WMO Identifier,TC Identifier,First Year,Last Year,HLY First Year,HLY Last Year,DLY First Year,DLY Last Year,MLY First Year,MLY Last Year,geometry
0,0,100 MILE HOUSE 6NE,BC,51.68,-121.22,928.0,1165793,,,1987,2023,,,1987.0,2023.0,1987.0,2007.0,POINT (-121.22 51.68)
1,1,ABEE AGDM,AB,54.28,-112.97,664.0,3010010,71285.0,XAF,1990,2024,1990.0,2024.0,2002.0,2024.0,2002.0,2007.0,POINT (-112.97 54.28)
2,2,ADDENBROKE ISLAND,BC,51.6,-127.86,21.3,1060080,,WCZ,1978,2024,1994.0,2001.0,1978.0,2024.0,1978.0,2007.0,POINT (-127.86 51.6)
3,3,AGASSIZ CDA,BC,49.24,-121.76,15.0,1100120,,,1889,2023,,,1889.0,2023.0,1889.0,2007.0,POINT (-121.76 49.24)
4,4,AGASSIZ RCS,BC,49.24,-121.76,19.3,1100119,71113.0,WZA,1988,2024,1994.0,2024.0,1988.0,2024.0,1988.0,2006.0,POINT (-121.76 49.24)


Next I'm creating a date range to add into my `for` loop to ensure the loop downloads only the files that have those dates. 

In [18]:

date_range = pd.date_range('1990','2023',freq='ME').strftime('%Y-%m')

date_range


Index(['1990-01', '1990-02', '1990-03', '1990-04', '1990-05', '1990-06',
       '1990-07', '1990-08', '1990-09', '1990-10',
       ...
       '2022-03', '2022-04', '2022-05', '2022-06', '2022-07', '2022-08',
       '2022-09', '2022-10', '2022-11', '2022-12'],
      dtype='object', length=396)

In [17]:
#creating an empty dataframe to start
weather_data_df=pd.DataFrame() 

In [19]:


# downloading weather data
for prov in weather_stations_list: #going through each province
    
    station= list(weather_stations['Climate ID'][weather_stations['Province']==prov].unique()) #going through each station from the province
    for id in station:
       
        for date in date_range: #selecting the date range for each province
            
            try:
                download=urlretrieve(url=f'https://dd.weather.gc.ca/climate/observations/daily/csv/{prov}/climate_daily_{prov}_{id}_{date}_P1D.csv',filename=f'Data/Weather_Daily/climate_daily_{prov}_{id}_{date}_P1D.csv')
                print(f'Downloading from {prov}')
            except:
                print('pass')
                continue                 



Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC
Downloading from BC


In [None]:
weather_data_df.to_csv('Data/weather_download.csv')

In [None]:
weather_data_df['Climate ID'].nunique()

230

In [None]:
weather_data_df.shape

(1165588, 31)

Below I'm reading the downloaded csv so that I can come back to this notebook without having to download the data again

In [None]:
weather_datadf= pd.read_csv('Data/weather_download.csv')

In [None]:
weather_datadf.isna().sum()

Unnamed: 0                         0
Longitude (x)                      0
Latitude (y)                       0
Station Name                       0
Climate ID                         0
Date/Time                          0
Year                               0
Month                              0
Day                                0
Data Quality                 1027129
Max Temp (°C)                  75451
Max Temp Flag                1124506
Min Temp (°C)                  75462
Min Temp Flag                1119519
Mean Temp (°C)                 86247
Mean Temp Flag               1110163
Heat Deg Days (°C)             86247
Heat Deg Days Flag           1110163
Cool Deg Days (°C)             86247
Cool Deg Days Flag           1110163
Total Rain (mm)               403599
Total Rain Flag               867860
Total Snow (cm)               408367
Total Snow Flag               846521
Total Precip (mm)             132428
Total Precip Flag             994967
Snow on Grnd (cm)             458866
S

Above we can see that there is quite a high number of missing information in a lot of our temperature and climate data.

I will be looking at options whether adding more stations or itterating the missing data may be ideal.

In [None]:
#DOWNLOADING MONTHLY DATA

for prov in weather_stations['Province']: #going through each province
    
    station= weather_stations['Climate ID'][weather_stations['Province']== prov] #going through each station from the province
    for id in station:
       
        for date in date_range: #selecting the date range for each province
            
            try:
                download=urlretrieve(url=f'https://dd.weather.gc.ca/climate/observations/monthly/csv/{prov}/climate_daily_{prov}_{id}_{date}_P1D.csv',filename=f'Data/Weather/climate_daily_{prov}_{id}_{date}_P1M.csv')
                place_holder = pd.read_csv(f'Data/Weather/climate_daily_{prov}_{id}_{date}_P1D.csv')
                weather_data_df= pd.concat(weather_data_df,place_holder) #this was a mistake to add in my for loop - it exponentially increased the run time for my code
                print(f'Downloading from {prov}')
            except:
                print('pass')
                continue        