The Objective of this notebook is to go through a list of weather stations and collect the weather data for each station. The data is collected from the Canada open data portal. Data for each station is combined into a single table then stored in a database for further analysis.

In [2]:
import os
import sys
import pandas as pd
import geopandas as gpd
import pygeos as pg
import numpy as np
import tensorflow as tf
import sqlalchemy as sq
import ipyparallel as ipp
from IPython.display import clear_output
from matplotlib import pyplot as plt
from ClimateDataRequester import ClimateDataRequester as cdr

# The following lines adjust the granularity of reporting.
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format
pd.set_option('display.max_columns', None)
os.chdir('/tf')


2022-11-21 16:48:49.032148: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-21 16:48:49.152743: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
db_connection_url = "postgresql://grpthreeuser:grpthreeuser@postgres:5432/grpthreedb"
engine = sq.create_engine(db_connection_url)
db_con = engine.connect()


In [4]:
tableName = "public.\"lgFireStationsTen\""
query = "SELECT * FROM " + tableName + ";"
dfStations = gpd.GeoDataFrame.from_postgis(query, db_con)


In [5]:
def push_data(df: pd.DataFrame) -> None:
    df.to_sql("WeatherData", db_con, if_exists="append", index=False)


In [34]:
def dataProcessA(df: pd.DataFrame, stationID: str) -> None:
    try:
        df.drop(columns=['Data Quality', 'Max Temp Flag', 'Mean Temp Flag', 'Min Temp Flag', 'Heat Deg Days Flag', 'Cool Deg Days Flag', 'Spd of Max Gust (km/h)',
                         'Total Rain Flag', 'Total Snow Flag', 'Total Precip Flag', 'Snow on Grnd Flag', 'Dir of Max Gust Flag', 'Spd of Max Gust Flag',
                         'Heat Deg Days (°C)', 'Cool Deg Days (°C)', 'Longitude (x)', 'Latitude (y)', 'Station Name', 'Dir of Max Gust (10s deg)'], inplace=True)
    except:
        df.to_csv("Failed/" + str(df.iloc[0, 0]) +
                  "_unexpected_column_names.csv", index=False)

    # Climate ID	Date/Time	Year	Month	Day	Max Temp (Â°C)	Min Temp (Â°C)	Mean Temp (Â°C)	Total Rain (mm)	Total Snow (cm)	Total Precip (mm)	Snow on Grnd (cm)	Dir of Max Gust (10s deg)	Spd of Max Gust (km/h)
    # ClimateID Date Year Month Day MaxTemp MinTemp MeanTemp TotalRain TotalSnow TotalPrecip SnowOnGrnd DirOfMaxGust SpdOfMaxGust
    df.rename(columns={df.columns[0]: "ClimateID"}, inplace=True)
    df.rename(columns={df.columns[1]: "Date"}, inplace=True)
    df.rename(columns={df.columns[2]: "Year"}, inplace=True)
    df.rename(columns={df.columns[3]: "Month"}, inplace=True)
    df.rename(columns={df.columns[4]: "Day"}, inplace=True)
    df.rename(columns={df.columns[5]: "MaxTemp"}, inplace=True)
    df.rename(columns={df.columns[6]: "MinTemp"}, inplace=True)
    df.rename(columns={df.columns[7]: "MeanTemp"}, inplace=True)
    df.rename(columns={df.columns[8]: "TotalRain"}, inplace=True)
    df.rename(columns={df.columns[9]: "TotalSnow"}, inplace=True)
    df.rename(columns={df.columns[10]: "TotalPrecip"}, inplace=True)
    df.rename(columns={df.columns[11]: "SnowOnGrnd"}, inplace=True)

    df.dropna(subset=['MeanTemp'], inplace=True)
    df.loc[df['SnowOnGrnd'].isnull(), 'SnowOnGrnd'] = 0
    df.loc[df['TotalRain'].isnull(), 'TotalRain'] = 0
    df.loc[df['TotalSnow'].isnull(), 'TotalSnow'] = 0
    df.loc[df['TotalPrecip'].isnull(), 'TotalPrecip'] = 0
    df['MaxTemp'] = np.where(df['MaxTemp'].isnull(),
                             df['MeanTemp'], df['MaxTemp'])
    df['MinTemp'] = np.where(df['MinTemp'].isnull(),
                             df['MeanTemp'], df['MinTemp'])

    df[['ClimateID', 'Date']] = df[['ClimateID', 'Date']].astype(str)
    df[['Year', 'Month', 'Day']] = df[['Year', 'Month', 'Day']].astype(int)
    df[['MaxTemp', 'MinTemp', 'MeanTemp', 'TotalRain', 'TotalSnow', 'TotalPrecip', 'SnowOnGrnd']] = df[[
        'MaxTemp', 'MinTemp', 'MeanTemp', 'TotalRain', 'TotalSnow', 'TotalPrecip', 'SnowOnGrnd']].astype(float)

    # we try a db push, but if it fails, we place the data in a csv file
    # try:
    push_data(df)
    db_con.execute(
        "UPDATE public.\"lgFireStationsTen\" SET \"dataAvailable\" = True WHERE \"ClimateID\" like {};".format(stationID))
    # except:
    #     df.to_csv("Failed/" + str(df.iloc[0, 0]) +
    #             "_data_failed_dbpush.csv", index=False)


In [36]:
requester = cdr()

provinces = {'brit': "BC", 'albe': "AB", 'sask': "SK", 'mani': "MB", 'onta': "ON", 'queb': "QC",
             'nuna': "NU", 'yuko': "YT", 'nort': "NT", 'newf': "NL", 'prin': "PE", 'nova': "NS", 'new ': "NB"}

# for each station, we will request the weather data for the years 2010 to 2022
df = pd.DataFrame()
for index, row in dfStations.iterrows():

    if row['dataAvailable'] != True:

        key = row['Province'].lower()[:4]
        province = provinces[key]

        if province == "AB" or province == "SK" or province == "MB" or province == "YT":
            stationID = str(row['ClimateID'])
            df = requester.get_data(province, stationID, 2010, 2022)
            clear_output(wait=False)

            if not df.empty:
                dataProcessA(df, stationID)
            else:
                db_con.execute(
                    "UPDATE public.\"lgFireStationsTen\" SET \"dataAvailable\" = False WHERE \"ClimateID\" like {};".format(stationID))

        else:
            print("Province not wanted: " + province)

    else:
        print("Data for station " + str(row['ClimateID']) + " already exists.")

    print("Processed row " + str(index) + " of " + str(len(dfStations)))


ProgrammingError: (psycopg2.errors.UndefinedColumn) column "3070043" does not exist
LINE 1: ...SET "dataAvailable" = True WHERE "ClimateID" like "3070043";
                                                             ^

[SQL: UPDATE public."lgFireStationsTen" SET "dataAvailable" = True WHERE "ClimateID" like "3070043";]
(Background on this error at: https://sqlalche.me/e/14/f405)