In [150]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

from datetime import datetime, timedelta

### Loading Data
Loading and displaying the two ottawa datasets  
One is labelled as missing as we will use this data set to supplement the missing values from the IntlA station
We use the closest weather station to each IntlA station and take the date range from 2020-10-27 to 2021-02-27

In [151]:
climate_ottawa = pd.read_csv(r"Ottawa_IntlA_Total.csv")
climate_ottawa_missing = pd.read_csv(r"Ottawa_CDA_Total.csv")

climate_ottawa.tail()

Unnamed: 0,Longitude (x),Latitude (y),Station Name,Climate ID,Date/Time,Year,Month,Day,Data Quality,Max Temp (°C),...,Total Snow (cm),Total Snow Flag,Total Precip (mm),Total Precip Flag,Snow on Grnd (cm),Snow on Grnd Flag,Dir of Max Gust (10s deg),Dir of Max Gust Flag,Spd of Max Gust (km/h),Spd of Max Gust Flag
424,-75.67,45.32,OTTAWA INTL A,6106001,2021-02-28,2021,2,28,,4.4,...,0.0,T,1.0,,58.0,,15.0,,41.0,
425,-75.67,45.32,OTTAWA INTL A,6106001,2021-03-01,2021,3,1,,3.8,...,0.0,T,1.5,,52.0,,31.0,,68.0,
426,-75.67,45.32,OTTAWA INTL A,6106001,2021-03-02,2021,3,2,,-7.3,...,6.4,,4.0,,48.0,,30.0,,58.0,
427,-75.67,45.32,OTTAWA INTL A,6106001,2021-03-03,2021,3,3,,-0.4,...,0.0,T,0.0,T,55.0,,32.0,,33.0,
428,-75.67,45.32,OTTAWA INTL A,6106001,2021-03-04,2021,3,4,,-4.9,...,0.0,,0.0,,55.0,,33.0,,48.0,


In [152]:
climate_ottawa_missing.tail()

Unnamed: 0,Longitude (x),Latitude (y),Station Name,Climate ID,Date/Time,Year,Month,Day,Data Quality,Max Temp (°C),...,Total Snow (cm),Total Snow Flag,Total Precip (mm),Total Precip Flag,Snow on Grnd (cm),Snow on Grnd Flag,Dir of Max Gust (10s deg),Dir of Max Gust Flag,Spd of Max Gust (km/h),Spd of Max Gust Flag
424,-75.72,45.38,OTTAWA CDA RCS,6105978,2021-02-28,2021,2,28,,4.8,...,,,1.1,,56.0,,11.0,,32.0,
425,-75.72,45.38,OTTAWA CDA RCS,6105978,2021-03-01,2021,3,1,,4.5,...,,,2.4,,49.0,,31.0,,63.0,
426,-75.72,45.38,OTTAWA CDA RCS,6105978,2021-03-02,2021,3,2,,-7.0,...,,,4.4,,45.0,,28.0,,48.0,
427,-75.72,45.38,OTTAWA CDA RCS,6105978,2021-03-03,2021,3,3,,0.0,...,,,0.2,,49.0,,,,,
428,-75.72,45.38,OTTAWA CDA RCS,6105978,2021-03-04,2021,3,4,,-3.8,...,,,0.0,,50.0,,32.0,,45.0,


### Processing

Keeping the columns we want in our data mart and add surrogate keys
The snow and rain columns were removed. I kept total precipitation instead. $ values from the Ottawa intlA were missing and the altenrative CDA station didn;t have snow and rain data. Instead to avoid null values and removing any dates for the weather I have kept total precipitation.

In [153]:
ottawa_climate = climate_ottawa[['Station Name','Date/Time','Mean Temp (°C)',  'Min Temp (°C)','Max Temp (°C)', 
                                 'Total Precip (mm)']]

ottawa_climate_missing = climate_ottawa_missing[['Station Name','Date/Time','Mean Temp (°C)',  'Min Temp (°C)','Max Temp (°C)', 
                                 'Total Precip (mm)']]

The indexes of missing temperature or precipitation values and their corresponding date

In [154]:
series = ottawa_climate.isnull().any(axis=1)
for index, value in series.iteritems():
    if value == True:
        print("index of: ", index, " Value: ", value)

index of:  107  Value:  True
index of:  135  Value:  True
index of:  259  Value:  True
index of:  362  Value:  True
index of:  390  Value:  True
index of:  404  Value:  True
index of:  410  Value:  True


In [155]:
index = [107, 135, 259, 362, 404, 410]
for i in index:
    print(ottawa_climate["Date/Time"].loc[[i]])

107    2020-04-17
Name: Date/Time, dtype: object
135    2020-05-15
Name: Date/Time, dtype: object
259    2020-09-16
Name: Date/Time, dtype: object
362    2020-12-28
Name: Date/Time, dtype: object
404    2021-02-08
Name: Date/Time, dtype: object
410    2021-02-14
Name: Date/Time, dtype: object


In [156]:
for i in index:
    ottawa_climate.loc[[i]] = ottawa_climate_missing.loc[[i]]
    print(ottawa_climate.loc[[i]])

       Station Name   Date/Time  Mean Temp (°C)  Min Temp (°C)  Max Temp (°C)  \
107  OTTAWA CDA RCS  2020-04-17             2.4           -1.2            6.1   

     Total Precip (mm)  
107                0.0  
       Station Name   Date/Time  Mean Temp (°C)  Min Temp (°C)  Max Temp (°C)  \
135  OTTAWA CDA RCS  2020-05-15            11.1            9.7           12.5   

     Total Precip (mm)  
135               18.3  
       Station Name   Date/Time  Mean Temp (°C)  Min Temp (°C)  Max Temp (°C)  \
259  OTTAWA CDA RCS  2020-09-16            15.4            8.6           22.2   

     Total Precip (mm)  
259                0.0  
       Station Name   Date/Time  Mean Temp (°C)  Min Temp (°C)  Max Temp (°C)  \
362  OTTAWA CDA RCS  2020-12-28            -1.2           -6.8            4.3   

     Total Precip (mm)  
362                1.9  
       Station Name   Date/Time  Mean Temp (°C)  Min Temp (°C)  Max Temp (°C)  \
404  OTTAWA CDA RCS  2021-02-08           -13.5          -20.3     

### Toronto Weather

We see that Toronto isn't missing any values from the columns we have kept

In [157]:
climate_toronto = pd.read_csv(r"Toronto_IntlA_Total.csv")
climate_toronto_missing = pd.read_csv(r"Toronto_City_Total.csv")
toronto_climate = climate_toronto[['Station Name','Date/Time','Mean Temp (°C)',  'Min Temp (°C)','Max Temp (°C)', 
                                 'Total Precip (mm)']]

toronto_climate_missing = climate_toronto_missing[['Station Name','Date/Time','Mean Temp (°C)',  'Min Temp (°C)','Max Temp (°C)', 
                                 'Total Precip (mm)']]
series2 = toronto_climate.isnull().any(axis=1)
for index, value in series2.iteritems():
    if value == True:
        print("index of: ", index, " Value: ", value)

index of:  384  Value:  True
index of:  413  Value:  True
index of:  414  Value:  True


Process the same way as the ottawa data now

In [158]:
index2 = [384, 413, 414]
for i in index2:
    print(toronto_climate["Date/Time"].loc[[i]])

384    2021-01-19
Name: Date/Time, dtype: object
413    2021-02-17
Name: Date/Time, dtype: object
414    2021-02-18
Name: Date/Time, dtype: object


In [159]:
for i in index2:
    toronto_climate.loc[[i]] = toronto_climate_missing.loc[[i]]
    print(toronto_climate.loc[[i]])

     Station Name   Date/Time  Mean Temp (°C)  Min Temp (°C)  Max Temp (°C)  \
384  TORONTO CITY  2021-01-19             0.0           -2.2            2.2   

     Total Precip (mm)  
384                0.0  
     Station Name   Date/Time  Mean Temp (°C)  Min Temp (°C)  Max Temp (°C)  \
413  TORONTO CITY  2021-02-17            -7.6          -12.7           -2.5   

     Total Precip (mm)  
413                0.0  
     Station Name   Date/Time  Mean Temp (°C)  Min Temp (°C)  Max Temp (°C)  \
414  TORONTO CITY  2021-02-18            -4.4           -6.2           -2.6   

     Total Precip (mm)  
414                3.7  


Join them into a single data frame and write to csv

In [160]:
ottawa_climate = ottawa_climate[300:423]
toronto_climate = toronto_climate[300:423]
weather_dimension = ottawa_climate.append(toronto_climate, ignore_index=True, sort=False)
weather_dimension.head()
#toronto_climate.insert(0, "surrogate_key",  np.arange(len(toronto_climate)))
#toronto_climate["Date/Time"] = pd.to_datetime(toronto_climate["Date/Time"] ).dt.strftime('%Y-%m-%d')

Unnamed: 0,Station Name,Date/Time,Mean Temp (°C),Min Temp (°C),Max Temp (°C),Total Precip (mm)
0,OTTAWA INTL A,2020-10-27,0.2,-2.3,2.7,0.0
1,OTTAWA INTL A,2020-10-28,4.2,-1.1,9.4,0.4
2,OTTAWA INTL A,2020-10-29,2.4,-1.3,6.0,0.0
3,OTTAWA INTL A,2020-10-30,-0.5,-4.9,3.9,0.0
4,OTTAWA INTL A,2020-10-31,-1.0,-8.4,6.5,0.0


In [161]:
weather_dimension.tail()

Unnamed: 0,Station Name,Date/Time,Mean Temp (°C),Min Temp (°C),Max Temp (°C),Total Precip (mm)
241,TORONTO INTL A,2021-02-22,0.6,-1.7,2.8,4.8
242,TORONTO INTL A,2021-02-23,1.8,-0.7,4.3,0.2
243,TORONTO INTL A,2021-02-24,3.9,-0.6,8.4,0.0
244,TORONTO INTL A,2021-02-25,-1.4,-3.7,0.9,0.0
245,TORONTO INTL A,2021-02-26,-1.6,-6.9,3.8,0.0


In [162]:
weather_dimension.insert(0, "surrogate_key",  np.arange(len(weather_dimension)))
weather_dimension["Date/Time"] = pd.to_datetime(weather_dimension["Date/Time"] ).dt.strftime('%Y-%m-%d')
weather_dimension.head()

Unnamed: 0,surrogate_key,Station Name,Date/Time,Mean Temp (°C),Min Temp (°C),Max Temp (°C),Total Precip (mm)
0,0,OTTAWA INTL A,2020-10-27,0.2,-2.3,2.7,0.0
1,1,OTTAWA INTL A,2020-10-28,4.2,-1.1,9.4,0.4
2,2,OTTAWA INTL A,2020-10-29,2.4,-1.3,6.0,0.0
3,3,OTTAWA INTL A,2020-10-30,-0.5,-4.9,3.9,0.0
4,4,OTTAWA INTL A,2020-10-31,-1.0,-8.4,6.5,0.0


In [163]:
weather_dimension.tail()

Unnamed: 0,surrogate_key,Station Name,Date/Time,Mean Temp (°C),Min Temp (°C),Max Temp (°C),Total Precip (mm)
241,241,TORONTO INTL A,2021-02-22,0.6,-1.7,2.8,4.8
242,242,TORONTO INTL A,2021-02-23,1.8,-0.7,4.3,0.2
243,243,TORONTO INTL A,2021-02-24,3.9,-0.6,8.4,0.0
244,244,TORONTO INTL A,2021-02-25,-1.4,-3.7,0.9,0.0
245,245,TORONTO INTL A,2021-02-26,-1.6,-6.9,3.8,0.0


In [164]:
weather_dimension.to_csv("weather_dimension.csv",index=False)