In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes = True)
import calendar
from datetime import datetime
import math

In [3]:
# Load the daily climate observations into the notebook-wide `weather` frame.
# low_memory=False has pandas parse the file in one pass, which avoids
# mixed-type inference on sparsely populated columns.
weather = pd.read_csv(
    "climate-daily.csv",
    low_memory=False,
)

In [4]:
# Preview the first five rows to confirm the CSV loaded with the expected columns.
weather.head()

Unnamed: 0,x,y,TOTAL_PRECIPITATION,SNOW_ON_GROUND,SPEED_MAX_GUST_FLAG,CLIMATE_IDENTIFIER,DIRECTION_MAX_GUST_FLAG,LOCAL_MONTH,TOTAL_SNOW,PROVINCE_CODE,...,MAX_TEMPERATURE_FLAG,STATION_NAME,MIN_TEMPERATURE,COOLING_DEGREE_DAYS,SPEED_MAX_GUST,TOTAL_SNOW_FLAG,SNOW_ON_GROUND_FLAG,TOTAL_PRECIPITATION_FLAG,HEATING_DEGREE_DAYS,MAX_REL_HUMIDITY
0,-79.4,43.666667,,,,6158350,,3,,ON,...,,TORONTO,0.0,0.0,,,,,13.8,
1,-79.4,43.666667,,,,6158350,,3,,ON,...,,TORONTO,1.1,0.0,,,,,13.5,
2,-79.4,43.666667,,,,6158350,,3,,ON,...,,TORONTO,2.2,0.0,,,,,11.3,
3,-79.4,43.666667,,,,6158350,,3,,ON,...,,TORONTO,-3.9,0.0,,,,,12.4,
4,-79.4,43.666667,,,,6158350,,3,,ON,...,,TORONTO,-1.1,0.0,,,,,15.2,


In [5]:
# List every column's dtype and non-null count; several columns turn out to be
# almost entirely empty.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62116 entries, 0 to 62115
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   x                         62116 non-null  float64
 1   y                         62116 non-null  float64
 2   TOTAL_PRECIPITATION       60817 non-null  float64
 3   SNOW_ON_GROUND            19824 non-null  float64
 4   SPEED_MAX_GUST_FLAG       0 non-null      float64
 5   CLIMATE_IDENTIFIER        62116 non-null  int64  
 6   DIRECTION_MAX_GUST_FLAG   0 non-null      float64
 7   LOCAL_MONTH               62116 non-null  int64  
 8   TOTAL_SNOW                60867 non-null  float64
 9   PROVINCE_CODE             62116 non-null  object 
 10  MIN_TEMPERATURE_FLAG      9 non-null      object 
 11  HEATING_DEGREE_DAYS_FLAG  5 non-null      object 
 12  MIN_REL_HUMIDITY          1 non-null      float64
 13  LOCAL_DAY                 62116 non-null  int64  
 14  MEAN_T

In [12]:
def day_mean(day, month, column, rounded = True):
    ''' finds the mean value of a column on a specific calendar day
        ex. average high temperature on March 7

        arguments:
            day: the numerical day of the month to use, int
            month: the numerical month to use, int
            column: the numeric column of the weather DataFrame to average
                (ex. 'MEAN_TEMPERATURE'), string
            rounded: when True (the default) the mean is rounded to two decimal
                places; when False the unrounded mean is returned, boolean

        return:
            the mean of that column over every row of the global `weather`
            DataFrame whose LOCAL_MONTH and LOCAL_DAY match; NaN when no row
            matches (ex. February 30) or every matching value is missing
        '''

    # Keep only the rows recorded on the requested month/day.
    df_oneday = weather.loc[(weather["LOCAL_MONTH"] == month) & (weather["LOCAL_DAY"] == day)]
    mean_value = df_oneday[column].mean()

    if rounded:
        return round(mean_value, 2)
    # Return the raw mean when rounding is disabled instead of implicitly
    # returning None.
    return mean_value

In [7]:
# Build a table of per-calendar-day averages: one row per day of the year,
# one column per averaged measurement.
mean_for_year = {}
months = list(calendar.month_name[1:])
for month in range(1, 13):
    month_label = months[month - 1]
    for day in range(1, 32):
        temps_day = day_mean(day, month, "MEAN_TEMPERATURE")
        rain_day = day_mean(day, month, "TOTAL_PRECIPITATION")
        snow_day = day_mean(day, month, "SNOW_ON_GROUND")
        # Every month is scanned up to day 31; combinations with no mean
        # temperature (e.g. February 30 has no rows) come back as NaN and
        # are left out of the table.
        if math.isnan(temps_day):
            continue
        mean_for_year[f"{month_label} {day}"] = {
            "MEAN_TEMPERATURE": temps_day,
            "TOTAL_PRECIPITATION": rain_day,
            "SNOW_ON_GROUND": snow_day,
        }
# The dict is keyed by day label, so transpose to get days as rows.
pandas_dict = pd.DataFrame(mean_for_year).transpose()
pandas_dict

Unnamed: 0,MEAN_TEMPERATURE,TOTAL_PRECIPITATION,SNOW_ON_GROUND
January 1,-4.0,2.0,5.0
January 2,-4.0,3.0,5.0
January 3,-4.0,2.0,6.0
January 4,-4.0,2.0,6.0
January 5,-4.0,2.0,6.0
...,...,...,...
December 27,-3.0,2.0,5.0
December 28,-3.0,2.0,5.0
December 29,-4.0,2.0,5.0
December 30,-4.0,2.0,5.0


In [8]:
# Temporarily lift the row-display limit so every day of the year is printed
# instead of pandas' truncated preview.
with pd.option_context("display.max_rows", None):
    print(pandas_dict)

              MEAN_TEMPERATURE  TOTAL_PRECIPITATION  SNOW_ON_GROUND
January 1                 -4.0                  2.0             5.0
January 2                 -4.0                  3.0             5.0
January 3                 -4.0                  2.0             6.0
January 4                 -4.0                  2.0             6.0
January 5                 -4.0                  2.0             6.0
January 6                 -4.0                  2.0             6.0
January 7                 -5.0                  2.0             6.0
January 8                 -5.0                  2.0             7.0
January 9                 -5.0                  2.0             6.0
January 10                -5.0                  2.0             7.0
January 11                -5.0                  2.0             6.0
January 12                -5.0                  2.0             7.0
January 13                -5.0                  2.0             7.0
January 14                -5.0                  

In [14]:
# Rank the days from warmest to coldest; days with equal mean temperature are
# ordered by precipitation, wettest first.
sorted_dict = pandas_dict.sort_values(
    by=["MEAN_TEMPERATURE", "TOTAL_PRECIPITATION"],
    ascending=False,
)
sorted_dict

Unnamed: 0,MEAN_TEMPERATURE,TOTAL_PRECIPITATION,SNOW_ON_GROUND
July 17,22.0,3.0,0.0
July 18,22.0,3.0,0.0
July 15,21.0,4.0,0.0
August 4,21.0,4.0,0.0
July 7,21.0,3.0,0.0
...,...,...,...
February 3,-6.0,2.0,9.0
February 4,-6.0,2.0,9.0
February 5,-6.0,2.0,8.0
February 10,-6.0,2.0,8.0


In [10]:
# First row of the temperature-sorted table, i.e. the day with the highest
# average temperature.
print(sorted_dict.iloc[0])

MEAN_TEMPERATURE       22.0
TOTAL_PRECIPITATION     3.0
SNOW_ON_GROUND          0.0
Name: July 17, dtype: float64


In [28]:
def weather_probability(day, month, column):
    ''' finds the probability that a certain weather condition will occur on a day (snow or rain)

    arguments:
        day: the numerical day to use, int
        month: the numerical month to use, int
        column: the numeric column of the weather DataFrame to check
            (ex. 'TOTAL_SNOW'), string

    return:
        the percentage (0-100, rounded to two decimal places) of non-missing
        observations on that day whose value is non-zero; NaN when the day has
        no non-missing observations at all (ex. February 30)
    '''

    # Keep only the rows recorded on the requested month/day.
    df_oneday = weather.loc[(weather["LOCAL_MONTH"] == month) & (weather["LOCAL_DAY"] == day)]

    # Missing readings say nothing about whether the condition occurred, so
    # they are excluded from both the numerator and the denominator.
    observed = df_oneday[column].dropna()
    total_days = len(observed)
    if total_days == 0:
        # Nothing to base a probability on; report NaN rather than dividing
        # by zero.
        return math.nan

    # Any non-zero amount counts as the condition having occurred.
    times_weather_occurred = int((observed != 0).sum())

    return round((times_weather_occurred / total_days) * 100, 2)

In [27]:
# Example: percentage of recorded November 28ths with a non-zero TOTAL_SNOW value.
print(weather_probability(28, 11, "TOTAL_SNOW"))

27.68


In [23]:
# Inspect the raw TOTAL_SNOW readings for one calendar day.
# The date is written out explicitly instead of reusing the `month`/`day`
# variables left behind by the yearly-averages loop, so this cell gives the
# same result on a fresh kernel and regardless of execution order.
# NOTE(review): 12/31 is the value those loop variables hold once the loop
# finishes; change it if a different day was meant to be inspected.
df_oneday = weather.loc[(weather["LOCAL_MONTH"] == 12) & (weather["LOCAL_DAY"] == 31)]
with pd.option_context('display.max_rows', None):
    print(df_oneday['TOTAL_SNOW'])

305       NaN
670       NaN
1035      NaN
1400      NaN
1766      NaN
2131      NaN
2496      NaN
2861      0.0
3227      0.0
3592      0.0
3957      0.0
4322      2.0
4688     20.3
5053      8.6
5418      0.0
5783      0.0
6149      0.0
6514      1.3
6879      0.0
7244      0.0
7610      0.3
7975      0.0
8340      0.0
8705      3.8
9071      0.0
9436      0.0
9801      0.0
10166     7.6
10532     0.0
10897     0.0
11262     1.3
11627     0.0
11993     1.0
12358     0.0
12723     0.3
13088     0.0
13454     1.3
13819     0.0
14184     0.0
14549     5.1
14915     0.0
15280     4.6
15645     0.3
16010     0.3
16376     0.0
16741     0.0
17106    17.8
17471    10.2
17837     0.0
18202     0.0
18567     0.5
18932     0.3
19298     0.0
19663     0.5
20028     0.0
20393     0.0
20759     0.0
21124     0.0
21489     0.0
21854     0.5
22219     1.3
22584     0.5
22949     0.0
23314     0.5
23680     0.0
24045     0.3
24410     0.0
24775     0.0
25141     0.5
25506     0.5
25871     1.5
26236 