For this project we need to:
1. Access the dataset covering 01/01/1980 – 01/01/2020.
2. Find the average of each weather value for each calendar day (day and month) across all years.

In [71]:
import datetime
import pandas as pd

In [72]:
# Read the weather export and discard its leading column
# (presumably an index column saved along with the CSV — verify against the file).
url = 'weather_output.csv'
df = pd.read_csv(url)
df = df.drop(columns=df.columns[0])

In [73]:
# Convert the 'date' column to datetimes, then expose its calendar parts as
# separate columns so rows can later be grouped by (day, month) across years.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['day'] = parsed_dates.dt.day
df['month'] = parsed_dates.dt.month
df['year'] = parsed_dates.dt.year

In [74]:
def get_final_df(data=None):
    """Build the per-calendar-day statistics table for the main weather variables.

    For each variable listed in ``variables`` below, the mean and standard
    deviation are computed over all rows sharing the same (day, month) pair,
    and every row's value is expressed as a z-score against those statistics.

    Args:
        data: DataFrame holding 'year', 'month', 'day' and the source weather
            columns named in ``variables``. When omitted, the notebook-level
            ``df`` is used, so existing ``get_final_df()`` calls keep working.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, in the same row order as
        the input, whose columns are 'year', 'month', 'day' followed, for each
        variable, by the value column and its ``_mean``, ``_std`` and
        ``_z_score`` columns.
    """
    source = df if data is None else data

    # Source column -> (output value column, prefix of the statistic columns).
    # Naming the outputs explicitly keeps each label tied to its data instead
    # of relying on a positional rename of the assembled frame.
    variables = {
        # temperature
        'temperature_2m_mean': ('temperature', 'temperature'),
        # precipitation
        'precipitation_sum': ('rain_sum', 'rain'),
        'precipitation_hours': ('rain_hours', 'rain_hours'),
        # wind
        'wind_speed_10m_max': ('wind_speed', 'wind_speed'),
        # sun
        'shortwave_radiation_sum': ('exposure_sum', 'exposure'),
    }

    # Work on a re-indexed copy so the result always has a clean 0..n-1 index
    # and the caller's frame is never modified.
    work = source[['year', 'month', 'day', *variables]].reset_index(drop=True)
    by_calendar_day = work.groupby(['day', 'month'])

    final_df = work[['year', 'month', 'day']].copy()
    for col, (value_name, prefix) in variables.items():
        # transform() broadcasts each group's statistic back onto its rows,
        # which replaces the aggregate-then-merge round trip.
        mean = by_calendar_day[col].transform('mean')
        std = by_calendar_day[col].transform('std')
        final_df[value_name] = work[col]
        final_df[f'{prefix}_mean'] = mean
        final_df[f'{prefix}_std'] = std
        # A group with zero (or undefined) spread yields NaN here.
        final_df[f'{prefix}_z_score'] = (work[col] - mean) / std

    return final_df
    


In [75]:
# Build the per-day statistics table once and keep it at notebook level;
# get_date_valid_df() reads this `final_dataframe` global.
final_dataframe = get_final_df()
# `display` is neither defined nor imported in this file — presumably the
# IPython/Jupyter rich-display helper provided by the notebook environment.
display(final_dataframe)

Unnamed: 0,year,month,day,temperature,temperature_mean,temperature_std,temperature_z_score,rain_sum,rain_mean,rain_std,...,rain_hours_std,rain_hours_z_score,wind_speed,wind_speed_mean,wind_speed_std,wind_speed_z_score,exposure_sum,exposure_mean,exposure_std,exposure_z_score
0,1980,1,1,-1.929167,4.995909,3.270764,-2.117265,0.000000,2.941463,4.417520,...,4.992189,-0.864766,16.375053,23.659357,10.334067,-0.704883,2.58,2.080976,1.214617,0.410849
1,1980,1,2,-2.406250,4.867831,3.602246,-2.019319,0.000000,1.835000,3.048375,...,4.486375,-0.718843,16.981165,24.447642,9.661297,-0.772823,3.99,2.549500,1.251848,1.150699
2,1980,1,3,2.079167,4.440748,3.856877,-0.612304,7.199999,2.300000,3.545166,...,4.818501,2.334751,23.863409,25.494043,10.090264,-0.161605,0.69,2.273250,1.229345,-1.287881
3,1980,1,4,7.283333,4.638925,3.487914,0.758163,0.000000,2.002500,3.758170,...,4.109947,-0.748185,24.063513,25.071737,9.685254,-0.104099,1.75,2.322250,0.990572,-0.577696
4,1980,1,5,3.785417,5.098613,3.492382,-0.376017,0.000000,2.322500,3.023370,...,4.247473,-0.965280,22.322900,23.668557,8.119429,-0.165733,3.54,2.184750,1.118142,1.212056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14606,2019,12,28,7.802333,4.444394,3.966594,0.846555,0.000000,1.060000,1.957733,...,3.555332,-0.555504,13.708391,21.388834,7.428256,-1.033950,1.77,2.472500,1.088258,-0.645527
14607,2019,12,29,5.929417,4.315175,4.037568,0.399805,0.000000,1.960000,3.709226,...,5.394620,-0.699771,15.937878,21.201974,8.940934,-0.588764,3.47,2.247000,1.173234,1.042418
14608,2019,12,30,5.448166,5.128040,3.760275,0.085134,0.000000,2.322500,3.538904,...,5.272023,-0.810884,16.575644,22.883216,10.390741,-0.607038,3.77,2.031250,1.168782,1.487659
14609,2019,12,31,5.952333,5.185800,3.236155,0.236865,0.000000,0.950000,1.477350,...,3.543774,-0.726626,16.981165,22.506410,9.415917,-0.586798,1.49,2.077000,0.910920,-0.644403


In [None]:
# JUST AN EXAMPLE OF HOW IT WORKS
# Same steps as get_final_df(), spelled out for the temperature column only.

# Mean and standard deviation of the daily mean temperature per (day, month).
mean_temperature_df = (
    df.groupby(['day', 'month'])
    .agg({'temperature_2m_mean': ['mean', 'std']})
    .reset_index()
)
mean_temperature_df.columns = [
    'day',
    'month',
    'temperature_2m_mean_mean',
    'temperature_2m_mean_std',
]

# Attach those statistics to every yearly observation; 'day' and 'month' are
# the only columns the two frames share, so they are the join keys.
year_mean_temperature_df = df[['year', 'month', 'day', 'temperature_2m_mean']].copy()
full_mean_temperature_df = pd.merge(
    left=year_mean_temperature_df,
    right=mean_temperature_df,
    on=['day', 'month'],
    how='inner',
)

# Distance of each observation from its calendar-day mean, expressed both in
# standard deviations and as a percentage of that mean.
deviation = (
    full_mean_temperature_df['temperature_2m_mean']
    - full_mean_temperature_df['temperature_2m_mean_mean']
)
full_mean_temperature_df['mean_z_score'] = (
    deviation / full_mean_temperature_df['temperature_2m_mean_std']
)
full_mean_temperature_df['mean_var_percent'] = (
    deviation / full_mean_temperature_df['temperature_2m_mean_mean']
) * 100
full_mean_temperature_df


Unnamed: 0,year,month,day,temperature_2m_mean,temperature_2m_mean_mean,temperature_2m_mean_std,mean_z_score,mean_var_percent
0,1980,1,1,-1.929167,4.995909,3.270764,-2.117265,-138.614934
1,1980,1,2,-2.406250,4.867831,3.602246,-2.019319,-149.431664
2,1980,1,3,2.079167,4.440748,3.856877,-0.612304,-53.179809
3,1980,1,4,7.283333,4.638925,3.487914,0.758163,57.004760
4,1980,1,5,3.785417,5.098613,3.492382,-0.376017,-25.755947
...,...,...,...,...,...,...,...,...
14606,2019,12,28,7.802333,4.444394,3.966594,0.846555,75.554505
14607,2019,12,29,5.929417,4.315175,4.037568,0.399805,37.408487
14608,2019,12,30,5.448166,5.128040,3.760275,0.085134,6.242674
14609,2019,12,31,5.952333,5.185800,3.236155,0.236865,14.781394


In [76]:
def get_date_valid_df():
    """Export the 2015-2020 slice of ``final_dataframe`` to 'weather_data.xlsx'.

    Reads the notebook-level ``final_dataframe``, keeps the rows whose 'year'
    lies between 2015 and 2020 (both inclusive), renumbers them from zero and
    writes the result to an Excel file. Returns nothing.
    """
    years = final_dataframe['year']
    in_range = (years >= 2015) & (years <= 2020)
    selected = final_dataframe.loc[in_range].reset_index(drop=True)
    selected.to_excel('weather_data.xlsx')

In [77]:
# Recompute the statistics table and keep the result: get_date_valid_df()
# reads the notebook-level `final_dataframe`, so a bare get_final_df() call
# would be computed and then thrown away.
final_dataframe = get_final_df()
# Write the 2015-2020 slice of that table to 'weather_data.xlsx'.
get_date_valid_df()