In [1]:
# Library to suppress warnings or deprecation notes 
import warnings
warnings.filterwarnings('ignore')

import os
import math

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm

## Prerequisite: rukshar_weather_data_preprocessing.ipynb must have been run first; it creates the Combined_<weather element>*.csv files that this notebook merges

In [2]:
def create_combined_merged_weather_file(root_directory, output_directory):
    '''Merge the per-element ``Combined*.csv`` weather files into a single CSV.

    Every file matching ``Combined*.csv`` in ``root_directory`` is read and
    outer-joined on ``STAID`` and ``DATE``, so a station/date pair that occurs
    in any input file appears in the result. The merged table is written to
    ``<output_directory>/Combined_Merged_Weather_Data.csv`` without the index.

        :root_directory: Root directory containing the combined weather files.
            File names are expected to follow
            ``Combined_<weather_element>_ECA_<weather element name>.csv`` and each
            file must provide the columns STAID, DATE, <weather_element> and
            Q_<weather_element>; any other column (e.g. SOUID) is ignored.
        :output_directory: Directory the merged file is written to; it is
            created when it does not exist yet.
        :raises FileNotFoundError: If no input file matches the mask.
    '''
    output_name = 'Combined_Merged_Weather_Data.csv'
    directory_mask = os.path.join(root_directory, 'Combined*.csv')

    # glob makes no ordering guarantee, so sort to keep the column order of the
    # output reproducible. The merged output itself matches the mask, so skip
    # it in case input and output directory are the same and the cell is re-run.
    filenames = sorted(
        filename for filename in glob.glob(directory_mask)
        if os.path.basename(filename) != output_name
    )
    if not filenames:
        raise FileNotFoundError('No files matching {} were found'.format(directory_mask))

    combined_merged_df = None
    for filename in filenames:
        filename_base = os.path.basename(filename)
        # Expected format is Combined_<weather_element>_ECA_<weather element name>.csv
        keys = filename_base.split('_')
        print('Processing {}'.format(filename_base))
        # All files are read the same way, so the result does not depend on
        # which file happens to come first.
        df = pd.read_csv(filename, usecols=['STAID', 'DATE', keys[1], 'Q_' + keys[1]])
        if combined_merged_df is None:
            combined_merged_df = df
        else:
            combined_merged_df = combined_merged_df.merge(df, how='outer', on=['STAID', 'DATE'])

    os.makedirs(output_directory, exist_ok=True)
    combined_merged_df.to_csv(os.path.join(output_directory, output_name), index=False)

In [3]:
# Folder that receives Combined_Merged_Weather_Data.csv
output_directory = '../../data/4. data_processed_1/merged_weather_data_CSV'
# Folder holding the per-element Combined_*.csv inputs
root_directory = "../../data/4. data_processed_1/processed_weather_data_in_CSV_files"

create_combined_merged_weather_file(
    root_directory=root_directory,
    output_directory=output_directory,
)

Processing Combined_FG_ECA_wind_speed.csv
Processing Combined_HU_ECA_humidity.csv
Processing Combined_PP_ECA_sea_level_pressure.csv
Processing Combined_QQ_ECA_global_radiation.csv
Processing Combined_RR_ECA_precipitation.csv
Processing Combined_SD_ECA_snow depth.csv
Processing Combined_SS_ECA_sunshine.csv
Processing Combined_TG_ECA_mean_temperature.csv


## Adding powiat and voivodeship info

In [4]:
# Station list with the county / state / Voivodeship columns used for enrichment below
stations_csv_path = "../../data/4. data_processed_1/processed_weather_data_in_CSV_files/stations_with_powiat_voivod_GEOJSON.csv"
stations = pd.read_csv(stations_csv_path)
stations.head()

Unnamed: 0,STAID,STANAME,CN,LAT,LON,HGHT,county,state,Voivodeship
0,24860,PSZCZYNA,PL,50.0,18.916667,261,powiat pszczyński,województwo śląskie,Silesian
1,24876,JABLONKA,PL,49.466667,19.7,671,powiat nowotarski,województwo małopolskie,Lesser Poland
2,24880,GUBALOWKA,PL,49.3,19.933333,856,powiat tatrzański,województwo małopolskie,Lesser Poland
3,24881,WITOW,PL,49.333333,19.833333,844,powiat tatrzański,województwo małopolskie,Lesser Poland
4,24885,LAZY,PL,49.966667,20.5,251,powiat bocheński,województwo małopolskie,Lesser Poland


In [5]:
# Lookup: STAID -> (county, state, LAT, LON).
# When a STAID occurs more than once, the first occurrence wins.
staid_map = {}
for station in stations[['STAID', 'county', 'state', 'LAT', 'LON']].itertuples(index=False):
    staid_map.setdefault(
        station.STAID,
        (station.county, station.state, station.LAT, station.LON),
    )

In [6]:
# Load the merged weather table written by create_combined_merged_weather_file
merged_weather_path = "../../data/4. data_processed_1/merged_weather_data_CSV/Combined_Merged_Weather_Data.csv"
wdata = pd.read_csv(merged_weather_path)
wdata.head()

Unnamed: 0,STAID,DATE,CC,Q_CC,FG,Q_FG,HU,Q_HU,PP,Q_PP,QQ,Q_QQ,RR,Q_RR,SD,Q_SD,SS,Q_SS,TG,Q_TG
0,204,2017-01-01,8.0,0.0,0.0,0.0,0.0,1.0,10151.0,0.0,-9999.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
1,204,2017-01-02,7.0,0.0,35.0,0.0,89.0,0.0,10069.0,0.0,-9999.0,9.0,15.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0
2,204,2017-01-03,7.0,0.0,0.0,0.0,0.0,1.0,10090.0,0.0,18.0,0.0,0.0,0.0,1.0,0.0,7.0,0.0,-14.0,0.0
3,204,2017-01-04,8.0,0.0,32.0,0.0,93.0,0.0,9885.0,0.0,20.0,0.0,50.0,0.0,7.0,0.0,0.0,0.0,3.0,0.0
4,204,2017-01-05,6.0,0.0,0.0,0.0,0.0,1.0,10101.0,0.0,20.0,0.0,0.0,0.0,16.0,0.0,9.0,0.0,-108.0,0.0


In [8]:
# Attach station metadata (county, voivodeship, coordinates) to every weather row.
# staid_map maps STAID -> (county, state, LAT, LON). The lookup is vectorised:
# writing cell by cell with iterrows() and .loc is extremely slow on a table
# that holds one row per station and day.
# NOTE(review): 'Voivodeship' is filled from the `state` element of the tuple
# (e.g. "województwo śląskie"), not from the English `Voivodeship` column of
# the stations table — confirm this is the intended source.
station_lookup = pd.DataFrame.from_dict(
    staid_map, orient='index', columns=['county', 'Voivodeship', 'LAT', 'LON']
)

# Keep the original failure mode: a station id without metadata is an error
# (KeyError), not a silently empty row.
unknown_staids = wdata.loc[~wdata['STAID'].isin(station_lookup.index), 'STAID'].unique()
if len(unknown_staids) > 0:
    raise KeyError('STAID values missing from staid_map: {}'.format(list(unknown_staids)))

# Plain column assignment overwrites existing columns, so the cell can be re-run safely.
for column in station_lookup.columns:
    wdata[column] = wdata['STAID'].map(station_lookup[column])

In [9]:
# Write the enriched table back over the merged weather file (index is not stored)
enriched_output_path = "../../data/4. data_processed_1/merged_weather_data_CSV/Combined_Merged_Weather_Data.csv"
wdata.to_csv(enriched_output_path, index=False)