In [2]:
import pandas as pd
import numpy as np
import os, glob
from tqdm import tqdm

In [3]:
# Mapping between leaf names and file name prefix in child directory
directory_dict = {'Combined_CC_ECA_cloud_cover': 'CC','Combined_QQ_ECA_global_radiation': 'QQ','Combined_HU_ECA_humidity': 'HU',
            'Combined_TG_ECA_mean_temperature': 'TG','Combined_RR_ECA_precipitation': 'RR','Combined_PP_ECA_sea_level_pressure': 'PP',
            'Combined_SD_ECA_snow depth': 'SD','Combined_SS_ECA_sunshine': 'SS','Combined_FG_ECA_wind_speed': 'FG'}
input_dir = 'merged_weather_data_CSV'
files = glob.glob( os.path.join(input_dir, '*.csv'))
files

['merged_weather_data_CSV/Combined_SD_ECA_snow depth.csv',
 'merged_weather_data_CSV/Combined_SS_ECA_sunshine.csv',
 'merged_weather_data_CSV/Combined_RR_ECA_precipitation.csv',
 'merged_weather_data_CSV/Combined_CC_ECA_cloud_cover.csv',
 'merged_weather_data_CSV/Combined_HU_ECA_humidity.csv',
 'merged_weather_data_CSV/Combined_FG_ECA_wind_speed.csv',
 'merged_weather_data_CSV/Combined_QQ_ECA_global_radiation.csv',
 'merged_weather_data_CSV/Combined_PP_ECA_sea_level_pressure.csv',
 'merged_weather_data_CSV/Combined_TG_ECA_mean_temperature.csv']

In [4]:
output_dir = './mean_weather_data/'
df_list = []
for f in files:
    #get the file name only: we first split on '/' to get Combined_SD_ECA_snow depth.csv
    #and then leave out the last 4 chars (.csv) 
    fname = f.split('/')[1][:-4] 
    
    print(fname)
    component = directory_dict[fname] #which weather element we're working with
    quality_column = 'Q_'+component

    df = pd.read_csv(f)
    df = df[df[quality_column] == 0][['DATE', 'SOUNAME', component]] #choosing only dates, source name of station, and element values and only the valid rows (where Q_CC == 0)

    #create a pivot table with dates as row indices and source names as columns 
    pivot = pd.pivot_table(df, values=component, 
                                    index='DATE', 
                                    columns='SOUNAME', 
                                    aggfunc=np.nanmean)

    #change the column names to add the component to the source name columns
    pivot.rename(columns = lambda x: x+'_'+component, inplace = True)

    #create a mean valued col for each row
    mean_df = pivot.mean(axis=1,skipna=True)
    mean_df = pd.DataFrame(mean_df, columns = ['mean_'+component]) 

    #append the mean column to the pivot table
    concat_df = pd.concat([pivot, mean_df], axis=1)

    df_list.append(concat_df)
    #concat_df.to_csv(os.path.join(output_dir, 'mean_'+fname+'.csv'))


    #break


Combined_SD_ECA_snow depth
Combined_SS_ECA_sunshine
Combined_RR_ECA_precipitation
Combined_CC_ECA_cloud_cover
Combined_HU_ECA_humidity
Combined_FG_ECA_wind_speed
Combined_QQ_ECA_global_radiation
Combined_PP_ECA_sea_level_pressure
Combined_TG_ECA_mean_temperature


In [9]:
all_concat_df = pd.concat(df_list, axis=1) #concatenate all the weather component dataframes into a single one
all_concat_df.to_csv('./mean_weather_data/all_mean_weather_data_combined.csv')
all_concat_df.shape

(1826, 726)

In [8]:
all_concat_df.columns

Index(['BIALOWIEZA_SD', 'BIALYSTOK_SD', 'BIEBRZA-PIENCZYKOWEK_SD',
       'BIELSKO-BIALA_SD', 'BORUSOWA_SD', 'CHOJNICE_SD', 'CHORZELOW_SD',
       'CIESZANOW_SD', 'DRONIOWICE_SD', 'DYNOW_SD',
       ...
       'WARSZAWA-FILTRY_TG', 'WARSZAWA-OBSERWATORIUM II_TG',
       'WARSZAWA-OKECIE_TG', 'WIELICHOWO_TG', 'WIELUN_TG', 'WLODAWA_TG',
       'WROCLAW_TG', 'ZAMOSC_TG', 'ZIELONA GORA_TG', 'mean_TG'],
      dtype='object', length=726)