In [1]:
# Library to suppress warnings or deprecation notes 
import warnings
warnings.filterwarnings('ignore')

import os
import math

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm

In [2]:
def clean_weather_column_names(df):
    '''Trim leading and trailing whitespace from every column label of a weather dataframe

    The dataframe is modified in place and the very same object is returned,
    so the result can be used directly in an assignment.
        :df: DataFrame whose column labels should be trimmed
    '''
    def _trimmed(label):
        # Column labels are expected to be strings (as read from the csv header)
        return label.strip()

    df.rename(columns=_trimmed, inplace=True)
    return df

def create_single_weather_file(directory, prefix, suffix = '*.txt', lines_to_skip=19):
    '''Read all the weather files in a directory and create a concatenated dataframe

    Files are matched with the glob pattern <directory>/<prefix><suffix> and are
    read in sorted filename order, so the row order of the result is the same on
    every run regardless of the order in which the file system lists the files.
    The index is not reset: each file's rows keep the index they were read with.
        :directory: Directory with weather file
        :prefix: The prefix of the weather file to look for in the directory
        :suffix: The suffix of the weather file to look for in the directory default is *.txt
        :lines_to_skip: Number of lines to skip in the files before csv header default is 19
        :returns: Concatenated DataFrame with whitespace-stripped column names,
                  or None (after printing a message) when no file matches
    '''
    directory_mask = os.path.join(directory, prefix + suffix)

    # glob.glob gives no ordering guarantee, so sort to keep the output reproducible
    filenames = sorted(glob.glob(directory_mask))

    if not filenames:
        print('No files found in {} for prefix {}'.format(directory, prefix))
        return None

    df_list = [pd.read_csv(filename, skiprows=lines_to_skip) for filename in filenames]
    concat_df = pd.concat(df_list, axis=0)
    return clean_weather_column_names(concat_df)

def _load_weather_sources(directory, lines_to_skip=23):
    '''Read the sources.txt file of a weather directory and tidy its columns
        :directory: Directory that contains the sources.txt file
        :lines_to_skip: Number of lines to skip in sources.txt before csv header default is 23
        :returns: DataFrame with stripped column names, BEGIN/END parsed as datetimes
                  and SOUNAME/PARNAME/PARID stripped of surrounding white space
    '''
    sources_df = pd.read_csv(os.path.join(directory, 'sources.txt'), skiprows=lines_to_skip)

    # Strip extra white space from column names
    sources_df = sources_df.rename(columns=lambda name: name.strip())

    # Format the date columns
    for date_column in ('BEGIN', 'END'):
        sources_df[date_column] = pd.to_datetime(sources_df[date_column].astype(str))

    # Strip leading / trailing white spaces from the text columns
    for text_column in ('SOUNAME', 'PARNAME', 'PARID'):
        sources_df[text_column] = sources_df[text_column].str.strip()

    return sources_df

def process_weather_files(root_directory, output_directory, min_date='2016-12-31', keep_valid_only=True):
    '''Process all the weather file directories

    For every known weather parameter directory the individual files are
    concatenated, filtered, merged with the directory's sources.txt data on
    SOUID and written to <output_directory>/Combined_<prefix>_<directory>.csv.
    Directories in which no weather file matches are skipped.
        :root_directory: Root directory containing the weather file subdirectories
        :output_directory: Directory to output merged files (created if it does not exist)
        :min_date: Only rows with a DATE strictly after this date are kept default is 12/31/2016
        :keep_valid_only: When True keep only rows whose Q_<prefix> quality column equals 0 default is True
    '''

    # Mapping between leaf names and file name prefix in child directory
    directory_dict = {'ECA_cloud_cover': 'CC','ECA_global_radiation': 'QQ','ECA_humidity': 'HU',
                'ECA_mean_temperature': 'TG','ECA_precipitation': 'RR','ECA_sea_level_pressure': 'PP',
                'ECA_snow depth': 'SD','ECA_sunshine': 'SS','ECA_wind_speed': 'FG'}

    # Make sure the destination exists up front so to_csv cannot fail on a missing
    # directory after the work is done (an empty string means the current directory)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for key, value in tqdm(directory_dict.items()):
        current_directory = os.path.join(root_directory, key)
        prefix = value

        # Create dataframe for the sources.txt file
        sources_df = _load_weather_sources(current_directory)

        # Concatenate the weather files into a single dataframe
        combined_df = create_single_weather_file(current_directory, prefix)
        if combined_df is None:
            # No matching files: the helper has already printed a message, so move on
            # to the next directory instead of failing on the missing dataframe
            continue

        # Convert DATE column to datetime so we can keep only rows after min_date
        combined_df['DATE'] = pd.to_datetime(combined_df['DATE'].astype(str))
        combined_df = combined_df[combined_df['DATE'] > min_date]

        # Ignore bad readings
        if keep_valid_only:
            combined_df = combined_df[combined_df['Q_' + value] == 0]

        # Merge with source data to get lat/long
        combined_df = pd.merge(combined_df, sources_df, how='inner', on='SOUID')

        # I found duplicated data in source weather files. Ensure it is removed.
        combined_df = combined_df.drop_duplicates()

        # Sanity check that no duplicated rows remain after the removal
        print('Rows of duplicated data in the dataset {}'.format(combined_df.duplicated().sum()))

        # Output a single file
        output_file = os.path.join(output_directory, 'Combined_{}_{}.csv'.format(prefix, key))
        combined_df.to_csv(output_file, index=False)

In [3]:
# Root directory of the weather files from zip.
# Set the WEATHER_ROOT_DIR environment variable to point at your own copy, e.g.
#   '/home/rukshar/Documents/Omdena/Poland/Air Quality/Data/daily_weather_data/daily_weather_data_1979-2021'
# When the variable is not set the path below is used.
root_directory = os.environ.get(
    'WEATHER_ROOT_DIR',
    'C:/Users/thayes/omdena/Air Pollution in Poland/daily_weather_data_1979-2021')

# Output directory for combined file.
# Set the WEATHER_OUTPUT_DIR environment variable to override it, e.g.
#   '/home/rukshar/Documents/Omdena/Poland/Air Quality/Data/processed_weather_data'
output_directory = os.environ.get('WEATHER_OUTPUT_DIR', 'c:/temp/processed_weather_data')

process_weather_files(root_directory, output_directory)

  0%|          | 0/9 [00:00<?, ?it/s]

Rows of duplicated data in the dataset 0


 11%|█         | 1/9 [00:05<00:40,  5.06s/it]

Rows of duplicated data in the dataset 0


 22%|██▏       | 2/9 [00:07<00:22,  3.24s/it]

Rows of duplicated data in the dataset 0


 33%|███▎      | 3/9 [00:10<00:19,  3.21s/it]

Rows of duplicated data in the dataset 0


 44%|████▍     | 4/9 [00:15<00:19,  3.88s/it]

Rows of duplicated data in the dataset 0


 56%|█████▌    | 5/9 [00:20<00:17,  4.30s/it]

Rows of duplicated data in the dataset 0


 67%|██████▋   | 6/9 [00:22<00:10,  3.60s/it]

Rows of duplicated data in the dataset 0


 78%|███████▊  | 7/9 [00:27<00:07,  3.98s/it]

Rows of duplicated data in the dataset 0


 89%|████████▉ | 8/9 [00:29<00:03,  3.38s/it]

Rows of duplicated data in the dataset 0


100%|██████████| 9/9 [00:34<00:00,  3.88s/it]
