In [11]:
import pandas as pd
import numpy as np
import os as os
from pathlib import Path

In [12]:
# Raise the display limits so wide / long frames are shown instead of elided:
# up to 200 columns and 100 rows.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100

Helper function: *data_raw*  
Read every `.txt` file in the directory and concatenate the results into one long DataFrame
- input: path of the directory
- output: raw data

In [13]:
def _read_txt_table(file_path):
    """Parse one whitespace-delimited ``.txt`` file into a DataFrame of strings.

    Returns ``None`` when the file is empty or holds nothing but metadata /
    blank lines, so the caller can simply skip it.
    """
    try:
        # Every physical line is loaded as one string cell (dtype=str keeps
        # pandas from inferring numbers). NOTE: read_csv consumes the first
        # line of the file as its own column label, so that line is never
        # part of the rows handled below.
        raw = pd.read_csv(file_path, encoding='big5', encoding_errors='ignore', dtype=str)
    except pd.errors.EmptyDataError:
        return None

    # Strip surrounding whitespace first: an indented line would otherwise
    # split into a leading empty field and shift every value one column right.
    lines = raw.iloc[:, 0].dropna().str.strip()

    # Drop blank lines and metadata lines (those starting with '*').
    # .copy() so the header fix-up below writes to an independent Series.
    lines = lines[lines.ne('') & ~lines.str.startswith('*')].copy()
    if lines.empty:
        return None

    # The first remaining line is the header; remove its leading '# ' marker.
    lines.iloc[0] = lines.iloc[0].lstrip('# ')

    # Separate columns on runs of whitespace.
    table = lines.str.split(r'\s+', expand=True)

    # Promote the first row to column labels. A plain list is used so the
    # header row's index label does not leak in as ``columns.name``.
    header = table.iloc[0].tolist()
    table = table.iloc[1:]
    table.columns = header
    return table


def data_raw(dir_path):
    """Read every ``.txt`` file in ``dir_path`` and stack them into one long frame.

    Parameters
    ----------
    dir_path : str or os.PathLike
        Directory that contains the ``.txt`` files (Big5-encoded text).

    Returns
    -------
    pandas.DataFrame
        All data rows of all files, as strings, with a fresh 0..n-1 index.
        Columns are the union of the files' headers (outer join).

    Raises
    ------
    ValueError
        If the directory yields no data at all (no ``.txt`` files, or only
        empty / metadata-only ones).
    """
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the result reproducible from run to run.
    txt_names = sorted(name for name in os.listdir(dir_path) if name.endswith(".txt"))

    df_list = []
    for name in txt_names:
        df = _read_txt_table(os.path.join(dir_path, name))
        if df is not None:
            df_list.append(df)

    if not df_list:
        # pd.concat([]) would raise a ValueError too, but without saying why.
        raise ValueError(f"no .txt data found in directory: {dir_path}")

    # concat all frames into long data
    long_data = pd.concat(df_list, axis=0, join='outer')
    return long_data.reset_index(drop=True)

Helper function: *data_cleaned*   
1. select the required variables
1. clean missing values  
1. change time format
- input: the raw DataFrame returned by `data_raw`
- output: a cleaned long DataFrame

In [14]:
def data_cleaned(df):
    """Select the required variables, type them, and mark missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw long data; must contain the columns ``yyyymmdd``, ``stno``,
        ``TX01``, ``PP01``, ``WD01``, ``WD02`` and ``PS01`` holding values
        convertible to numbers (e.g. strings). ``df`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with those seven columns: ``yyyymmdd`` as datetime,
        ``stno`` as integer, the remaining columns as float with the
        missing-value codes replaced by NaN.
    """
    # Builtin int/float replace np.int/np.float: those aliases meant exactly
    # the builtins and were removed in NumPy 1.24.
    column_types = {
        'yyyymmdd': int,
        'stno': int,
        'TX01': float,
        'PP01': float,
        'WD01': float,
        'WD02': float,
        'PS01': float,
    }

    # Codes treated as "missing value" in the data.
    missing_codes = [-9997.0, -9998.0, -9999.0]

    df_cleaned = (
        # select required features (column selection returns a new frame)
        df[list(column_types)]
        # convert dtype=object (string) to numeric; astype returns a new
        # frame, so nothing is assigned into a slice of the caller's data
        .astype(column_types)
        # replace missing value codes with np.nan
        .replace(missing_codes, np.nan)
    )

    # Convert yyyymmdd numbers to datetime
    df_cleaned['yyyymmdd'] = pd.to_datetime(df_cleaned['yyyymmdd'], format='%Y%m%d')

    return df_cleaned

# Put them together  
Function: *long_data_clean*
- input: path of the directory
- output: the cleaned long DataFrame (`df_cleaned`)

In [15]:
def long_data_clean(dp):
    """Run the whole pipeline for one directory.

    Reads the directory ``dp`` with ``data_raw`` and passes the result
    through ``data_cleaned``; returns the cleaned frame.
    """
    return data_cleaned(data_raw(dp))

In [19]:
file_name = 'df_cleaned.pkl'

# Build the cleaned frame once and cache it as a pickle. On later runs the
# cache is loaded instead, so `df_cleaned` is bound whichever branch is taken
# (previously it stayed undefined whenever the pickle already existed).
if not Path(file_name).is_file():
    dp = Path("/archive/Observation/CWB_Station/daily")
    df_cleaned = long_data_clean(dp)
    df_cleaned.to_pickle(file_name)
else:
    # Only load a pickle this notebook wrote itself: unpickling an untrusted
    # file can execute arbitrary code.
    df_cleaned = pd.read_pickle(file_name)
