In [2]:
import pandas as pd
import numpy as np
import os as os

In [112]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)

Helper function: *data_raw*  
Read data from each txt file in the directory and concat into a long data
- input: path of the directory
- output: raw data

In [51]:
def data_raw(dir_path):

    df_list = []

    # iterate through all file
    for file in os.listdir():
        # Check whether file is in text format or not
        if file.endswith(".txt"):

            file_path = os.path.join(dir_path, file)
            df = pd.read_csv(file_path, encoding='big5')

            # clean all metadata starting with *
            idx = df.iloc[:, 0].str.contains(r'^\*+')
            
            df = df[~idx]

            # remove the # from header
            df.iloc[0, 0] = df.iloc[0, 0].lstrip('# ')

            # sep columns by space
            df = df.iloc[:, 0].str.split(r'\s+', expand=True)

            # set header with 1st row
            new_header = df.iloc[0] #grab the first row for the header
            df = df[1:] #take the data less the header row
            df.columns = new_header #set the header row as the df header

            # append cleaned dataframe to a list
            df_list.append(df)

    # concat all df into long data
    long_data = pd.concat(df_list, axis=0, join='outer')

    long_data = long_data.reset_index(drop=True)

    return long_data

Helper function: *data_cleaned*   
1. select the required variables
1. clean missing values  
1. change time format
- input: `data_raw`
- output: a cleaned long data

In [108]:
def data_cleaned(df):
    # selecte required features
    df_sub = df[['yyyymmdd', 'stno', 'TX01', 'PP01', 'WD01', 'WD02', 'PS01']]

    # convert dtype=object (string) to numeric
    df_sub['yyyymmdd'] = df_sub['yyyymmdd'].astype(np.int)
    df_sub['stno'] = df_sub['stno'].astype(np.int)
    df_sub['TX01'] = df_sub['TX01'].astype(np.float)
    df_sub['PP01'] = df_sub['PP01'].astype(np.float)
    df_sub['WD01'] = df_sub['WD01'].astype(np.float)
    df_sub['WD02'] = df_sub['WD02'].astype(np.float)
    df_sub['PS01'] = df_sub['PS01'].astype(np.float)

    # replace missing value with np.nan
    df_cleaned = (
        df_sub
        .replace(-9998.0, np.nan)
        .replace(-9999.0, np.nan)
    )

    # Convert Strings to Datetime
    df_cleaned['yyyymmdd'] = pd.to_datetime(df_cleaned['yyyymmdd'], format='%Y%m%d')

    return df_cleaned

# Put them together  
function: long_data_clean
- input: dir path
- output: df_cleaned

In [114]:
def long_data_clean(dp):
    df_raw = data_raw(dp)
    df_cleaned = data_cleaned(df_raw)
    return df_cleaned


In [115]:
dp = "C:\\Users\\hsnutardis\\OneDrive - 國立台灣大學\\RCEC_summer_2022\\RCEC_summer_2022"
long_data_clean(dp)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df_sub['yyyymmdd'] = df_sub['yyyymmdd'].astype(np.int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['yyyymmdd'] = df_sub['yyyymmdd'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df_sub['stno'] = df_sub['stno'].astype(np.int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['stno'] = df_sub['stno'].astype(np.int)
Deprecated in NumP

Unnamed: 0,yyyymmdd,stno,TX01,PP01,WD01,WD02,PS01
0,1911-01-01,466920,15.2,11.4,2.7,,1021.4
1,1911-01-02,466920,12.6,2.1,4.5,,1024.2
2,1911-01-03,466920,16.9,0.0,4.1,,1021.1
3,1911-01-04,466920,18.0,0.3,1.6,,1019.2
4,1911-01-05,466920,15.2,5.3,3.2,,1022.0
...,...,...,...,...,...,...,...
1204,2021-12-27,467990,7.3,0.0,5.6,20.0,1021.6
1205,2021-12-28,467990,9.4,,4.1,360.0,1016.7
1206,2021-12-29,467990,12.0,,2.3,40.0,1014.5
1207,2021-12-30,467990,11.3,0.0,4.4,60.0,1018.0


# EDA

function:
Mean groupby date
