## Prepare Data

### Merge Data

In [62]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm

# Root folder; every entry directly beneath it is searched for sensor files
# whose names match ``tng*.tsv``.
data_dir_path = '../data'
data_dir = Path(data_dir_path)
subdata_list = sorted(data_dir.glob('*'))

# GEE tables live in one CSV per month prefix ("gee_<prefix>.csv"). Cache the
# parsed tables so each CSV is read once, however many sensor files share it.
gee_dir = Path('../gee_data')
gee_cache = {}

final_df_list = []
for subdata_dir in tqdm(subdata_list):
    # Sorted so the row order of the merged result is reproducible; raw glob
    # order depends on the file system.
    data_list = sorted(subdata_dir.glob('tng*.tsv'))

    for data_path in data_list:
        # read sensor data
        df = pd.read_csv(data_path, sep='\t')

        # Keep the part of the timestamp before 'T'. The vectorized accessor
        # leaves missing timestamps as NaN, and dropna() below removes them.
        df['date'] = df['datetime'].str.split('T').str[0]

        # Keep only rows whose flag is 0 and that have a value.
        df = (
            df.loc[df['flag_Gg_pyr'] == 0, ['date', 'Gg_pyr']]
            .dropna()
            .reset_index(drop=True)
        )

        # Case: nothing usable in this file (no row with flag 0 and a value).
        if df.empty:
            continue

        # Average the remaining readings per date.
        df.columns = ['date', 'GHI']
        df = df.groupby('date').mean().reset_index()
        # Station id is the file-name stem up to the first underscore.
        df['station_id'] = data_path.stem.split('_')[0]

        # read relative gee data
        # The GEE file is chosen from the first date only, with its last three
        # characters removed.
        # NOTE(review): assumes dates look like YYYY-MM-DD and that one sensor
        # file never spans more than one month - confirm.
        month = df.iloc[0]['date'][:-3]
        if month not in gee_cache:
            gee_cache[month] = pd.read_csv(gee_dir / f"gee_{month}.csv")
        gee_df = gee_cache[month]

        # merge dataframe (left join keeps every sensor row, matched or not)
        merge_df = df.merge(gee_df, how='left', on=['date', 'station_id'])
        final_df_list.append(merge_df)

100%|██████████| 13/13 [00:22<00:00,  1.75s/it]


In [65]:
# Stack the per-file merged frames into one table with a fresh 0..n-1 index.
final_df = pd.concat(final_df_list, ignore_index=True)

# to_csv does not create missing folders, so make sure the target exists.
output_path = Path('../data/final/final_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)