In [5]:
from os import listdir, makedirs
from os.path import isfile, join, exists
import pandas as pd
import numpy as np
from prepare_data import get_station_paths, prepare_old_station_data, prepare_new_station_data

# Root of the legacy (pre-2020-09) station data; handed to get_station_paths() below.
old_basepath = '../basedata/PCD Data/Data before 2020-9'
# Single CSV holding the newer export for multiple stations; read with pd.read_csv() below.
new_basepath = '../basedata/PCD Data/Data after 2020-7/PCD data after 2020-7.csv'
# Folder that receives the "uncollect" summary CSVs written by this notebook.
others_basepath = './prepared_data/others/'


In [6]:
# Restrict the new-format CSV to the station id, the measurement columns and
# the datetime column so that unrelated columns are never loaded.
new_columns = [
    'stationID',
    'PM25', 'PM10', 'NO2', 'SO2', 'CO', 'O3',
    'datetime_aq',
]
all_new_df = pd.read_csv(new_basepath, usecols=new_columns)

# Legacy data: collect the per-station paths (looked up by station id in later cells).
old_station_paths = get_station_paths(old_basepath)

# Get all uncollected column names

## new data

In [4]:
# For every station in the new-format data, record each column whose values sum
# to zero for that station.
# NOTE(review): `sum() == 0` holds for an all-NaN column (pandas skips NaN) but
# also for a column of genuine zeros -- confirm both should count as "uncollected".
new_uncollect = set()   # distinct column names flagged for at least one station
stations = []           # parallel lists: columns[k] was flagged for stations[k]
columns = []
# A single groupby pass replaces one full-frame boolean mask per station.
# sort=False keeps stations in order of first appearance, i.e. the same order as
# all_new_df['stationID'].unique(). Rows with a missing stationID are skipped by
# groupby; previously they produced an empty frame that flagged every column.
for station, st_df in all_new_df.groupby('stationID', sort=False):
    for col in st_df.columns:
        if st_df[col].sum() == 0:
            new_uncollect.add(col)
            stations.append(station)
            columns.append(col)
new_uncollect_df = pd.DataFrame({'station': stations, 'column': columns})
new_uncollect

{'CO', 'NO2', 'O3', 'PM10', 'SO2'}

In [5]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
makedirs(others_basepath, exist_ok=True)
# A set has no defined order; sort it so the CSV is identical from run to run.
pd.DataFrame(sorted(new_uncollect, key=str)).to_csv(join(others_basepath, 'new_uncollect_columns.csv'))
# Station/column pairs, one row per flagged column.
new_uncollect_df.to_csv(join(others_basepath, 'new_uncollect_mapper.csv'))

## old data

### Investigate column names

In [7]:
# Map every column name returned by prepare_old_station_data() to a station that
# has it. Later stations overwrite earlier ones, so each value ends up being the
# last station (in iteration order) whose frame contained that column.
old_columns = {}
for station in old_station_paths:
    try:
        df = prepare_old_station_data(station)
        for col in df.columns:
            old_columns[col] = station
    except Exception as err:
        # Best effort: report the station that failed and why, then keep going.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        print(f'{station}: {err!r}')

In [8]:
# Show the column-name -> station mapping collected in the previous cell.
old_columns

{'CO': '19t',
 'NO2': '19t',
 'SO2': '19t',
 'O3': '19t',
 'PM10': '19t',
 'WS': '19t',
 'WD': '19t',
 'Temp': '19t',
 'PM25': '08t',
 'Rain': '13t',
 ' Total HC': '35t',
 'CH4 (ppm)': '35t',
 ' Glob rad': '35t'}

## Station 35t's file layout is too complicated, so we removed it

In [9]:
# Load station 35t's raw workbook as-is (no cleaning) to inspect its layout.
pd.read_excel(old_station_paths['35t'])

Unnamed: 0,ปี/เดือน/วัน,ชั่วโมง,CO,NO,O3,NO2,SO2,NOX,PM10,Total HC,...,WS comp.2,WS comp.3,WS comp.4,WS comp.5,CO8,Std wdir,Std wdir.1,Std_wspd,Std_wspd.1,Std_wspd.2
0,,,at 3 m (ppm),at 3 m (ppb),at 3 m (ppb),at 3 m (ppb),at 3 m (ppb),at 3 m (ppb),at 3 m (มคก./ลบ.ม.),(ppm),...,V10,V30,W10,W30,at 3 m (ppm),10,30,U,V,W
1,950101.0,100.0,,,,,,,,,...,,,,,,,,,,
2,950101.0,200.0,,,,,,,,,...,,,,,,,,,,
3,950101.0,300.0,,,,,,,,,...,,,,,,,,,,
4,950101.0,400.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61364,11231.0,2000.0,0.599999,5,10,13,0,17,54,1.69,...,,1.7,,-0.3,0.557142,,320,0.5,1.9,0
61365,11231.0,2100.0,0.599999,6,8,11,0,17,48,1.7,...,,2,,-0.3,0.585714,,325,0.3,1.6,0
61366,11231.0,2200.0,0.7,13,0,16,0,28,51,1.78,...,,1.7,,-0.3,0.6,,185,1,0.7,0
61367,11231.0,2300.0,0.7,8,1,14,0,22,55,1.77,...,,0.2,,-0.2,0.585714,,205,2,0.799999,0


## We rename the columns to the following common format

In [3]:
# Raw header -> canonical column name. Keys are matched verbatim, so leading and
# trailing spaces are significant (' SO2 ' and 'SO2' are different headers).
# NOTE(review): the relative-humidity headers (' Rel hum', 'RH', ' RH') are
# mapped to 'Rain' -- confirm this is intended.
col_mapper = {
    'CO': 'CO',
    ' NO2': 'NO2',
    ' SO2 ': 'SO2',
    'O3': 'O3',
    ' PM10': 'PM10',
    ' Wind speed': 'WS',
    ' Wind dir': 'WD',
    ' Temp': 'Temp',
    ' Rel hum': 'Rain',
    ' PM2.5': 'PM25',
    'PM10': 'PM10',
    'PM2.5': 'PM25',
    'NO2': 'NO2',
    'SO2': 'SO2',
    'WS': 'WS',
    'WD': 'WD',
    'TEMP': 'Temp',
    'RH': 'Rain',
    'PM2.5 ': 'PM25',
    ' CO': 'CO',
    ' WD': 'WD',
    ' WS ': 'WS',
    'Temp': 'Temp',
    ' TEMP': 'Temp',
    ' RH': 'Rain',
    ' CO ': 'CO',
    ' Rain': 'Rain',
    'CO(ppm)': 'CO',
    'PM10(มคก./ลบ.ม.)': 'PM10',
    'TMP': 'Temp',
}

# Raw headers that are discarded rather than renamed.
to_remove = [
    'Unnamed: 11',
    ' Pressure',
    'Unnamed: 10',
    'Unnamed: 8',
    'Unnamed: 9',
    'NO',
    'Nox',
    ' NO ',
    ' NOX ',
]

# Canonical columns every prepared station frame is expected to contain.
to_have = [
    'CO', 'NO2', 'SO2', 'O3', 'PM10',
    'WS', 'WD', 'Temp', 'Rain', 'PM25',
]

In [11]:
# Scan every legacy station and record, per station, the columns that carry no
# data: either the column sums to zero, or an expected column is absent.
# NOTE(review): only zero-sum columns are added to the `old_uncollect` set;
# absent columns appear in the station/column mapper only -- confirm intended.
old_uncollect = set()   # distinct zero-sum column names across all stations
old_columns = {}        # column name -> last station seen with that column
stations = []           # parallel lists feeding old_uncollect_df
columns = []
maximum_st = len(old_station_paths)  # progress bookkeeping; not read in this cell
i = 0
for station in old_station_paths:
    try:
        df = prepare_old_station_data(station)
        for col in df.columns:
            old_columns[col] = station
            if df[col].sum() == 0:
                old_uncollect.add(col)
                stations.append(station)
                columns.append(col)
        # ==================================================================
        # Also record expected columns that this station lacks entirely.
        # Walking `to_have` in its own order (instead of iterating a set
        # difference) makes the row order reproducible between runs.
        present = set(df.columns)
        for col in to_have:
            if col not in present:
                stations.append(station)
                columns.append(col)
        # ==================================================================
    except Exception as err:
        # Best effort: report the failing station and the reason, then continue.
        # `Exception` (not a bare except) so a kernel interrupt is not swallowed.
        print(f'{station}: {err!r}')
    i += 1
old_uncollect_df = pd.DataFrame({'station': stations, 'column': columns})
old_uncollect

{'Rain'}

In [12]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
makedirs(others_basepath, exist_ok=True)
# A set has no defined order; sort it so the CSV is identical from run to run.
pd.DataFrame(sorted(old_uncollect, key=str)).to_csv(join(others_basepath, 'old_uncollect_columns.csv'))
# Station/column pairs, one row per flagged or absent column.
old_uncollect_df.to_csv(join(others_basepath, 'old_uncollect_mapper.csv'))

In [10]:
# Try to merge legacy and new data for every station present in the new export,
# collecting in `false_list` the stations for which the merge raises.
false_list = []
for station in all_new_df['stationID'].unique():
    df = prepare_new_station_data(station)
    try:
        if station in old_station_paths:
            old_df = prepare_old_station_data(station)
            # Stack old on top of new, then de-duplicate on the timestamp.
            # drop_duplicates keeps the first occurrence, so the legacy row wins
            # where both sources cover the same timestamp.
            # NOTE(review): the rename assumes both frames carry an unnamed
            # index that reset_index() exposes as 'index' -- TODO confirm.
            df = (
                pd.concat([old_df, df])
                .reset_index()
                .rename(columns={'index': 'datetime'})
                .drop_duplicates(subset=['datetime'])
                .set_index('datetime')
            )
    except Exception as err:
        # Report the failing station and the reason; `Exception` rather than a
        # bare except so a kernel interrupt still stops the loop.
        print(f'{station}: {err!r}')
        false_list.append(station)
