# Convert into uncertain temporal pattern

Prerequisite: Run aqinew.ipynb or label the AQI index values first.

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
import numpy as np
from tqdm import tqdm
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt

In [None]:
# Names of the pollutant columns in the input table.
pollutants = [
    'PM2.5',
    'PM10',
    'CO',
    'NO2',
    'SO2',
    'O3',
]
# Names of the per-pollutant AQI columns, each the pollutant name prefixed with 'AQI_'.
aqi_columns = [
    'AQI_' + pollutant_name
    for pollutant_name in ('O3', 'NO2', 'SO2', 'CO', 'PM2.5', 'PM10')
]

# Read data

In [None]:
# Path of the input CSV; the same path is written into setting.json when the results are saved.
input_path = r"../data/AQI/20221222_0224/AQI.csv"
# Load the whole table as the working dataframe that every later cell modifies.
label_df = pd.read_csv(input_path)

# Preprocess

In [None]:
# Identifier columns: excluded from normalisation and from fuzzy labelling.
kept_columns = ['Datetime', 'SensorCode']

# Columns removed up front; nothing later in the notebook reads them.
unused_columns = [
    'dt',
    'LatitudeCam',
    'LongtitudeCam',
    'LatitudeSensor',
    'LongtitudeSensor',
]
label_df.drop(columns=unused_columns, inplace=True)

# Parse timestamps; values that cannot be parsed become NaT instead of raising.
label_df['Datetime'] = pd.to_datetime(label_df['Datetime'], errors='coerce')

In [None]:
# Scale every measurement column by its own maximum value.
for name in label_df.columns:
    # Identifier columns are left alone; AQI columns are handled differently
    # (they get their own absolute thresholds later).
    if name not in kept_columns and 'AQI' not in name:
        peak = label_df[name].max()
        # NOTE(review): a column whose maximum is 0 or NaN ends up as NaN/inf here.
        label_df[name] = label_df[name].values / peak

## Define fuzzy negation range

In [None]:
# Fuzzy thresholds for the normalised (non-AQI) columns.
# 'label' and 'normal' are parallel lists: normal[i] is the value at which the
# membership of label[i] reaches 1. Between two consecutive thresholds the
# membership moves linearly from the lower label to the upper one.
#
# Earlier per-label range layout, kept for reference:
#   LOW: [0.05, 0.2], MED: [0.2, 0.5], HIGH: [0.5, 0.7]
fuzzy_range_label = dict(
    label=['LOW', 'MODERATE', 'HIGH'],
    normal=[0.1, 0.2, 0.4],
)
# Columns skipped by the generic fuzzy labelling: the AQI columns plus the identifier columns.
exclude_columns = [*aqi_columns, *kept_columns]

In [None]:
# Notebook display: show which columns are skipped by the generic fuzzy labelling.
exclude_columns

The step below handles all columns except the excluded ones.

In [None]:
def fuzzy_convert(row, min, max):
    """Linearly rescale ``row`` so that ``min`` maps to 0 and ``max`` maps to 1.

    Args:
        row: Value (or array-like supporting arithmetic) to rescale.
        min: Lower bound of the interval; mapped to 0.
        max: Upper bound of the interval; mapped to 1.

    Returns:
        ``(row - min) / (max - min)``. Values outside ``[min, max]`` are not
        clipped, so the result may fall outside ``[0, 1]``.
    """
    offset = row - min
    width = max - min
    return offset / width
    
# Build one fuzzy-membership column per (column, label) pair for every
# non-excluded column. Assignments go through ``label_df.loc[mask, column]``:
# the chained form ``label_df[column][mask] = ...`` writes to an intermediate
# Series and is not guaranteed to reach the dataframe (it never does under
# pandas Copy-on-Write).
labels = fuzzy_range_label['label']
thresholds = fuzzy_range_label['normal']
label_steps = list(zip(labels, thresholds))

for col in tqdm(label_df.columns):
    if col in exclude_columns:
        continue

    # Start every membership at 0.0 (float) so fractional memberships can be
    # stored without changing the column dtype afterwards.
    for l in labels:
        label_df[col + '_' + l] = 0.0

    # Walk over consecutive (lower, upper) threshold pairs.
    for (l1, vmin), (l2, vmax) in tqdm(list(zip(label_steps, label_steps[1:]))):
        lc1, lc2 = col + '_' + l1, col + '_' + l2
        condition = (label_df[col] >= vmin) & (label_df[col] <= vmax)
        # Membership of the upper label rises linearly from 0 at vmin to 1 at
        # vmax; the lower label gets the complement.
        upper_membership = fuzzy_convert(label_df.loc[condition, col].to_numpy(), vmin, vmax)
        label_df.loc[condition, lc2] = upper_membership
        label_df.loc[condition, lc1] = 1 - upper_membership

    # Values below the first threshold belong fully to the first label, values
    # at or above the last threshold fully to the last label.
    first_label, last_label = labels[0], labels[-1]
    label_df.loc[label_df[col] < thresholds[0], col + '_' + first_label] = 1
    label_df.loc[label_df[col] >= thresholds[-1], col + '_' + last_label] = 1


## Handle AQI levels

I'll divide each AQI column into 5 levels (lv1–lv5).

In [None]:
# Fuzzy thresholds for the AQI columns, on the raw AQI scale.
# 'label' and 'normal' are parallel lists: normal[i] is the AQI value at which
# the membership of label[i] reaches 1.
#
# Earlier candidates, kept for reference:
#   labels:     GOOD, MODERATE, UNHEALTHY_FOR_SENSITIVE_GROUP, UNHEALTHY,
#               VERY_UNHEALTHY, HAZARDOUS
#   thresholds: [25, 75, 125, 175, 250, 350]
#               [0, 50, 100, 150, 200, 300, 500]
aqi_fuzzy_range_label = dict(
    label=[f'lv{level}' for level in range(1, 6)],
    normal=[5, 12, 50, 100, 200],
)

In [None]:
# Build one fuzzy-membership column per (AQI column, level) pair.
# Assignments go through ``label_df.loc[mask, column]``: the chained form
# ``label_df[column][mask] = ...`` writes to an intermediate Series and is not
# guaranteed to reach the dataframe (it never does under pandas Copy-on-Write).
aqi_labels = aqi_fuzzy_range_label['label']
aqi_thresholds = aqi_fuzzy_range_label['normal']
aqi_steps = list(zip(aqi_labels, aqi_thresholds))

for col in tqdm(aqi_columns):
    # Start every membership at 0.0 (float) so fractional memberships can be
    # stored without changing the column dtype afterwards.
    for l in aqi_labels:
        label_df[col + '_' + l] = 0.0

    # Walk over consecutive (lower, upper) threshold pairs.
    for (l1, vmin), (l2, vmax) in tqdm(list(zip(aqi_steps, aqi_steps[1:]))):
        lc1, lc2 = col + '_' + l1, col + '_' + l2
        # Build the range mask once per step instead of once per use.
        in_range = (label_df[col] <= vmax) & (label_df[col] >= vmin)
        # Membership of the upper level rises linearly from 0 at vmin to 1 at
        # vmax; the lower level gets the complement.
        upper_membership = fuzzy_convert(label_df.loc[in_range, col].to_numpy(), vmin, vmax)
        label_df.loc[in_range, lc2] = upper_membership
        label_df.loc[in_range, lc1] = 1 - upper_membership

    # Values below the first threshold belong fully to the first level, values
    # at or above the last threshold fully to the last level.
    first_label, last_label = aqi_labels[0], aqi_labels[-1]
    label_df.loc[label_df[col] < aqi_thresholds[0], col + '_' + first_label] = 1
    label_df.loc[label_df[col] >= aqi_thresholds[-1], col + '_' + last_label] = 1

In [None]:
# Notebook display: frequency of each membership value in the AQI_O3 level-2 column.
label_df['AQI_O3_lv2'].value_counts()

In [None]:
# Notebook display: summary statistics of every AQI_O3 membership column (one per level).
o3_level_columns = [f'AQI_O3_{level}' for level in aqi_fuzzy_range_label['label']]
label_df[o3_level_columns].describe()

In [None]:
# Notebook display: frequency of each membership value in the AQI_O3 level-4 column.
label_df['AQI_O3_lv4'].value_counts()

In [None]:
# Remove the raw pollutant columns first ...
label_df.drop(columns=pollutants, inplace=True)

# ... then every remaining column whose name starts with a pollutant name
# (the fuzzy label columns generated from those pollutants).
pollutant_prefixes = tuple(pollutants)
derived_columns = [name for name in label_df.columns if name.startswith(pollutant_prefixes)]
label_df.drop(columns=derived_columns, inplace=True)

## Get rid of labels from rows without wind+rain

In [None]:
# Wind/rain measurement columns whose fuzzy labels are cleared for rows
# without a wind reading.
windrain_columns = ['WindGust', 'WindSpeed', 'Rainfall', 'WindCos', 'WindSin']

# A row counts as "no wind+rain data" when both wind direction components are
# exactly 0. The mask only reads the raw WindCos/WindSin columns, which the
# loop never modifies, so it is computed once.
no_windrain = (label_df['WindCos'] == 0) & (label_df['WindSin'] == 0)

for col in windrain_columns:
    for label in fuzzy_range_label['label']:
        lc = col + '_' + label
        # Use .loc rather than label_df[lc][mask] = 0: chained assignment is
        # not guaranteed to write through to the dataframe.
        label_df.loc[no_windrain, lc] = 0

In [None]:
# Replace every remaining missing value in the table with 0, in place.
label_df.fillna(0, inplace=True)

# Last cleaning

In [None]:
# Keep only the identifier columns and the generated fuzzy-label columns.
# Generated columns are always named '<column>_<label>', so match on that exact
# suffix. A plain substring test would also keep any raw column whose name
# merely contains a label token somewhere.
label_suffixes = tuple(
    '_' + l for l in (fuzzy_range_label['label'] + aqi_fuzzy_range_label['label'])
)
dropped_columns = [
    col
    for col in label_df.columns
    if col not in kept_columns and not col.endswith(label_suffixes)
]

label_df.drop(dropped_columns, axis=1, inplace=True)
label_df.info()

In [None]:
# Truncate every membership probability below 0.5 to 0 so it is effectively no
# longer used. Per the original author's note, this is also meant to leave each
# transaction with only probabilistically independent items.
for col in label_df.columns:
    if col in kept_columns:
        continue
    # Use .loc rather than label_df[col][mask] = 0: chained assignment is not
    # guaranteed to write through to the dataframe.
    label_df.loc[label_df[col] < 0.5, col] = 0

## Handle special columns

In [None]:
# Add the day-of-week name of each timestamp as a categorical (string) column.
label_df['WeekDay'] = label_df['Datetime'].dt.day_name()
# Notebook display: number of rows per weekday.
label_df['WeekDay'].value_counts()

In [None]:
# Bucket each timestamp to the nearest 3-hour mark and keep the hour of that
# mark as a string, so it is treated as a category rather than a number.
# An explicit Hour offset replaces the '3H' alias, whose upper-case spelling is
# deprecated in newer pandas releases.
label_df['HourTriple'] = label_df['Datetime'].dt.round(freq=pd.offsets.Hour(3)).dt.hour.astype(str)
# Notebook display: number of rows per 3-hour bucket.
label_df['HourTriple'].value_counts()

In [None]:
# Remove the sensor identifier column from the table.
label_df.drop(['SensorCode'], axis=1, inplace=True)

In [None]:
# One-hot encode the remaining categorical (string) columns.
# The dtype is pinned so the indicator columns are written as 0/1: since
# pandas 2.0 the default indicator dtype is bool, which would be saved to the
# CSV as True/False next to the numeric membership values.
label_df = pd.get_dummies(label_df, dtype='uint8')
label_df.info()

## Clean for sparser data?

# Save the results

In [None]:
import datetime

# Each run writes into its own folder, named after the current date and time (to the minute).
run_stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
output_folder = f'../data/UTDATABASE/utd_{run_stamp}'
os.makedirs(output_folder, exist_ok=True)

# Write the final table without the index column and report where it went.
out_dbpath = os.path.join(output_folder, 'label.csv')
label_df.to_csv(out_dbpath, index=False)
print(out_dbpath)

In [None]:
import json

# Record the input path and both fuzzy threshold maps next to the output table.
run_settings = {
    'input_path': input_path,
    'aqi_fuzzymap': aqi_fuzzy_range_label,
    'other_fuzzymap': fuzzy_range_label,
}
settings_path = os.path.join(output_folder, 'setting.json')
with open(settings_path, 'w') as f:
    json.dump(run_settings, f, indent=4)

# with open(os.path.join(output_folder, 'aqi_fuzzy_range_label.json'), 'w') as f:
#     f.write( json.dumps( aqi_fuzzy_range_label, indent=4 ))
# with open(os.path.join(output_folder, 'fuzzy_range_label.json'), 'w') as f:
#     f.write( json.dumps( fuzzy_range_label, indent=4 ))

Completed. Move on to the last step: mining this data.