# Init
Import all libraries here!

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
import numpy as np
from tqdm import tqdm
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt

# Explore data

## Read archive data

In [None]:
# Recursively collect every archive CSV found below the fixed_data directory
# and report how many files matched.
sensor_filepath = r"../data/SENSOR/fixed_data/**/*.csv"
sensor_files = glob.glob(pathname=sensor_filepath, recursive=True)
print(len(sensor_files))

In [None]:
# Load every archive CSV and tag its rows with the sensor it belongs to.
# The sensor id is taken from the name of the directory that directly
# contains the file.
archive_sensor_dfs = []
for path in sensor_files:
    # Use os.path instead of splitting on '/': glob returns backslash-separated
    # paths on Windows, where path.split('/')[-2] picks the wrong component.
    sensor_id = os.path.basename(os.path.dirname(path))
    df = pd.read_csv(path)
    df['SensorCode'] = sensor_id
    archive_sensor_dfs.append(df)

In [None]:
# Stack the per-file frames (each already tagged with its SensorCode) into one
# archive frame, then release the list of intermediate frames.
sensor_df = pd.concat(objs=archive_sensor_dfs, axis=0)
del archive_sensor_dfs

In [None]:
# Print the column dtypes / non-null counts of the archive frame; the
# describe() table is the cell's displayed output.
sensor_df.info()
sensor_df.describe()

In [None]:
# Preview the first rows of the archive frame.
sensor_df.head()

In [None]:
# Build a single timestamp column from the separate 'date' and 'time' columns.
# Values that do not match the expected layout become NaT (errors='coerce').
sensor_df['Datetime'] = pd.to_datetime(
    sensor_df['date'].astype(str) + ' ' + sensor_df['time'].astype(str),
    format="%Y-%m-%d %H:%M:%S",
    errors='coerce',
)
# The source columns are no longer needed once 'Datetime' exists.
sensor_df.drop(columns=['date', 'time'], inplace=True)

## Read new data
New data gathered by my friend.

In [None]:
# Collect the CSV files of the newer sensor export and list them.
new_sensor_filepath = r'../data/SENSOR/SENSOR(13-08-2022 _ 31-10-2022)/*.csv'
new_sensor_files = glob.glob(pathname=new_sensor_filepath, recursive=True)
print(new_sensor_files)

In [None]:
# Read every new-data CSV and stack the frames into one.
new_sensor_df = pd.concat([pd.read_csv(csv_path) for csv_path in new_sensor_files])

In [None]:
# Combine 'Date' and 'Time' into one timestamp column. The new data uses a
# day/month/year layout; non-matching values become NaT (errors='coerce').
new_sensor_df['Datetime'] = pd.to_datetime(
    new_sensor_df['Date'].astype(str) + ' ' + new_sensor_df['Time'].astype(str),
    format="%d/%m/%Y %H:%M:%S",
    errors='coerce',
)
new_sensor_df.drop(columns=['Date', 'Time'], inplace=True)

In [None]:
# new_sensor_df.info()

## Merge archive data with new data
Into one dataframe only!

In [None]:
# Show the archive column names (presumably to compare them with the new
# data's column names before renaming -- confirm against new_sensor_df).
sensor_df.columns

In [None]:
# Archive column name -> column name used after the merge.
# NOTE(review): 'Longtitude' is spelled this way on purpose here; it presumably
# matches the header in the new data files -- confirm before "fixing" it.
archive_column_renames = dict([
    ('latitude', 'Latitude'),
    ('longitude', 'Longtitude'),
    ('altitude', 'Altitude'),
    ('temperature', 'Temperature'),
    ('humidity', 'Humidity'),
    ('pm1.0', 'PM1.0'),
    ('pm2.5', 'PM2.5'),
    ('pm10', 'PM10'),
    ('uv', 'UV'),
    ('co', 'CO'),
    ('no2', 'NO2'),
    ('so2', 'SO2'),
    ('o3', 'O3'),
    ('rain', 'Rainfall'),
    ('wind_direction', 'Direction'),
    ('wind_gust', 'WindGust'),
    ('wind_avg', 'WindSpeed'),
])
sensor_df.rename(columns=archive_column_renames, inplace=True)

In [None]:
# Append the new data below the archive data. sensor_df is rebound to the
# result, so re-running this cell appends new_sensor_df another time.
sensor_df = pd.concat([sensor_df, new_sensor_df])

In [None]:
# Preview the first rows of the merged frame.
sensor_df.head()

## Convert and clear data
Convert the date and time to their respective formats, and delete NaNs.

In [None]:
# Remove the identifier columns that are not kept in the merged frame.
sensor_df.drop(columns=['SensorID', 'SensorName'], inplace=True)

In [None]:
# Day of the week of each measurement, derived from the timestamp.
sensor_df['WeekDay'] = sensor_df['Datetime'].dt.dayofweek

In [None]:
# Order the rows chronologically and renumber the index 0..n-1 in one call.
sensor_df.sort_values(by='Datetime', inplace=True, ignore_index=True)

In [None]:
# Give the two text columns an explicit string dtype so that the
# object -> numeric conversion performed afterwards leaves them alone.
for text_column in ('Direction', 'SensorCode'):
    sensor_df[text_column] = sensor_df[text_column].astype(dtype=pd.StringDtype())

In [None]:
# Convert every remaining object-dtype column to numbers; values that cannot
# be parsed become NaN. The name of each converted column is printed.
for col, col_dtype in sensor_df.dtypes.items():
    if col_dtype != 'object':
        continue
    sensor_df[col] = pd.to_numeric(sensor_df[col], errors='coerce')
    print(col)

In [None]:
# Drop every row that has a NaN in any column and report how many were removed.
row_counts = len(sensor_df)
sensor_df.dropna(inplace=True)
nan_row_count, row_counts = row_counts - len(sensor_df), len(sensor_df)
print('Number of NaN rows = ', nan_row_count)

Delete negative target values: 

Since the target columns (air pollutant measurements) must be non-negative, and my friend got errors while training because of these negative values, these rows must be deleted. I think the negative values are caused by sensor errors.

In [None]:
# Remove rows with a negative value in any target column, one column at a
# time, printing how many rows each column removed and the overall total.
target_columns = ['PM1.0', 'PM2.5', 'PM10', 'UV', 'CO', 'NO2', 'SO2', 'O3']
prev_row_counts = row_counts = len(sensor_df)
for target in target_columns:
    negative_row_labels = sensor_df[sensor_df[target] < 0].index
    sensor_df.drop(index=negative_row_labels, inplace=True)
    remaining_rows = len(sensor_df)
    print(target, ':', row_counts - remaining_rows)
    row_counts = remaining_rows
print('Negative target value rows deleted: ', prev_row_counts - len(sensor_df))

In [None]:
# Renumber the index 0..n-1 after the row deletions above left gaps in it.
sensor_df.reset_index(drop=True, inplace=True)

## Handle wind direction
Since the wind direction is a string with a maximum length of 3, I should split this string into 3 separate columns, each as a categorical feature. Then apply one-hot encoding or something?

Convert from labels to angles in radians (pi based), measured clockwise from North: North is 0, East is pi/2, South is pi and West is 3*pi/2.

In [None]:
# The 16 compass points after 'N', listed clockwise. Each step is pi/8 radians.
_compass_points_after_north = [
    'NNE', 'NE', 'ENE',
    'E', 'ESE', 'SE', 'SSE',
    'S', 'SSW', 'SW', 'WSW',
    'W', 'WNW', 'NW', 'NNW',
]

# Compass label -> angle in radians, measured clockwise starting at North.
direction_label_to_pi = {
    'X': -1,    # pseudo label (no direction); -1 is a placeholder, not an angle
    'N': 0,     # North is the origin; NW and NNW continue past 3*pi/2 toward 2*pi
    **{
        label: step * np.pi / 8
        for step, label in enumerate(_compass_points_after_north, start=1)
    },
}

In [None]:
# Translate each direction label to its angle via the lookup table; labels
# that are not keys of direction_label_to_pi end up as NaN.
sensor_df['WindDegree'] = sensor_df['Direction'].map(direction_label_to_pi)

In [None]:
# this cell check if the conversion has error. Just had to be sure!

wind_direction_valcounts = sensor_df['Direction'].value_counts(sort=True)
print(wind_direction_valcounts)
wind_degree_valcounts = sensor_df['WindDegree'].value_counts(sort=True)
print(wind_degree_valcounts)

winddir_valcounts_arr = wind_direction_valcounts.to_numpy()
winddegree_valcounts_arr = wind_degree_valcounts.to_numpy()
is_winddegree_conversion_error = False
for i in range(winddir_valcounts_arr.shape[0]):
    if (winddir_valcounts_arr[i] != winddegree_valcounts_arr[i]):
        print(i, winddir_valcounts_arr[i], winddegree_valcounts_arr[i])
        is_winddegree_conversion_error = True
        break
if (is_winddegree_conversion_error):
    print("There's a difference, conversion has error")
else: 
    print("No error.")

In [None]:
# Wind rose: one bar per direction label, bar length proportional to the
# share of rows with that label (scaled by 10).
ax = plt.subplot(111, polar=True)
# The angles in direction_label_to_pi are compass bearings (North = 0,
# increasing clockwise). Matplotlib's polar axes default to 0 at East and
# counter-clockwise, so orient the axes like a compass to draw each bar in
# its true direction.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)

total_count = wind_direction_valcounts.sum()
for direction_label, direction_count in wind_direction_valcounts.items():
    # 'X' is the "no direction" pseudo label and has no angle to draw.
    if direction_label == 'X':
        continue
    ax.bar(
        x=direction_label_to_pi[direction_label],
        height=10 * (direction_count / total_count),
        bottom=0,
        width=np.pi / 8,    # 16 compass points -> each sector spans 2*pi/16
    )
ax.set_title('Wind direction frequency');

In [None]:
# Encode the wind direction as a point on the unit circle so that angles
# close to 0 and close to 2*pi are treated as neighbours.
sensor_df['WindCos'] = np.cos(sensor_df['WindDegree'])
sensor_df['WindSin'] = np.sin(sensor_df['WindDegree'])

# Rows labelled 'X' (no direction) carry the placeholder angle -1; give them
# the zero vector instead. A single .loc assignment is used because chained
# indexing (df[col][mask] = ...) may write to a temporary copy: it raises
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
is_unknown_direction = sensor_df['Direction'] == 'X'
sensor_df.loc[is_unknown_direction, ['WindCos', 'WindSin']] = 0

In [None]:
# The raw angle is no longer needed once WindCos / WindSin exist.
sensor_df.drop(columns=['WindDegree'], inplace=True)

## Summary after clean

In [None]:
# Print the dtypes / non-null counts of the cleaned frame; the describe()
# table is the cell's displayed output.
sensor_df.info()
sensor_df.describe()

In [None]:
# Preview the first rows of the cleaned frame.
sensor_df.head()

# Read the camera csv

In [None]:
# Collect the camera CSV files and list them.
camera_filepath = r'../data/CAMERA/CAMERA_CSV(13-08-2022-_31-10-2022)/*.csv'
camera_files = glob.glob(pathname=camera_filepath, recursive=True)
print(camera_files)

In [None]:
# Read every camera CSV and stack the frames into one.
camera_df = pd.concat([pd.read_csv(csv_path) for csv_path in camera_files])

In [None]:
# Remove the camera identifier columns that are not exported.
camera_df.drop(columns=['CameraID', 'CameraName'], inplace=True)

In [None]:
# Combine 'Date' and 'Time' into one timestamp column. Camera files use a
# two-digit year (e.g. "13/08/22" and "19:50:02"); values that do not match
# become NaT (errors='coerce').
camera_df['Datetime'] = pd.to_datetime(
    camera_df['Date'].astype(str) + ' ' + camera_df['Time'].astype(str),
    format="%d/%m/%y %H:%M:%S",
    errors='coerce',
)
camera_df.drop(columns=['Date', 'Time'], inplace=True)
# A WeekDay column (Datetime.dt.day_of_week) is intentionally not added here.

In [None]:
# Order the camera rows chronologically and renumber the index 0..n-1.
camera_df.sort_values(by='Datetime', inplace=True, ignore_index=True)
# Display the valid (non-NaT) timestamps in ascending order.
camera_df['Datetime'].dropna().sort_values()

# Export preprocessed data

In [None]:
import datetime

In [None]:
# Write both preprocessed frames to timestamped CSV files.
# The timestamp is computed once so that the sensor and camera files of one
# run always share the same suffix; calling now() per file can straddle a
# second boundary and produce mismatched names.
export_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
sensor_df.to_csv('../data/SENSOR/sensors_' + export_timestamp + '.csv', index=False)
camera_df.to_csv('../data/CAMERA/cameras_' + export_timestamp + '.csv', index=False)