In [None]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
from aeon.anomaly_detection import KMeansAD, PyODAdapter, DWT_MLEAD
from aeon.distances import euclidean_pairwise_distance
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from pyod.models.lof import LOF
from scipy import stats
from tqdm import tqdm

# Silence NumPy's floating-point "invalid value" warnings (e.g. 0/0 -> NaN)
# for the rest of the session; the resulting NaNs are still produced.
np.seterr(invalid='ignore')

In [None]:
!wget -nc https://archive.ics.uci.edu/static/public/501/beijing+multi+site+air+quality+data.zip -O ./data/data.zip

os.makedirs('./data/PRSA_Data', exist_ok=True)

!unzip -n ./data/data.zip -d ./data
!unzip -n ./data/PRSA2017_Data_20130301-20170228.zip -d data/
!mv --no-clobber ./data/PRSA_Data_20130301-20170228/* ./data/PRSA_Data

In [None]:
def process_data(path: str, encode_wd: bool=False) -> pd.DataFrame:
    """Load one station CSV and return a gap-free frame indexed by timestamp.

    The ``year``/``month``/``day``/``hour`` columns are combined into a
    ``timestamp`` DatetimeIndex and the running counter ``No`` is dropped.
    Numeric gaps are filled by time-weighted interpolation; whatever is still
    missing afterwards (leading gaps, non-numeric columns) is back-filled.

    Args:
        path: Path of the CSV file to read.
        encode_wd: When True, replace the ``wd`` column by one float indicator
            column per category. Rows with a missing ``wd`` are set to NaN in
            every indicator column so they are imputed like any other gap.

    Returns:
        The cleaned DataFrame.
    """
    date_parts = ['year', 'month', 'day', 'hour']

    df = pd.read_csv(path)
    # Assemble the timestamp explicitly instead of relying on the deprecated
    # dict form of ``parse_dates``.
    timestamp = pd.to_datetime(df[date_parts])
    df = df.drop(columns=date_parts + ['No'])
    df = df.set_index(pd.DatetimeIndex(timestamp, name='timestamp'))

    if encode_wd:
        missing_wd = df['wd'].isna().to_numpy()
        # get_dummies leaves all-zero rows for missing values, so no placeholder
        # category is needed (the placeholder approach failed with a KeyError
        # when the file had no missing ``wd`` at all). Float dtype lets the
        # indicator columns hold NaN without a dtype upcast.
        one_hot = pd.get_dummies(df['wd'], dtype=float)
        one_hot.loc[missing_wd, :] = np.nan
        df = df.join(one_hot).drop(columns=['wd'])

    # Time interpolation is only defined for numeric data; restrict it to those
    # columns so string columns such as 'station' are left untouched.
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].interpolate(method='time')

    # Forward interpolation cannot fill gaps at the very start of a series,
    # and string columns were skipped above: back-fill both.
    df = df.bfill()

    return df

def get_anomalies(df: pd.DataFrame) -> pd.DataFrame:
    """Run every anomaly detector on one station's measurements.

    Args:
        df: Time-indexed station frame. The ``wd`` and ``station`` columns are
            ignored; all remaining columns are used as input variables.

    Returns:
        A DataFrame with a ``timestamp`` column (taken from ``df.index``) and
        one result column per detector run: ``anomaly_score_KMeans`` and
        ``anomaly_score_PyoD`` computed on all variables together, plus one
        ``anomaly_score_<variable>`` column per variable from DWT_MLEAD.
    """
    columns = [column for column in df.columns if column not in ('wd', 'station')]

    # StandardScaler standardises each *column* of its input. Scale while the
    # data is still laid out as (time steps, variables) so every variable is
    # normalised over time; scaling after the transpose would instead
    # normalise each time step across unrelated variables.
    scaled = StandardScaler().fit_transform(df[columns].to_numpy())
    # The detectors are fed one row per variable, in the order of `columns`.
    x = np.ascontiguousarray(scaled.T)

    anomaly_df = pd.DataFrame({'timestamp': df.index})

    # Detectors applied to all variables together.
    anomaly_df['anomaly_score_KMeans'] = KMeansAD(random_state=42).fit_predict(x)
    anomaly_df['anomaly_score_PyoD'] = PyODAdapter(LOF()).fit_predict(x)

    # DWT_MLEAD is applied to each variable separately.
    detector = DWT_MLEAD()
    for name, series in zip(columns, x):
        anomaly_df[f'anomaly_score_{name}'] = detector.fit_predict(series)

    return anomaly_df

def get_distances(df_dict: dict, column:str) -> tuple[np.ndarray, np.ndarray]:
    """Compare one score column across all stations.

    Args:
        df_dict: Mapping of station name to that station's anomaly DataFrame.
        column: Name of the column to compare; it must exist, with the same
            length, in every DataFrame.

    Returns:
        A pair ``(euclidean, cosine)``: the pairwise Euclidean distances and
        the pairwise cosine similarities between the stations' series, with
        stations in the iteration order of ``df_dict``.
    """
    # One row per station.
    arrays = np.array([np.array(station_df[column]) for station_df in df_dict.values()])

    return euclidean_pairwise_distance(arrays), cosine_similarity(arrays)

def visualize_scores(df_dict: dict, column:str) -> None:
    """Plot one score column for every station as an interactive line chart.

    Args:
        df_dict: Mapping of station name to that station's anomaly DataFrame;
            each frame needs a ``timestamp`` column and ``column``.
        column: Score column to plot, e.g. ``'anomaly_score_PRES'``.
    """
    # The x axis uses the timestamps of the first station in the mapping.
    first_station = list(df_dict.keys())[0]
    vis_df = pd.DataFrame({'timestamp': df_dict[first_station]['timestamp']})

    for key, station_df in df_dict.items():
        vis_df[key] = station_df[column]

    # Long format: one row per (timestamp, station) pair for plotly express.
    vis_df = pd.melt(vis_df, ['timestamp'])

    # Split at most twice so a feature name that itself contains '_' stays
    # intact; names that do not follow '<a>_<b>_<feature>' are used verbatim
    # instead of raising an IndexError.
    names = column.split('_', 2)
    if len(names) == 3:
        title = names[0].capitalize() + ' ' + names[1].capitalize() + ': ' + names[2]
    else:
        title = column

    fig = px.line(vis_df, x="timestamp", y="value", color='variable', title=title)
    fig.show()

def get_whole_city_anomalies(df_dict: dict, column:str) -> pd.DataFrame:
    """Return the timestamps at which every station is an outlier at once.

    Args:
        df_dict: Mapping of station name to that station's anomaly DataFrame;
            each frame needs a ``timestamp`` column and ``column``.
        column: Score column to evaluate.

    Returns:
        A DataFrame with one column per station, indexed by the first
        station's timestamps, restricted to the rows where the absolute
        z-score (computed per station column) is at least 3 in all stations.
    """
    station_names = list(df_dict.keys())

    # One column of scores per station.
    scores = pd.DataFrame(
        {name: np.array(df_dict[name][column]) for name in station_names}
    )
    scores.index = df_dict[station_names[0]]['timestamp']

    abs_z = np.abs(stats.zscore(scores))
    extreme_everywhere = (abs_z >= 3).all(axis=1)

    return scores[extreme_everywhere]
    

In [None]:
path = './data/PRSA_Data'

# os.listdir returns entries in arbitrary order: sort them so the station
# order in anomaly_dict is reproducible, and skip anything that is not a CSV
# (stray files would break the file-name parsing below).
file_list = sorted(file for file in os.listdir(path) if file.endswith('.csv'))

os.makedirs('./data/anomalies/', exist_ok=True)
anomaly_files = os.listdir('./data/anomalies/')
anomaly_dict = {}

for file in tqdm(file_list, total=len(file_list)):
    # The station name is taken as the third '_'-separated token of the file name.
    station = file.split('_')[2]
    if f'{station}.csv' in anomaly_files:
        # Cached result from an earlier run: load it instead of recomputing.
        df = pd.read_csv(f'./data/anomalies/{station}.csv', parse_dates=['timestamp'])
        # Cache files written together with the row index carry an extra
        # 'Unnamed: 0' column; drop it if present.
        df = df[[column for column in df.columns if column != 'Unnamed: 0']]
        anomaly_dict[station] = df

    else:
        df = process_data(f'{path}/{file}')
        anomaly_dict[station] = get_anomalies(df)
        # index=False: the RangeIndex carries no information and would come
        # back as an 'Unnamed: 0' column on reload.
        anomaly_dict[station].to_csv(f'./data/anomalies/{station}.csv', index=False)

In [None]:
# Take the column list from whichever station comes first instead of a
# hard-coded station name, matching the whole-city summary cell.
first_station = list(anomaly_dict.keys())[0]

# Both matrices are station-by-station. Their diagonal compares each station
# with itself (distance 0, similarity 1) and would bias the averages, so only
# the pairs above the diagonal are averaged.
pair_idx = np.triu_indices(len(anomaly_dict), k=1)

for column in anomaly_dict[first_station].columns:
    if column != 'timestamp':
        euclid, cosine = get_distances(anomaly_dict, column)
        print(f'{column} --- Mean Euclid Distance: {round(np.mean(euclid[pair_idx]),2)} | Mean Cosine Similarity: {round(100*np.mean(cosine[pair_idx]),2)}%')

In [None]:
# Count, for every score column, the timestamps flagged in all stations at once.
reference_station = list(anomaly_dict.keys())[0]
for column in (name for name in anomaly_dict[reference_station].columns if name != 'timestamp'):
    n_city_anomalies = get_whole_city_anomalies(anomaly_dict, column).shape[0]
    print(f'{column} --- Whole city anomalies: {n_city_anomalies}')

In [None]:
# Plot the 'anomaly_score_PRES' column of every station over time.
visualize_scores(anomaly_dict, 'anomaly_score_PRES')

In [None]:
# Display the timestamps at which 'anomaly_score_PRES' is an outlier
# (|z-score| >= 3) in every station simultaneously.
get_whole_city_anomalies(anomaly_dict, 'anomaly_score_PRES')