In [None]:
import numpy as np
import pandas as pd
import os

from sklearn.feature_selection import f_regression
from sklearn import metrics as sk_metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import plotly.express as px
import plotly.graph_objects as go

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
### draw raw data ###
#####################
# Site whose raw series are plotted. Available sites:
# "banqiao_new", "taoyuan_new", "zhongming_new", "wanhua_new", "tainan_new", "qianjin_new"
SITE = "banqiao_new"

# Use the SITE constant defined above. The lowercase `site` is only a loop
# variable of later cells: on a fresh kernel it is undefined, and after those
# cells ran it points at a different site than the legend label below.
DF_row_data = pd.read_csv( "clean_data/seperate/" + SITE + ".csv" )

fig = go.Figure()

# One thin dotted line per AirBox device found in the file.
for device_id in DF_row_data['device_id'].unique():
    mask_id = DF_row_data[ "device_id" ] == device_id
    device_rows = DF_row_data[mask_id]
    fig.add_trace(go.Scatter(x=device_rows.time_hourly, y=device_rows["PM2.5"],
                        mode='lines',
                        marker=dict( size=2 ),
                        line=dict( width=0.6, dash='dot' ),
                        name=device_id, opacity=0.8))

# EPA series drawn over all rows of the file as a thicker solid line.
fig.add_trace(go.Scatter(x=DF_row_data.time_hourly, y=DF_row_data["EPA_PM2.5"],
                    mode='lines',
                    line=dict( width=1.2 ),
                    name='EPA_'+ SITE))

# Label the figure so it can be read without the surrounding code.
fig.update_layout(title="Raw PM2.5 series: " + SITE,
                  xaxis_title="time_hourly",
                  yaxis_title="PM2.5")
fig.show()

In [None]:
def build( DF_row_data ):
    """Return a copy of `DF_row_data` with the "PM2.5" column renamed to "AirBox_PM2.5".

    The input frame is not modified. Columns other than "PM2.5" are kept as
    they are; if there is no "PM2.5" column the frame is returned unchanged
    (``DataFrame.rename`` ignores missing labels by default).

    Parameters
    ----------
    DF_row_data : pandas.DataFrame
        Frame read from one of the per-site CSV files.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed column.
    """
    # Operate on the argument itself. The previous body renamed the global
    # `DF_data`, so the function ignored whatever frame was passed in.
    return DF_row_data.rename( columns={"PM2.5":"AirBox_PM2.5"} )

In [None]:
# Overall AirBox-vs-EPA agreement scores per site (all devices pooled), plus
# the per-bin score files written by AirBox_Value.
# NOTE: AirBox_Value is defined in a later cell of this notebook; that cell has
# to be executed before this one.
JSON_ANA = {}
NEW_SITES = [ "banqiao_new", "taoyuan_new", "zhongming_new", "wanhua_new", "tainan_new", "qianjin_new" ]

# (feature column, bin width) pairs passed to AirBox_Value, in call order.
binned_features = (
    ( "AirBox_PM2.5", 10 ),
    ( "EPA_PM2.5", 6 ),
    ( "Temperature", 3 ),
    ( "Humidity", 3 ),
    ( "HR", 1 ),
)

for site in NEW_SITES:
    DF_data = pd.read_csv( "clean_data/seperate/" + site + ".csv" )
    DF_data = build( DF_data )

    pm_pair = DF_data[["EPA_PM2.5", "AirBox_PM2.5"]]
    epa = DF_data["EPA_PM2.5"]
    airbox = DF_data["AirBox_PM2.5"]
    mse = mean_squared_error( epa, airbox )

    JSON_ANA[ site ] = {
        "covariance": pm_pair.cov()['EPA_PM2.5']['AirBox_PM2.5'],
        "correlation": pm_pair.corr()['EPA_PM2.5']['AirBox_PM2.5'],
        # mean absolute deviation from EPA, as a percentage of the EPA value
        "error_rate": (abs(airbox - epa) / epa * 100).mean(),
        "MSE": mse,
        "RMSE": np.sqrt( mse ),
        "MAE": mean_absolute_error( epa, airbox ),
        # "R2" here is the squared Pearson correlation, not sklearn's r2_score
        "R2": np.corrcoef( x = pm_pair.T )[0][1] **2,
    }

    for feature, width in binned_features:
        AirBox_Value( DF_data, site, feature, width )

In [None]:
# Overall score table (scores as rows, sites as columns).
# The CSV is treated as a cache: when it is missing it is built from the
# JSON_ANA dict filled by the per-site loop, so the notebook no longer depends
# on a file produced by code that was commented out.
SCORE_ALL_PATH = "score_ana2/score_all.csv"
if not os.path.exists( SCORE_ALL_PATH ):
    if not JSON_ANA:
        # Avoid caching an empty table, e.g. after JSON_ANA was reset by a later cell.
        raise RuntimeError( "JSON_ANA is empty; run the per-site scoring cell first" )
    os.makedirs( os.path.dirname( SCORE_ALL_PATH ), exist_ok=True )
    pd.DataFrame( JSON_ANA ).to_csv( SCORE_ALL_PATH )

DF_ANA = pd.read_csv( SCORE_ALL_PATH, index_col=0 )
DF_ANA

In [None]:
def AirBox_Value( DF_data, site, feature, p ):
    """Score AirBox-vs-EPA PM2.5 agreement inside fixed-width bins of `feature`.

    Rows are grouped into half-open bins ``[value, value + p)`` that start at
    the floor of the smallest `feature` value. For every bin holding at least
    two rows, the following are computed between the "EPA_PM2.5" and
    "AirBox_PM2.5" columns: covariance, Pearson correlation, mean percentage
    error relative to EPA ("error_rate"), squared Pearson correlation ("R2"),
    MSE, RMSE and MAE. The resulting table (scores as rows, bin lower bounds as
    columns) is written to ``score_ana2/<site>/<feature>.csv``; the directory
    is created when needed. Nothing is returned.

    Parameters
    ----------
    DF_data : pandas.DataFrame
        Must contain the `feature`, "EPA_PM2.5" and "AirBox_PM2.5" columns.
    site : str
        Name of the output sub-directory.
    feature : str
        Column whose values define the bins; also the output file stem.
    p : int or float
        Bin width; must be positive.

    Raises
    ------
    ValueError
        If `p` is not positive (the bin cursor would never advance).
    """
    if p <= 0:
        raise ValueError( "bin width p must be positive" )

    feature_values = DF_data[feature]
    lowest = feature_values.min()
    highest = feature_values.max()

    JSON_G = {}
    # An empty or all-NaN column has NaN extremes: there is nothing to bin and
    # an empty table is written, as already happens when no bin has two rows.
    if pd.notna( lowest ):
        # Floor instead of int(): int() truncates toward zero, which would place
        # the first bin above a negative fractional minimum and drop those rows.
        value = int( np.floor( lowest ) )
        # A bin starting above the maximum cannot contain rows, so stop there.
        while value <= highest:
            G = DF_data[ (value <= feature_values) & (feature_values < value + p) ]

            # Sample covariance/correlation are undefined for a single row,
            # so bins with fewer than two rows are skipped.
            if G.shape[0] > 1:
                pm_pair = G[["EPA_PM2.5", "AirBox_PM2.5"]]
                epa = G["EPA_PM2.5"]
                airbox = G["AirBox_PM2.5"]
                mse = mean_squared_error( epa, airbox )
                JSON_G[ value ] = {
                    "covariance": pm_pair.cov()['EPA_PM2.5']['AirBox_PM2.5'],
                    "correlation": pm_pair.corr()['EPA_PM2.5']['AirBox_PM2.5'],
                    # NOTE: rows with EPA_PM2.5 == 0 make this inf/NaN (division by zero).
                    "error_rate": (abs(airbox - epa) / epa * 100).mean(),
                    "R2": np.corrcoef( x = pm_pair.T )[0][1] **2,
                    "MSE": mse,
                    "RMSE": np.sqrt( mse ),
                    "MAE": mean_absolute_error( epa, airbox ),
                }
            value += p

    # output
    DF_AirBox_PM25 = pd.DataFrame(JSON_G)
    filepath = "score_ana2/" + site + "/"
    filename = feature + ".csv"
    # exist_ok avoids the separate existence check and its race window.
    os.makedirs( filepath, exist_ok=True )
    DF_AirBox_PM25.to_csv( filepath + filename )

In [None]:
# Per-bin score of every site for one binned feature (all devices pooled).
fig2 = go.Figure()
# The pooled score files are named after the feature passed to AirBox_Value,
# where the AirBox reading is called "AirBox_PM2.5" (there is no "PM2.5.csv").
Feature = "AirBox_PM2.5"
# "AirBox_PM2.5", "Temperature", "EPA_PM2.5", "HR", "Humidity"
Score = "R2"
# "correlation", "error_rate", "R2", "MSE", "RMSE", "MAE", "covariance"
for site in NEW_SITES:
    DF_score = pd.read_csv( "score_ana2/"+ site +"/"+ Feature +".csv",index_col=0 )
    # Row = chosen score; its index holds the bin lower bounds (CSV column labels).
    score_row = DF_score.loc[Score]
    fig2.add_trace(go.Scatter(y=score_row, x=score_row.index,
                        mode='markers',
                        name=site))
fig2.update_layout(title=Score + " per " + Feature + " bin",
                   xaxis_title=Feature + " (bin lower bound)",
                   yaxis_title=Score)
fig2.show()

In [None]:
# Show the per-bin score table of one site for one binned feature.
SITE = "banqiao_new"
# "banqiao_new", "taoyuan_new", "zhongming_new", "wanhua_new", "tainan_new", "qianjin_new"
# The pooled score files use the feature names passed to AirBox_Value, so the
# AirBox reading is "AirBox_PM2.5" (no "PM2.5.csv" is written for pooled data).
Feature = "AirBox_PM2.5"
# "AirBox_PM2.5", "Temperature", "EPA_PM2.5", "HR", "Humidity"

DF_score = pd.read_csv( "score_ana2/"+ SITE +"/"+ Feature +".csv",index_col=0 )
DF_score

## Separate (per-device scores)

In [None]:
# Per-device binned scores: every device of every site gets its own set of
# score files, numbered in order of first appearance in the site's CSV.
# NOTE: AirBox_Value2 is defined in a later cell of this notebook; that cell
# has to be executed before this one.
JSON_ANA = {}  # reset here; not filled by this cell
NEW_SITES = [ "banqiao_new", "taoyuan_new", "zhongming_new", "wanhua_new", "tainan_new", "qianjin_new" ]

# (feature column, bin width) pairs passed to AirBox_Value2, in call order.
device_features = (
    ( "PM2.5", 10 ),
    ( "EPA_PM2.5", 6 ),
    ( "Temperature", 3 ),
    ( "Humidity", 3 ),
    ( "HR", 1 ),
)

for site in NEW_SITES:
    DF_data = pd.read_csv( "clean_data/seperate/" + site + ".csv" )
    for device_number, device_id in enumerate( DF_data['device_id'].unique(), start=1 ):
        device_data = DF_data[ DF_data[ "device_id" ] == device_id ]
        for feature_name, bin_width in device_features:
            AirBox_Value2( device_data, site, feature_name, bin_width, device_number )

In [None]:
def AirBox_Value2( DF_data, site, feature, p, i ):
    """Score one device's PM2.5-vs-EPA agreement inside fixed-width bins of `feature`.

    Rows are grouped into half-open bins ``[value, value + p)`` that start at
    the floor of the smallest `feature` value. For every bin holding at least
    two rows, the following are computed between the "EPA_PM2.5" and "PM2.5"
    columns: covariance, Pearson correlation, mean percentage error relative
    to EPA ("error_rate"), squared Pearson correlation ("R2"), MSE, RMSE and
    MAE. The resulting table (scores as rows, bin lower bounds as columns) is
    written to ``score_ana2/<site>/seperate/<feature>-<i>.csv``; the directory
    is created when needed. Nothing is returned.

    Parameters
    ----------
    DF_data : pandas.DataFrame
        Must contain the `feature`, "EPA_PM2.5" and "PM2.5" columns.
    site : str
        Name of the output sub-directory.
    feature : str
        Column whose values define the bins; also the output file stem.
    p : int or float
        Bin width; must be positive.
    i : int
        Number appended to the file name (converted with ``str``).

    Raises
    ------
    ValueError
        If `p` is not positive (the bin cursor would never advance).
    """
    if p <= 0:
        raise ValueError( "bin width p must be positive" )

    feature_values = DF_data[feature]
    lowest = feature_values.min()
    highest = feature_values.max()

    JSON_G = {}
    # An empty or all-NaN column has NaN extremes: there is nothing to bin and
    # an empty table is written, as already happens when no bin has two rows.
    if pd.notna( lowest ):
        # Floor instead of int(): int() truncates toward zero, which would place
        # the first bin above a negative fractional minimum and drop those rows.
        value = int( np.floor( lowest ) )
        # A bin starting above the maximum cannot contain rows, so stop there.
        while value <= highest:
            G = DF_data[ (value <= feature_values) & (feature_values < value + p) ]

            # Sample covariance/correlation are undefined for a single row,
            # so bins with fewer than two rows are skipped.
            if G.shape[0] > 1:
                pm_pair = G[["EPA_PM2.5", "PM2.5"]]
                epa = G["EPA_PM2.5"]
                device_pm = G["PM2.5"]
                mse = mean_squared_error( epa, device_pm )
                JSON_G[ value ] = {
                    "covariance": pm_pair.cov()['EPA_PM2.5']['PM2.5'],
                    "correlation": pm_pair.corr()['EPA_PM2.5']['PM2.5'],
                    # NOTE: rows with EPA_PM2.5 == 0 make this inf/NaN (division by zero).
                    "error_rate": (abs(device_pm - epa) / epa * 100).mean(),
                    "R2": np.corrcoef( x = pm_pair.T )[0][1] **2,
                    "MSE": mse,
                    "RMSE": np.sqrt( mse ),
                    "MAE": mean_absolute_error( epa, device_pm ),
                }
            value += p

    # output
    DF_AirBox_PM25 = pd.DataFrame(JSON_G)
    filepath = "score_ana2/" + site + "/seperate/"
    filename = feature + "-" + str(i) + ".csv"
    # exist_ok avoids the separate existence check and its race window.
    os.makedirs( filepath, exist_ok=True )
    DF_AirBox_PM25.to_csv( filepath + filename )

In [None]:
# Per-bin score of every device of every site for one binned feature.
fig3 = go.Figure()
Feature = "Humidity"
# "PM2.5", "Temperature", "EPA_PM2.5", "HR", "Humidity"
Score = "correlation"
# "correlation", "error_rate", "R2", "MSE", "RMSE", "MAE", "covariance"
for site in NEW_SITES:
    device_dir = "score_ana2/"+ site +"/seperate/"
    file_prefix = Feature + "-"
    # Discover the device numbers actually written for this feature instead of
    # assuming exactly two devices per site. Files are named "<Feature>-<n>.csv".
    device_numbers = sorted(
        int( file_name[len(file_prefix):-len(".csv")] )
        for file_name in os.listdir( device_dir )
        if file_name.startswith( file_prefix )
        and file_name.endswith( ".csv" )
        and file_name[len(file_prefix):-len(".csv")].isdigit()
    )
    for i in device_numbers:
        DF_score = pd.read_csv( device_dir + Feature + "-" + str(i) + ".csv",index_col=0 )
        # Row = chosen score; its index holds the bin lower bounds (CSV column labels).
        score_row = DF_score.loc[Score]
        fig3.add_trace(go.Scatter(y=score_row, x=score_row.index,
                            mode='markers',
                            name=site+str(i)))
fig3.update_layout(title=Score + " per " + Feature + " bin, by device",
                   xaxis_title=Feature + " (bin lower bound)",
                   yaxis_title=Score)
fig3.show()