In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from colorama import Fore
import gc

from datetime import datetime, timedelta
import time
import math

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the gzip-compressed results table and preview its first rows.
RESULTS_FILE = "ALL.csv.gz"
source_df = pd.read_csv(RESULTS_FILE, compression="gzip")
source_df.head()

In [None]:
# Analysis frame: the loaded results without the two training-set columns.
ana_df = source_df.drop(columns=["training_AirBoxs", "training_EPA"])

# Distinct values of each categorical column, in order of first appearance.
site_list, method_list, feature_list, day_list = (
    list(ana_df[column].unique())
    for column in ("site", "method", "feature", "day")
)

# Covered period, taken from the raw "date" column before any parsing.
earliest, latest = ana_df["date"].min(), ana_df["date"].max()

# Parsed copy of "date" for time-based grouping later on.
ana_df["date_datetime"] = pd.to_datetime(ana_df["date"])

In [None]:
# Print the categorical values and the covered period; headings are coloured green.
summary_template = (
    "{green}**site list:\n{reset}{site}\n\n"
    "{green}**method list:\n{reset}{method}\n\n"
    "{green}**feature list:\n{reset}{feature}\n\n"
    "{green}**day list:\n{reset}{day}\n\n"
    "{green}**time period:\n{reset}{earliest} - {latest}\n\n"
)
summary_values = dict(
    green=Fore.GREEN,
    reset=Fore.RESET,
    site=site_list,
    method=method_list,
    feature=feature_list,
    day=day_list,
    earliest=earliest,
    latest=latest,
)
print(summary_template.format(**summary_values))

# All sites during the whole time period
The best Train_R2 of each site for every day

1. Line chart, daily
2. bar polar animation, daily
3. bar polar animation, monthly (Train_R2 mean)

---

useful dataframe
- all_best_df: each day's best-Train_R2 row for each site, with all columns
- all_best_df_monthly: monthly mean of each site with "RMSE", "Train_R2", "Test_R2"

Just an overview :)

In [None]:
# Find the best result of all time/site
def find_best( all_best_df, element="Train_R2" ):
    """Return, for every (date, site) pair, the row with the highest `element`.

    Parameters
    ----------
    all_best_df : pandas.DataFrame
        Frame with at least the columns "date", "site" and `element`.
        Its index labels are assumed to be unique.
    element : str, default "Train_R2"
        Name of the column to maximise within each (date, site) group.

    Returns
    -------
    pandas.DataFrame
        One row per (date, site) group, ordered by the group keys, with a fresh
        0..n-1 index. Groups in which `element` is entirely missing are skipped.
    """
    # Rows without a score can never be "the best"; dropping them first also keeps
    # idxmax from failing on groups where every score is missing.
    scored = all_best_df.dropna(subset=[element])

    # idxmax yields index *labels*, so the rows must be fetched with .loc.
    # Positional .iloc only gives the same rows on a default RangeIndex.
    best_labels = scored.groupby(['date', 'site'])[element].idxmax()

    return all_best_df.loc[best_labels].reset_index(drop=True)

def add_trace_site( site_df, site_name ):
    """Append one site's Train_R2-over-date line to the global `fig_ALL` figure.

    `site_df` must provide "date" and "Train_R2" columns; `site_name` becomes the
    legend entry of the new trace.
    """
    dates = list(site_df.date)
    scores = list(site_df.Train_R2)
    trace = go.Scatter(x=dates, y=scores, name=site_name)
    fig_ALL.add_trace(trace)

In [None]:
# Keep, for every (date, site) pair, the row with the highest Train_R2.
all_best_df = find_best(ana_df, element="Train_R2")
all_best_df.head()

In [None]:
# Monthly mean of the numeric columns of `all_best_df`, one row per (site, month).
all_best_df_monthly = all_best_df.drop(columns=["day", "feature", "method", "num"])

# numeric_only=True: the frame still carries the string "date" column, and pandas
# >= 2.0 raises a TypeError instead of silently skipping non-numeric columns.
all_best_df_monthly = (
    all_best_df_monthly
    .groupby(["site", "date_datetime"])
    .mean(numeric_only=True)
)

# Regroup by site (index level 0) and by calendar month of the datetime level.
# NOTE(review): newer pandas releases prefer the alias 'ME' over 'M' — confirm
# against the pandas version this notebook is pinned to.
level_values = all_best_df_monthly.index.get_level_values
all_best_df_monthly = (
    all_best_df_monthly
    .groupby([level_values(0), pd.Grouper(freq='M', level=-1)])
    .mean(numeric_only=True)
    .reset_index()
)

# "YYYY-MM" label used as the animation frame of the monthly chart.
all_best_df_monthly['month'] = all_best_df_monthly['date_datetime'].dt.strftime("%Y-%m")


In [None]:
# Line chart with one Train_R2 trace per site over the whole period.
fig_ALL = go.Figure()

# Traces are added in `site_list` order.
for site in site_list:
    rows_of_site = all_best_df[all_best_df['site'] == site]
    add_trace_site(rows_of_site, site)

# Quick-zoom buttons shown above the plot.
range_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]

# Title, range selector and range slider in one layout update.
fig_ALL.update_layout(
    title_text="All sites during the whole time period",
    xaxis=dict(
        rangeselector=dict(buttons=range_buttons),
        rangeslider=dict(visible=True),
        type="date",
    ),
)

fig_ALL.show()

In [None]:
# Animated polar bar chart: one frame per date, one bar per site; bar length and
# colour both show that day's best Train_R2.
# Available templates include "plotly", "plotly_dark", "plotly_white".
fig_ALL2 = px.bar_polar(
    all_best_df,
    theta="site",
    r="Train_R2",
    color="Train_R2",
    animation_frame="date",
    template="plotly_dark",
    # Train_R2 is numeric, so plotly express colours it on a continuous scale and
    # ignores color_discrete_sequence; the palette has to be passed here instead.
    color_continuous_scale=px.colors.sequential.Plasma_r,
    range_color=[0, 1],
)

fig_ALL2.show()

In [None]:
# Animated polar bar chart of the monthly mean Train_R2: one frame per month,
# one bar per site.
fig_ALL3 = px.bar_polar(
    all_best_df_monthly,
    theta="site",
    r="Train_R2",
    color="Train_R2",
    animation_frame="month",
    template="plotly",
    # Train_R2 is numeric, so plotly express colours it on a continuous scale and
    # ignores color_discrete_sequence; the palette has to be passed here instead.
    color_continuous_scale=px.colors.sequential.Plasma_r,
    range_color=[0, 1],
)

fig_ALL3.show()

# Take a look at each site

Please set `sites_name`, `start_date`, and `end_date` in the options block below before running.

---
Elements of `comparison_config`:

- ```HANDS_SITES```
- ```NORTH_SITES``` = [ "keelung", "xinying", "banqiao", "shilin", "wanhua", "taoyuan", "pingzhen", "yangming" ]
- ```CHU_MIAO_SITES``` = [ "zhudong", "hsinchu", "miaoli", "sanyi" ]
- ```CENTRAL_SITES``` = [ "fengyuan", "zhongming", "changhua", "nantou" ]
- ```YUN_CHIA_NAN_SITES``` = [ "douliu", "puzi", "chiayi", "xinying", "tainan" ]
- ```KAO_PING_SITES``` = [ "meinong", "qianjin", "pingtung", "hengchun" ]
- ```YILAN_SITES``` = [ "yilan" ]
- ```HUA_TUNG``` = [ "taitung", "hualien" ]
- ```ISLAND``` = [ "matsu", "kinmen", "magong" ]

- ```NEW_SITES``` = [ "banqiao", "taoyuan", "zhongming", "wanhua", "tainan", "qianjin" ]


In [None]:
# Site groupings live in the project-local comparison_config module.
import comparison_config as config

# Mapping keyed by site name; each value is that site's list of device ids.
HANDS_SITES = config.HANDS_SITES
ALL_SITES = list( HANDS_SITES.keys() )

# Regional groups of site names.
NORTH_SITES = config.NORTH_SITES
CHU_MIAO_SITES = config.CHU_MIAO_SITES
CENTRAL_SITES = config.CENTRAL_SITES
YUN_CHIA_NAN_SITES = config.YUN_CHIA_NAN_SITES
KAO_PING_SITES = config.KAO_PING_SITES
YILAN_SITES = config.YILAN_SITES
HUA_TUNG = config.HUA_TUNG
ISLAND = config.ISLAND

NEW_SITES = config.NEW_SITES

# Directory holding one raw-measurement CSV per site (<site>.csv).
ROW_PATH = "row_data/"

In [None]:
# Notched box plots of Train_R2 grouped by the "day" column and coloured by
# method, with one animation frame per site.
fig_sites = px.box(
    ana_df,
    x="day",
    y="Train_R2",
    color="method",
    animation_frame="site",
    notched=True,
    range_y=[-3, 1.5],
)
fig_sites.show()

In [None]:
def add_trace_site2( site, site_df, row_EPA_df, row_AirBox_dfs, cal_df ):
    """Add all traces of one site to the global `fig_site2` figure.

    Traces are appended in a fixed order: the Train_R2 step line, the EPA PM2.5
    line, one dotted line per frame in `row_AirBox_dfs` (numbered from 1), and
    finally the calibrated PM2.5 line taken from `cal_df`.
    """
    add = fig_site2.add_trace

    # Train_R2 (step line)
    add(go.Scatter(x=site_df['date'], y=site_df['Train_R2'],
                   name=site + "_R2", line_shape='vh'))

    # raw EPA PM2.5
    add(go.Scatter(x=row_EPA_df['datetime'], y=row_EPA_df['PM2.5'],
                   name=site + "_EPA", line=dict(width=2)))

    # raw AirBox PM2.5, one trace per device frame
    for number, airbox_df in enumerate(row_AirBox_dfs, start=1):
        add(go.Scatter(x=airbox_df['datetime'], y=airbox_df['PM2_5'],
                       name=site + "_AirBox" + str(number),
                       line=dict(width=1, dash='dot'), opacity=0.6))

    # calibrated PM2.5
    add(go.Scatter(x=cal_df['datetime'], y=cal_df['PM25_C'],
                   name=site + "_cal", line=dict(width=3)))
    

In [None]:
def add_buttons( sites_name ):
    """Build the `updatemenus` list of visibility buttons for the site figure.

    Every site is treated as a block of ``len(HANDS_SITES[site]) + 2`` consecutive
    traces. The returned menu has one button per site (hides the first trace of
    that site's block, shows the rest of the block, hides everything else), then a
    "Training R2" button (first trace of every block) and an "EPA PM2.5" button
    (second trace of every block).
    """
    # Number of traces contributed by each site, in `sites_name` order.
    block_sizes = [len(HANDS_SITES[name]) + 2 for name in sites_name]
    total_num = sum(block_sizes)

    buttons = []
    training_v, EPA_PM25_v = [], []
    offset = 0
    for name, size in zip(sites_name, block_sizes):
        # Per-site button: everything hidden except this block minus its first trace.
        site_v = [False] * total_num
        site_v[offset + 1:offset + size] = [True] * (size - 1)
        buttons.append(dict(label=name, method="update", args=[{"visible": site_v}]))

        # Cross-site buttons: extend the masks by this block.
        training_v += [True] + [False] * (size - 1)
        EPA_PM25_v += [False, True] + [False] * (size - 2)

        offset += size

    buttons.append(dict(label="Training R2", method="update",
                        args=[{"visible": training_v}]))
    buttons.append(dict(label="EPA PM2.5", method="update",
                        args=[{"visible": EPA_PM25_v}]))

    return [dict(type="buttons", active=0, buttons=buttons)]

In [None]:
# Build the per-site detail figure: for every selected site add the Train_R2
# line, the raw EPA and AirBox PM2.5 series and the calibrated AirBox series.
fig_site2 = go.Figure()

### replace options ###
sites_name = CENTRAL_SITES
start_date = "2020-01-01"
end_date = "2020-02-29"
#######################

for site in sites_name:
    ROW_devices_df = pd.read_csv( ROW_PATH + site + ".csv" ).drop( ["lat", "lon"], axis=1 )
    # Filter on the date part only (first 10 characters of "%Y-%m-%d %H:%M:%S").
    # Comparing the full timestamp string against "YYYY-MM-DD" would exclude every
    # reading taken during end_date, although end_date is included for Train_R2.
    date_mask = ROW_devices_df['datetime'].str[:10].between( start_date, end_date )
    # .copy() so the per-device frames derived below are not views of the raw frame.
    row_devices_df = ROW_devices_df[ date_mask ].copy()

    # free memory
    del ROW_devices_df;  gc.collect()

    site_mask = all_best_df['site'] == site
    date_mask = all_best_df['date'].between( start_date, end_date )
    site_df = all_best_df[ site_mask & date_mask ]

    # Reset per site so that frames of the previous site can never be reused.
    row_EPA_df = None
    row_AirBoxs_df = None
    row_AirBox_dfs = []
    for device in HANDS_SITES[site]:
        # EPA device
        if "EPA" in device:
            mask = row_devices_df['device_id'] == device.replace("EPA-", "")
            row_EPA_df = row_devices_df[mask].drop( ["Temperature", "Humidity", "PM1"], axis=1 )
            # every other row of the site file is treated as AirBox data
            row_AirBoxs_df = row_devices_df[~mask].copy()
        #AirBox
        else:
            mask = row_devices_df['device_id'] == device
            device_df = row_devices_df[mask].copy()

            # hourly mean per device
            device_df['datetime'] = pd.to_datetime( device_df['datetime'], format="%Y-%m-%d %H:%M:%S" )
            device_df['PM2_5'] = device_df['PM2.5'].astype('float')
            # numeric_only=True: pandas >= 2.0 raises on non-numeric columns
            # instead of silently skipping them.
            device_df = device_df.groupby( ["device_id", "datetime"] ).mean(numeric_only=True)
            level_values = device_df.index.get_level_values
            device_df = (
                device_df
                .groupby([level_values(0), pd.Grouper(freq='1H', level=-1)])
                .mean(numeric_only=True)
                .reset_index()
            )

            row_AirBox_dfs.append( device_df )

    if row_EPA_df is None or row_AirBoxs_df is None:
        raise ValueError( "No EPA device listed in HANDS_SITES for site '{}'".format(site) )

    # AirBoxs mean
    row_AirBoxs_df.index = pd.to_datetime( row_AirBoxs_df['datetime'], format="%Y-%m-%d %H:%M:%S" )
    row_AirBoxs_df['PM2_5'] = row_AirBoxs_df['PM2.5'].astype('float')
    # Only numeric columns can be averaged; selecting them up front behaves the
    # same on old and new pandas and drops the string "datetime" column, so
    # reset_index() can restore "datetime" from the index.
    hourly_AirBoxs_df = row_AirBoxs_df.select_dtypes( include="number" ).resample('1H').mean()
    hourly_AirBoxs_df = hourly_AirBoxs_df.reset_index()
    hourly_AirBoxs_df['date'] = hourly_AirBoxs_df['datetime'].dt.strftime("%Y-%m-%d")

    # calibration: attach each day's coefficient row to that day's hourly means
    hourly_df = pd.merge( hourly_AirBoxs_df, site_df, how='inner', on='date' ).drop(
        ["date_datetime", "Train_R2", "day", "num", "RMSE", "Test_R2"], axis=1 )
    hourly_df['PM25_C'] = (
        hourly_df['PM25'] * hourly_df['PM2_5']
        + hourly_df['Temperature'] * hourly_df['TEM']
        + hourly_df['Humidity'] * hourly_df['HUM']
        + hourly_df["datetime"].dt.hour * hourly_df['HR']
        + hourly_df["intercept"]
    )
    # negative calibrated values are floored at 0
    hourly_df['PM25_C'] = hourly_df['PM25_C'].clip( lower=0 )

    add_trace_site2( site, site_df, row_EPA_df, row_AirBox_dfs, hourly_df[ ['datetime', 'PM25_C'] ] )

# The button layout depends only on sites_name, so it is built once after the loop.
fig_site2.update_layout( updatemenus = add_buttons( sites_name ) )



In [None]:
# Title, full-width date x-axis and range slider in a single layout update.
fig_site2.update_layout(
    title_text="Detail of each site",
    xaxis=dict(
        type="date",
        domain=[0, 1.0],
        rangeslider=dict(visible=True),
    ),
)

fig_site2.show()

# Analysis features

Please set `sites_name`, `start_date`, and `end_date` in the options block below before running.

---
1. Day
2. Feature
3. Method
4. Time Series
5. Num

In [None]:
### replace options ###
sites_name = ALL_SITES
start_date = "2019-02-01"
end_date = "2020-02-29"
#######################

# Best rows restricted to the chosen sites and the inclusive date window.
in_selected_sites = all_best_df['site'].isin(sites_name)
in_date_window = all_best_df['date'].between(start_date, end_date)
site_df = all_best_df[in_selected_sites & in_date_window]


In [None]:
def add_trace_ana( sites_name, site_df, element, KEY ):
    """Count, per site, the rows of `site_df` whose `KEY` column equals `element`.

    Returns a tuple ``(bar, counts)``: `bar` is a `go.Bar` trace with the sites on
    the x axis and the counts on the y axis, named ``str(element)``; `counts` is
    the list of row counts in `sites_name` order.
    """
    matches_element = site_df[KEY] == element

    counts = []
    for name in sites_name:
        rows = site_df[(site_df['site'] == name) & matches_element]
        counts.append(rows.shape[0])

    bar = go.Bar(x=sites_name, y=counts, name=str(element))
    return bar, counts

In [None]:
## DAY
# Stacked bars: for each site, how many rows of site_df carry each "day" value.
fig_ana_day = go.Figure()
day_dict = {}

for day in day_list:
    bar_trace, counts = add_trace_ana( sites_name, site_df, day, "day" )
    day_dict[str(day)] = counts
    fig_ana_day.add_trace( bar_trace )

# Sites ordered by their count for day '7', largest first.
ranked_by_day7 = sorted( zip(day_dict['7'], sites_name), reverse=True )
site_order = [name for _, name in ranked_by_day7]

fig_ana_day.update_layout(
    barmode='stack',
    xaxis={ 'categoryarray': site_order },
    yaxis={ 'categoryarray': day_list, 'categoryorder':'min descending' },
    title_text="Day of each site",
)
fig_ana_day.show()

In [None]:
## Feature
# Stacked bars: for each site, how many rows of site_df carry each "feature" value.
fig_ana_feature = go.Figure()
feature_dict = {}

for feature in feature_list:
    bar_trace, counts = add_trace_ana( sites_name, site_df, feature, "feature" )
    feature_dict[feature] = counts
    fig_ana_feature.add_trace( bar_trace )

# Sites ordered by their count for feature 'ALL', largest first.
ranked_by_all = sorted( zip(feature_dict['ALL'], sites_name), reverse=True )
site_order = [name for _, name in ranked_by_all]

fig_ana_feature.update_layout(
    barmode='stack',
    xaxis={ 'categoryarray': site_order },
    yaxis={ 'categoryarray': feature_list, 'categoryorder':'min descending' },
    title_text="Feature of each site",
)
fig_ana_feature.show()


In [None]:
## Method
# Stacked bars: for each site, how many rows of site_df carry each "method" value.
fig_ana_method = go.Figure()
method_dict = {}

for method in method_list:
    bar_trace, counts = add_trace_ana( sites_name, site_df, method, "method" )
    method_dict[method] = counts
    fig_ana_method.add_trace( bar_trace )

# Sites ordered by their count for method 'BayesianRidge', largest first.
ranked_by_bayes = sorted( zip(method_dict['BayesianRidge'], sites_name), reverse=True )
site_order = [name for _, name in ranked_by_bayes]

fig_ana_method.update_layout(
    barmode='stack',
    xaxis={ 'categoryarray': site_order },
    yaxis={ 'categoryarray': method_list, 'categoryorder':'min descending' },
    title_text="Method of each site",
)
fig_ana_method.show()

### fig_percent

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

### replace options ###
sites_name = YUN_CHIA_NAN_SITES
start_date = "2019-02-01"
end_date = "2020-02-29"
#######################

site_mask = all_best_df['site'].isin(sites_name)
date_mask = all_best_df['date'].between( start_date, end_date )
site_df = all_best_df[ site_mask & date_mask ]

# counting
# NOTE: this rebinds `ana_df`, which earlier cells use for the full results
# table; re-run the notebook from the top before re-running those cells.
#
# Aggregating the single Train_R2 column keeps the result's columns flat
# ("count", "max", "min", "mean"). Aggregating the whole frame with a list of
# functions produces MultiIndex columns, which plotly express does not accept.
ana_df = (
    site_df
    .groupby( ['site', 'day', 'method', 'feature'] )['Train_R2']
    .agg( ['count', 'max', 'min', 'mean'] )
    .reset_index()
)

# Label of each configuration: "<day>_<feature>_<method>".
ana_df[ 'group' ] = ana_df['day'].astype("str").str.cat(ana_df[['feature','method']], sep="_")

In [None]:

# Sunburst of the row counts in ana_df, nested site -> feature -> day -> method,
# showing two levels at a time. (The make_subplots figure that used to be
# assigned to fig_percent first was overwritten immediately and never shown.)
fig_percent = px.sunburst(
    ana_df,
    path=['site', 'feature', 'day', 'method'],
    values='count',
    branchvalues='total',
    maxdepth=2,
)

fig_percent.show()

In [None]:
from plotly.subplots import make_subplots

# One pie per site, two pies per row. ceil(n / 2) rows are enough; the previous
# n // 2 + 1 added an empty row whenever the number of sites was even.
n_cols = 2
n_rows = max(1, (len(sites_name) + n_cols - 1) // n_cols)

# 'domain' is the subplot type required for Pie traces.
Specs = [[{'type': 'domain'} for _ in range(n_cols)] for _ in range(n_rows)]
fig_pie = make_subplots(rows=n_rows, cols=n_cols, specs=Specs, subplot_titles=sites_name)

for position, site in enumerate(sites_name):
    # Fill the grid row by row (subplot rows and columns are 1-based).
    row, col = divmod(position, n_cols)

    # The ten most frequent configurations of this site.
    top_groups = (
        ana_df[ ana_df['site'] == site ]
        .sort_values(['count'], ascending=False)
        .head(10)
    )
    fig_pie.add_trace(
        go.Pie(labels=top_groups['group'], values=top_groups['count'], name=site),
        row + 1, col + 1,
    )

# Percentages inside the slices, thin black outline around every slice.
fig_pie.update_traces(hoverinfo="label+percent+name", textposition='inside', textinfo='percent',
                     marker=dict(line=dict(color='#000000', width=1)))

fig_pie.update_layout(
    title_text='Fig Pie',
    showlegend=False,
    height=300 * n_rows,
)
fig_pie.show()

### fig_num

In [None]:
### replace options ###
sites_name = YUN_CHIA_NAN_SITES
start_date = "2019-02-01"
end_date = "2020-02-29"
#######################

# Coefficient columns of the best rows for the chosen sites and date window.
coefficient_columns = ["HR", "HUM", "PM25", "TEM", "intercept", "site", "date"]
in_selected_sites = all_best_df['site'].isin(sites_name)
in_date_window = all_best_df['date'].between(start_date, end_date)
site_df = all_best_df[in_selected_sites & in_date_window][coefficient_columns]


In [None]:
# Long-format table of absolute coefficient values: one row per
# (site, date, feature) with the columns "value", "site", "date", "feature".
# All pieces are built first and concatenated once, instead of growing a frame
# inside the loop starting from an empty DataFrame.
num_df = pd.concat(
    [
        site_df[ [element, "site", "date"] ]
        .rename(columns={element: "value"})
        .assign(value=lambda piece: piece["value"].abs(), feature=element)
        for element in ["HR", "HUM", "PM25", "TEM"]
    ],
    axis=0,
)

In [None]:

# Scatter of the absolute coefficient values over time, coloured by feature,
# with one animation frame per site.
fig_num = px.scatter(
    num_df,
    x="date",
    y="value",
    color="feature",
    hover_name="feature",
    animation_frame="site",
)

fig_num.show()