In [18]:
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
from ipywidgets import SelectionRangeSlider, HBox, VBox, Label, Layout
import numpy as np

df = pd.read_csv('concatenated_data.csv')
df['date'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])

station_names = {
    1: 'Aotizhongxin',
    2: 'Changping',
    3: 'Dingling',
    4: 'Dongsi',
    5: 'Guanyuan',
    6: 'Gucheng',
    7: 'Huairou',
    8: 'Nongzhanguan',
    9: 'Shunyi',
    10: 'Tiantan',
    11: 'Wanliu',
    12: 'Wanshouxigong',
}

station_dropdown = widgets.SelectMultiple(
    options=[(station_names[i], i) for i in range(1, 13)],
    value=[1,2],
    description='Station:'
)

attribute_multiselect = widgets.SelectMultiple(
    options=['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM','wd'],
    value=['PM2.5', 'PM10', 'SO2'],
    description='Attributes:'
)

date_range_slider = widgets.SelectionRangeSlider(
    options=pd.date_range(df['date'].min(), df['date'].max(), freq='D'),
    index=(0, len(pd.date_range(df['date'].min(), df['date'].max(), freq='D')) - 1),
    description='Date Range:',
    continuous_update=False,
    layout={'width': '600px'},
    readout=False
)
date_range_slider.style.handle_width = '50px'
date_range_slider.style.readout_width = '500px'

selected_info = Label(
    value=f'Selected Range: {date_range_slider.value[0]} - {date_range_slider.value[1]}',
    layout=Layout(margin='0 0 0 0px', width='500px')
)

def update_info(change):
    selected_info.value = f'Selected Range: {date_range_slider.value[0]} - {date_range_slider.value[1]}'

date_range_slider.observe(update_info, 'value')
hbox = HBox([date_range_slider, selected_info])
vbox = VBox([hbox])

def create_raw_time_series_plot(stations, attributes, date_range):
    clear_output(wait=True)
    start_date, end_date = date_range
    
    num_attributes = len(attributes)
    num_stations = len(stations)
    fig, axes = plt.subplots(num_attributes, 1, figsize=(15, num_attributes * 3), sharex=True)
    
    if num_attributes == 1:
        axes = [axes]
    
    colors = plt.cm.rainbow(np.linspace(0, 1, num_stations))
    
    for i, attribute in enumerate(attributes):
        for j, station in enumerate(stations):
            filtered_df = df[(df['station'] == station) & (df['date'].between(start_date, end_date))]
            axes[i].plot(filtered_df['date'], filtered_df[attribute], label=f'Station {station_names[station]}', color=colors[j])
        axes[i].set_title(attribute)
        axes[i].legend(loc='upper left')

    
    plt.suptitle("Time Series Graph of Selected Attributes and Stations", y=1.02)
    plt.tight_layout()
    plt.show()



output = widgets.interactive_output(
    create_raw_time_series_plot,
    {
        'stations': station_dropdown,
        'attributes': attribute_multiselect,
        'date_range': date_range_slider,
    }
)


display(station_dropdown, attribute_multiselect, vbox, output)


SelectMultiple(description='Station:', index=(0, 1), options=(('Aotizhongxin', 1), ('Changping', 2), ('Dinglin…

SelectMultiple(description='Attributes:', index=(0, 1, 2), options=('PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3',…

VBox(children=(HBox(children=(SelectionRangeSlider(continuous_update=False, description='Date Range:', index=(…

Output()

In [16]:

'''
def create_raw_time_series_plot(station, attributes, date_range, window_size=7):
    clear_output(wait=True)
    start_date, end_date = date_range
    filtered_df = df[(df['station'] == station) & (df['date'].between(start_date, end_date))].copy()

    fig, ax = plt.subplots(figsize=(12, 6))

    # Scale each attribute to a fixed range and apply rolling average
    for i, attribute in enumerate(attributes):
        attribute_values = filtered_df[attribute]
        attribute_min = attribute_values.min()
        attribute_max = attribute_values.max()
        attribute_range = attribute_max - attribute_min
        scaled_attribute = (attribute_values - attribute_min) / attribute_range * 0.8 + 0.1 * (i + 1)
        smoothed_attribute = scaled_attribute.rolling(window_size, min_periods=1).mean()
        ax.plot(filtered_df['date'], smoothed_attribute, label=attribute)

    ax.set(xlabel='Date', ylabel='Value')
    ax.legend()
    plt.show()
'''

"\ndef create_raw_time_series_plot(station, attributes, date_range, window_size=7):\n    clear_output(wait=True)\n    start_date, end_date = date_range\n    filtered_df = df[(df['station'] == station) & (df['date'].between(start_date, end_date))].copy()\n\n    fig, ax = plt.subplots(figsize=(12, 6))\n\n    # Scale each attribute to a fixed range and apply rolling average\n    for i, attribute in enumerate(attributes):\n        attribute_values = filtered_df[attribute]\n        attribute_min = attribute_values.min()\n        attribute_max = attribute_values.max()\n        attribute_range = attribute_max - attribute_min\n        scaled_attribute = (attribute_values - attribute_min) / attribute_range * 0.8 + 0.1 * (i + 1)\n        smoothed_attribute = scaled_attribute.rolling(window_size, min_periods=1).mean()\n        ax.plot(filtered_df['date'], smoothed_attribute, label=attribute)\n\n    ax.set(xlabel='Date', ylabel='Value')\n    ax.legend()\n    plt.show()\n"

In [8]:
def create_raw_time_series_plot(station, attributes, date_range):
    clear_output(wait=True)
    start_date, end_date = date_range
    filtered_df = df[(df['station'] == station) & (df['date'].between(start_date, end_date))]

    num_attributes = len(attributes)
    fig, axes = plt.subplots(num_attributes, 1, figsize=(15, num_attributes * 3), sharex=True)
    
    if num_attributes == 1:
        axes = [axes]
    
    for i, attribute in enumerate(attributes):
        axes[i].plot(filtered_df['date'], filtered_df[attribute], label=f'Station {station}')
        axes[i].set_title(attribute)
    
    plt.suptitle("Time Series Graph of Selected Attributes and Station", y=1.02)
    plt.tight_layout()
    plt.show()

In [9]:
def create_raw_time_series_plot(stations, attributes, date_range):
    clear_output(wait=True)
    start_date, end_date = date_range
    
    num_attributes = len(attributes)
    num_stations = len(stations)
    fig, axes = plt.subplots(num_attributes, 1, figsize=(15, num_attributes * 3), sharex=True)
    
    if num_attributes == 1:
        axes = [axes]
    
    colors = plt.cm.rainbow(np.linspace(0, 1, num_stations))
    
    for i, attribute in enumerate(attributes):
        for j, station in enumerate(stations):
            filtered_df = df[(df['station'] == station) & (df['date'].between(start_date, end_date))]
            axes[i].plot(filtered_df['date'], filtered_df[attribute], label=f'Station {station}', color=colors[j])
        axes[i].set_title(attribute)
    
    plt.suptitle("Time Series Graph of Selected Attributes and Stations", y=1.02)
    plt.tight_layout()
    plt.show()


### Trend Analysis and detect anomaly 

In [32]:
def detect_anomalies_iqr(data, factor=2):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - (factor-1.8) * IQR
    upper_bound = Q3 + factor * IQR
    anomalies = (data < lower_bound) | (data > upper_bound)
    return anomalies


In [33]:
def create_raw_time_series_plot(station, attributes, date_range, window_size=7):
    clear_output(wait=True)
    start_date, end_date = date_range
    filtered_df = df[(df['station'] == station) & (df['date'].between(start_date, end_date))].copy()

    fig, ax = plt.subplots(figsize=(12, 6))

    # Scale each attribute to a fixed range and apply rolling average
    for i, attribute in enumerate(attributes):
        attribute_values = filtered_df[attribute]
        attribute_min = attribute_values.min()
        attribute_max = attribute_values.max()
        attribute_range = attribute_max - attribute_min
        scaled_attribute = (attribute_values - attribute_min) / attribute_range * 0.8 + 0.1 * (i + 1)
        smoothed_attribute = scaled_attribute.rolling(window_size, min_periods=1).mean()
        ax.plot(filtered_df['date'], smoothed_attribute, label=attribute)
        
        # Detect and plot anomalies using the IQR method
        anomalies = detect_anomalies_iqr(attribute_values)
        anomaly_dates = filtered_df.loc[anomalies, 'date']
        anomaly_values = smoothed_attribute.loc[anomalies]
        ax.scatter(anomaly_dates, anomaly_values, marker='o', edgecolors='red', facecolors='none', label=f'{attribute} Anomaly')

    ax.set(xlabel='Date', ylabel='Value')
    ax.legend()
    plt.show()


In [10]:
output = widgets.interactive_output(
    

    create_raw_time_series_plot,
    {
        'station': station_dropdown,
        'attributes': attribute_multiselect,
        'date_range': date_range_slider,
    }
)

display(station_dropdown, attribute_multiselect, vbox, output)

SelectMultiple(description='Station:', index=(0, 1), options=(('Aotizhongxin', 1), ('Changping', 2), ('Dinglin…

SelectMultiple(description='Attributes:', index=(0,), options=('PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEM…

VBox(children=(HBox(children=(SelectionRangeSlider(continuous_update=False, description='Date Range:', index=(…

Output()

In [35]:
from scipy import stats

def detect_anomalies_z_score(data, threshold=2):
    z_scores = np.abs(stats.zscore(data))
    anomalies = z_scores > threshold
    return anomalies


In [36]:
def create_raw_time_series_plot(station, attributes, date_range, window_size=7):
    clear_output(wait=True)
    start_date, end_date = date_range
    filtered_df = df[(df['station'] == station) & (df['date'].between(start_date, end_date))].copy()

    fig, ax = plt.subplots(figsize=(12, 6))

    # Scale each attribute to a fixed range and apply rolling average
    for i, attribute in enumerate(attributes):
        attribute_values = filtered_df[attribute]
        attribute_min = attribute_values.min()
        attribute_max = attribute_values.max()
        attribute_range = attribute_max - attribute_min
        scaled_attribute = (attribute_values - attribute_min) / attribute_range * 0.8 + 0.1 * (i + 1)
        smoothed_attribute = scaled_attribute.rolling(window_size, min_periods=1).mean()
        ax.plot(filtered_df['date'], smoothed_attribute, label=attribute)
        
        # Detect and plot anomalies using the IQR method
        anomalies = detect_anomalies_iqr(attribute_values)
        anomaly_dates = filtered_df.loc[anomalies, 'date']
        anomaly_values = smoothed_attribute.loc[anomalies]
        ax.scatter(anomaly_dates, anomaly_values, marker='o', edgecolors='red', facecolors='none', label=f'{attribute} Anomaly')

    ax.set(xlabel='Date', ylabel='Value')
    ax.legend()
    plt.show()

In [37]:
output = widgets.interactive_output(
    create_raw_time_series_plot,
    {
        'station': station_dropdown,
        'attributes': attribute_multiselect,
        'date_range': date_range_slider,
    }
)

display(station_dropdown, attribute_multiselect, vbox, output)

Dropdown(description='Station:', options=(('Aotizhongxin', 1), ('Changping', 2), ('Dingling', 3), ('Dongsi', 4…

SelectMultiple(description='Attributes:', index=(0, 4), options=('PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'T…

VBox(children=(HBox(children=(SelectionRangeSlider(continuous_update=False, description='Date Range:', index=(…

Output()