# Setup


In [None]:
%reload_ext autoreload
%autoreload 2

from timeseries import ukf_rts_smooth, process_timeseries, convert_pi4_to_pi3_co2, get_corrected_data
from plotting import (
    export_fig,
    plot_co2,
    get_co2_axes,
)
from project import PRESENTATION_MEDIA_DIR, DATA_DIR, CO2_DATABASE_PATH
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from database import create_timeseries_data_table, delete_timeseries_data_table, get_co2_data_by_date
from natsort import natsorted
import seaborn as sns

In [None]:
# Reset step for the processing cells below.
# NOTE(review): presumably removes the stored timeseries table and its rows —
# confirm in `database` before re-running this cell.
delete_timeseries_data_table()

In [None]:
# NOTE(review): presumably creates the (empty) timeseries table that the
# process_timeseries calls below write into — confirm in `database`.
create_timeseries_data_table()

# Process Timeseries


In [None]:
# Quick visual check: overlay the "Gas" column of every CSV in the
# 2024-09-26/PI4 folder on one figure, printing each path as it is plotted.
data_dir = DATA_DIR / "2024-09-26/PI4"

# natsorted orders the file names naturally (numeric parts compared as numbers).
# NOTE: `df` stays bound to the last file read once the loop finishes.
for file_path in natsorted(data_dir.glob("*.csv")):
    df = pd.read_csv(file_path)
    # Disabled filter: would keep only files whose CO2 standard deviation is below 50.
    # if df["CO2"].std() < 50:
    print(file_path)
    plt.plot(df["Gas"])
    # Disabled: fixed y-range for the overlay.
    # plt.ylim(400, 600)

plt.show()

## 2024-09-26


In [None]:
# Process the PI4 recordings of 2024-09-26.
date = "2024-09-26"
pi = "PI4"

# Keyword overrides per recording, keyed by the recording's time label.
# An empty dict means the recording gets no overrides.
overrides_by_time = {
    "1222": {"x0": 530, "x_start": 50},
    "1242": {"x_start": 50, "x_end": 250},
    "1303": {"R": 5000},
    "1324": {},
    "1338": {"x_start": 75},
    "1353": {"R": 2000},
    "1410": {},
    "1425": {},
    "1439": {},
    "1458": {},
    "1521": {"x_end": 170},
    "1548": {},
}

# Every recording is processed with show=False, appended after its overrides.
parameters = {
    (date, time_label): {**overrides, "show": False}
    for time_label, overrides in overrides_by_time.items()
}

# NOTE(review): no default_R is passed in this call — confirm that the
# function's own default is the intended value for PI4 on this date.
process_timeseries(date, pi, parameters)

In [None]:
# Process the PI3 recordings of 2024-09-26.
date = "2024-09-26"
pi = "PI3"

# Keyword overrides per recording, keyed by the recording's time label.
# An empty dict means the recording gets no overrides.
overrides_by_time = {
    "1222": {"x0": 400, "x_start": 50},
    "1243": {},
    "1302": {},
    "1324": {},
    "1338": {"x0": 480},
    "1353": {},
    "1410": {},
    "1426": {},
    "1439": {},
    "1458": {},
    "1521": {},
    "1548": {"x0": 465},
}

# Every recording is processed with show=False, appended after its overrides.
parameters = {
    (date, time_label): {**overrides, "show": False}
    for time_label, overrides in overrides_by_time.items()
}
process_timeseries(date, pi, parameters, default_R=200)

## 2024-10-18


In [None]:
# Process both devices' recordings of 2024-10-18.
date = "2024-10-18"

# --- PI3 ---
pi = "PI3"
# Keyword overrides per recording, keyed by the recording's time label.
overrides_by_time = {
    "1801": {"x0": 500, "x_start": 50},
    "1825": {"x0": 455, "x_start": 50},
    "1835": {},
    "1847": {},
    "1902": {},
    "1913": {"x0": 490},
    "1925": {},
}
# Every recording is processed with show=False, appended after its overrides.
parameters = {
    (date, time_label): {**overrides, "show": False}
    for time_label, overrides in overrides_by_time.items()
}
process_timeseries(date, pi, parameters, default_R=200)

# --- PI4 ---
pi = "PI4"
overrides_by_time = {
    "1806": {},
    "1828": {},
    "1847": {"x0": 500, "x_start": 50, "R": 2000},
    "1907": {},
    "1924": {},
}
parameters = {
    (date, time_label): {**overrides, "show": False}
    for time_label, overrides in overrides_by_time.items()
}
process_timeseries(date, pi, parameters, default_R=1000)

## 2024-10-22


In [None]:
# Process both devices' recordings of 2024-10-22.
date = "2024-10-22"

# --- PI3: no per-recording overrides, only show=False ---
pi = "PI3"
pi3_time_labels = ("1807", "1819", "1834", "1848", "1901", "1915")
parameters = {(date, time_label): {"show": False} for time_label in pi3_time_labels}
process_timeseries(date, pi, parameters, default_R=200)

# --- PI4 ---
pi = "PI4"
# Keyword overrides per recording, keyed by the recording's time label.
overrides_by_time = {
    "1745": {},
    "1750": {"R": 2000},
    "1801": {"R": 2000},
    "1809": {},
    "1822": {},
    "1832": {},
}
# Every recording is processed with show=False, appended after its overrides.
parameters = {
    (date, time_label): {**overrides, "show": False}
    for time_label, overrides in overrides_by_time.items()
}
process_timeseries(date, pi, parameters, default_R=1000)

## 2024-10-24


In [None]:
# Process both devices' recordings of 2024-10-24.
# Neither device needs per-recording overrides on this date: every entry is
# just {"show": False}.
date = "2024-10-24"

# --- PI3 ---
pi = "PI3"
pi3_time_labels = ("0859", "0920", "0940", "0952", "1011", "1029")
parameters = {(date, time_label): {"show": False} for time_label in pi3_time_labels}
process_timeseries(date, pi, parameters, default_R=200)

# --- PI4 ---
pi = "PI4"
pi4_time_labels = ("0836", "0851", "0901", "0910", "0921", "0932")
parameters = {(date, time_label): {"show": False} for time_label in pi4_time_labels}
process_timeseries(date, pi, parameters, default_R=1000)

# Pi Calibration


In [None]:
import pandas as pd
from scipy.stats import linregress
import numpy as np
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression
from matplotlib.axes import Axes


def get_calibration_axes(ax: Axes) -> Axes:
    """Apply the shared labels and fixed ranges of the calibration figures.

    Args:
        ax: Axes to format in place.

    Returns:
        The same ``ax`` object, so the call can be used in an assignment.
    """
    x_range = (460, 510)
    y_range = (-65, 20)

    ax.set_xlim(*x_range)
    ax.set_ylim(*y_range)
    ax.set_ylabel("Deviation between devices (ppm)")
    ax.set_xlabel("Mean CO2 concentration across devices (ppm)")
    return ax


# Pi calibration: pair PI3 and PI4 recordings taken at (nearly) the same time on
# 2024-09-26, plot the deviation between the devices against their mean, and fit
# a line to it (ordinary least squares first, then RANSAC).
#
# NOTE(review): `df` is not loaded in this cell — it is whatever an earlier cell
# left bound, and it must provide the columns used below ("date", "time", "pi",
# "co2_mean", "co2_std"). `get_co2_data_by_date` is imported at the top of the
# notebook but never called; presumably a load step belongs here — confirm.

# Select data for 2024-09-26.
# .copy() makes the selection an independent frame, so adding the "datetime"
# column below does not write into a slice of the original frame (this is what
# triggers pandas' SettingWithCopyWarning).
df = df[df["date"] == "2024-09-26"].copy()

# Convert time to datetime for easier manipulation
df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"], format="%Y-%m-%d %H%M")

# Separate data for PI3 and PI4
df_pi3 = df[df["pi"] == "PI3"]
df_pi4 = df[df["pi"] == "PI4"]

# Pair every PI3 row with the PI4 row nearest in time, at most 1 minute apart.
# Columns present in both frames get the "_pi3" / "_pi4" suffixes.
merged_df = pd.merge_asof(
    df_pi3.sort_values("datetime"),
    df_pi4.sort_values("datetime"),
    on="datetime",
    direction="nearest",
    tolerance=pd.Timedelta("1 minute"),
    suffixes=("_pi3", "_pi4"),
)

# Drop rows containing any missing value; this includes PI3 rows that found no
# PI4 match within 1 minute.
merged_df.dropna(inplace=True)
# Additionally exclude the row labelled 0. This raises KeyError if dropna has
# already removed that row.
# NOTE(review): the reason for excluding this recording is not stated — confirm.
merged_df = merged_df.drop(0)

mean_across_devices = (merged_df["co2_mean_pi3"] + merged_df["co2_mean_pi4"]) / 2
deviation_between_devices = merged_df["co2_mean_pi3"] - merged_df["co2_mean_pi4"]
# Error bars: x uses the average of the two standard deviations, y uses their
# plain (linear) sum.
xerr = (merged_df["co2_std_pi3"] + merged_df["co2_std_pi4"]) / 2
yerr = merged_df["co2_std_pi3"] + merged_df["co2_std_pi4"]

# --- Figure 1: raw points, exported before and after adding the naive fit ---
fig, ax = plt.subplots()

ax.errorbar(
    mean_across_devices,
    deviation_between_devices,
    xerr=xerr,
    yerr=yerr,
    fmt="o",
    label="Data",
)
ax = get_calibration_axes(ax)
export_fig(fig, PRESENTATION_MEDIA_DIR / "pi_calibration_points.svg")

# Linear regression
slope, intercept, r_value, p_value, std_err = linregress(
    mean_across_devices,
    deviation_between_devices,
)

# Evaluation grid for the fitted lines.
x = np.linspace(460, 510, 100)
ax.plot(
    x,
    slope * x + intercept,
    color="k",
    label="Best fit",
    linestyle="--",
)
ax.legend(loc="lower right")
export_fig(fig, PRESENTATION_MEDIA_DIR / "pi_calibration_naive_fit.svg")
plt.show()

# Prepare data for RANSAC (scikit-learn expects a 2-D feature matrix)
X = mean_across_devices.values.reshape(-1, 1)
y = deviation_between_devices.values

# Apply RANSAC for robust linear regression.
# random_state pins RANSAC's random subset sampling so that the inlier/outlier
# split, the exported figures and the printed slope/intercept are reproducible
# from one run of this cell to the next.
ransac = RANSACRegressor(
    LinearRegression(),
    min_samples=5,
    residual_threshold=8,
    random_state=0,
)
ransac.fit(X, y)
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# --- Figure 2: inliers vs outliers, exported before and after the RANSAC line ---
fig, ax = plt.subplots()
ax.errorbar(
    mean_across_devices[inlier_mask],
    deviation_between_devices[inlier_mask],
    xerr=xerr[inlier_mask],
    yerr=yerr[inlier_mask],
    fmt="o",
    label="Inliers",
    color="C2",
)
ax.errorbar(
    mean_across_devices[outlier_mask],
    deviation_between_devices[outlier_mask],
    xerr=xerr[outlier_mask],
    yerr=yerr[outlier_mask],
    fmt="o",
    label="Outliers",
    color="C1",
)
ax = get_calibration_axes(ax)
export_fig(fig, PRESENTATION_MEDIA_DIR / "pi_calibration_inlier_outliers.svg")

# Plot the RANSAC regression line
line_y_ransac = ransac.predict(x.reshape(-1, 1))
ax.plot(x, line_y_ransac, color="k", label="RANSAC regression line", linestyle="--")
ax.legend()
export_fig(fig, PRESENTATION_MEDIA_DIR / "pi_calibration_ransac_fit.svg")
plt.show()

# Overwrite the least-squares slope/intercept with the RANSAC estimate and report it.
slope = ransac.estimator_.coef_[0]
intercept = ransac.estimator_.intercept_
print(f"Slope: {slope}, Intercept: {intercept}")

In [None]:
from IPython.display import display, HTML

# Render the paired PI3/PI4 calibration rows as an HTML table without the index.
selected_columns = [
    "date_pi3",
    "time_pi3",
    "co2_mean_pi3",
    "co2_std_pi3",
    "co2_mean_pi4",
    "co2_std_pi4",
]

# The PI3 date/time columns serve as the row's date and time in the table.
display_df = merged_df[selected_columns].rename(
    columns={"date_pi3": "date", "time_pi3": "time"}
)

html_str = display_df.to_html(index=False)
display(HTML(html_str))

In [None]:
# Check of the PI4 -> PI3 conversion: converted PI4 means plotted against the
# PI3 means, together with the line y = x.
pi3_means = merged_df["co2_mean_pi3"]
pi4_converted = convert_pi4_to_pi3_co2(merged_df["co2_mean_pi4"])

# NOTE(review): yerr is the PI4 standard deviation as stored, not passed
# through the conversion — confirm that this is intended.
plt.errorbar(
    pi3_means,
    pi4_converted,
    xerr=merged_df["co2_std_pi3"],
    yerr=merged_df["co2_std_pi4"],
    fmt="o",
)

# Identity line spanning the range of the PI3 means.
identity_values = sorted(pi3_means)
plt.plot(identity_values, identity_values)

plt.gca().set_aspect("equal", adjustable="box")
plt.xlim(400, 540)
plt.ylim(400, 540)

# Get All Data


In [None]:
# Box plot of the corrected mean CO2 values grouped by date, exported for the
# presentation.
df_corrected = get_corrected_data()
boxplot = df_corrected.boxplot(by="date", column="co2_mean", grid=False)

# Blank both the axes title and the figure suptitle.
plt.suptitle("")
plt.title("")
plt.ylabel("CO2 concentration (ppm)")

current_figure = plt.gcf()
export_fig(current_figure, PRESENTATION_MEDIA_DIR / "boxplot_mean_co2_by_date.svg")

In [None]:
# Violin plot of the corrected mean CO2 values per date with the individual
# points overlaid, exported for the presentation.
# Relies on `df_corrected` from the box-plot cell above.

# One colour per date. Passing the full default palette list makes seaborn
# warn that the palette has more values than needed, so request exactly as
# many colours as there are hue levels (the same leading colours are used).
date_palette = sns.color_palette(n_colors=df_corrected["date"].nunique())

sns.violinplot(
    data=df_corrected,
    x="date",
    y="co2_mean",
    density_norm="count",
    palette=date_palette,
    hue="date",
    legend=False,
)
sns.stripplot(data=df_corrected, x="date", y="co2_mean", color="black", alpha=0.5)
plt.ylabel("CO2 concentration (ppm)")
export_fig(plt.gcf(), PRESENTATION_MEDIA_DIR / "violinplot_mean_co2_by_date.svg")

# Correlation Tests


In [None]:
from scipy.stats import kendalltau


def check_correlation(variable_str: str, units: str, x_lim: tuple = None):
    """Scatter mean CO2 against another variable and print Kendall's tau.

    Loads the corrected data, plots ``<variable_str>_mean`` on x against
    ``co2_mean`` on y, exports the figure as ``<variable_str>_vs_co2.svg``
    in ``PRESENTATION_MEDIA_DIR`` and prints Kendall's tau with its p-value.

    Args:
        variable_str: Column prefix; the column ``<variable_str>_mean`` is
            read, and the capitalized prefix is used in the x-axis label.
        units: Unit text shown in parentheses in the x-axis label.
        x_lim: Passed unchanged to ``ax.set_xlim``.
    """
    data = get_corrected_data()

    fig, ax = plt.subplots()
    x_values = data[f"{variable_str}_mean"]
    y_values = data["co2_mean"]

    ax.scatter(x_values, y_values)
    x_label = f"{variable_str.capitalize()} ({units})"
    ax.set_xlabel(x_label)
    ax.set_ylabel("CO2 concentration (ppm)")
    ax.set_xlim(x_lim)

    output_path = PRESENTATION_MEDIA_DIR / f"{variable_str}_vs_co2.svg"
    export_fig(fig, output_path)

    # Rank correlation between the chosen variable and mean CO2.
    tau, tau_p_value = kendalltau(x_values, y_values)
    print(f"Kendall's tau: {tau: .3f}, p-value: {tau_p_value: .3f}")

In [None]:
# CO2 vs pressure: reads the "pressure_mean" column, x-axis limited to 1006–1013.
check_correlation("pressure", "mbar", (1006, 1013))

In [None]:
# CO2 vs temperature: reads the "temperature_mean" column; no x-limits are passed.
# The unit string is the literal text `$\degree C$` (the backslash is escaped).
check_correlation("temperature", "$\\degree C$")

In [None]:
# CO2 vs humidity: reads the "humidity_mean" column; no x-limits are passed.
check_correlation("humidity", "%")

# For Presentation


## Points and Lines


In [None]:
# Presentation figures: one recording shown as points, then with a connecting
# line added, followed by a second raw recording drawn via plot_co2.

# --- Recording 1: scatter first, then add the line on the same axes ---
points_csv = Path(
    "data/co2_readings/processed/2024-09-26/PI4/2024-09-26 14_39_46.146124.csv"
)
df = pd.read_csv(points_csv)
t, co2 = df["Time"], df["CO2"]

fig, ax = plt.subplots()
ax.scatter(t, co2, s=1)
ax = get_co2_axes(ax)
export_fig(fig, PRESENTATION_MEDIA_DIR / "co2_points.svg")

# The line is drawn on top of the points, so "co2_line.svg" shows both.
ax.plot(t, co2)
export_fig(fig, PRESENTATION_MEDIA_DIR / "co2_line.svg")

# --- Recording 2: single raw trace without a legend ---
raw_csv = Path(
    "data/co2_readings/processed/2024-10-18/PI3/2024-10-18 18_35_31.804712.csv"
)
df = pd.read_csv(raw_csv)
t, co2 = df["Time"], df["CO2"]
fig, ax = plot_co2([t], [co2], [{}], plot_legend=False)
export_fig(fig, PRESENTATION_MEDIA_DIR / "co2_raw_2.svg")

## Good and Bad Data


In [None]:
# Presentation figure: two recordings overlaid on one axes, labelled "Bad" and
# "Good".
fig, ax = plt.subplots()

# (CSV source, legend label) for each trace, in plotting order.
traces = (
    (
        Path(
            "data/co2_readings/processed/2024-09-26/PI4/2024-09-26 14_39_46.146124.csv"
        ),
        "Bad",
    ),
    (
        "data/co2_readings/processed/2024-09-26/PI3/2024-09-26 14_58_06.432300.csv",
        "Good",
    ),
)
for csv_source, trace_label in traces:
    df = pd.read_csv(csv_source)
    t = df["Time"]
    co2 = df["CO2"]
    ax.plot(t, co2, label=trace_label)

ax.legend()
# NOTE(review): these limits look copied from an `ax.get_xlim()` output of
# another figure so that both share the same x-range — confirm.
ax.set_xlim((-11.181157338619231, 250.04458330869673))
ax = get_co2_axes(ax)
export_fig(fig, PRESENTATION_MEDIA_DIR / "co2_good_bad.svg")

In [None]:
# Presentation figure: raw vs smoothed CO2 for one PI4 recording of 2024-09-26.
recording_csv = Path(
    "data/co2_readings/processed/2024-09-26/PI4/2024-09-26 14_39_46.146124.csv"
)
df = pd.read_csv(recording_csv)
t, co2 = df["Time"], df["CO2"]

co2_smoothed = ukf_rts_smooth(t, co2, x0=500)

# Both traces share the same time axis.
trace_times = [t, t]
trace_values = [co2, co2_smoothed]
trace_styles = [{"label": "Raw"}, {"label": "Smoothed"}]
fig, ax = plot_co2(trace_times, trace_values, trace_styles)
export_fig(fig, PRESENTATION_MEDIA_DIR / "co2_smoothed_1.svg")

# Left as the last expression so the notebook shows the resulting x-limits.
ax.get_xlim()

In [None]:
# Presentation figure: raw vs smoothed CO2 for one PI3 recording of 2024-10-18.
recording_csv = Path(
    "data/co2_readings/processed/2024-10-18/PI3/2024-10-18 18_35_31.804712.csv"
)
df = pd.read_csv(recording_csv)
t, co2 = df["Time"], df["CO2"]

co2_smoothed = ukf_rts_smooth(t, co2, R=100)

# Both traces share the same time axis.
trace_times = [t, t]
trace_values = [co2, co2_smoothed]
trace_styles = [{"label": "Raw"}, {"label": "Smoothed"}]
fig, ax = plot_co2(trace_times, trace_values, trace_styles)
export_fig(fig, PRESENTATION_MEDIA_DIR / "co2_smoothed_2.svg")