In [None]:
# Parameters (Injected by Papermill)
# The empty list below is only a placeholder default; the real log records are
# expected to be supplied at execution time. Later cells read each record's
# "log_metrics" and "cost_metrics" entries.
# NOTE(review): assumes this cell carries the `parameters` tag so Papermill
# can override it - confirm in the notebook metadata.
logs_data = []  # Papermill will inject this


In [None]:
from datetime import datetime, timezone
import IPython.display as ipython

# Resolve the pipeline name shown in the report title. Only the first log
# record is consulted; an empty `logs_data`, or a first record without a
# "Pipeline Name" entry, falls back to "Unknown Pipeline".
pipeline_name = (
    logs_data[0]["log_metrics"].get("Pipeline Name", "Unknown Pipeline")
    if logs_data else "Unknown Pipeline"
)

# Generate report timestamp. The clock is read in UTC, so the rendered value
# carries an explicit "UTC" suffix; without it readers cannot tell which
# timezone the wall-clock time refers to.
report_timestamp = datetime.now(timezone.utc).strftime(
    "%Y-%m-%d %H:%M:%S UTC",
)

# Display title and metadata, one Markdown output per line, in this order.
report_header_lines = (
    f"# PySpark Pipeline Log Report for {pipeline_name}",
    f"**Report Generated On:** {report_timestamp}",
    "This report analyses PySpark pipeline logs, focusing on cost over time.",
)
for markdown_line in report_header_lines:
    ipython.display(ipython.Markdown(markdown_line))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Fail fast when no log records were supplied: every later cell needs at
# least one record to build the cost table and the plot.
if not logs_data:
    empty_input_message = (
        "logs_data is empty. "
        "Ensure Papermill is passing correct data."
    )
    raise ValueError(empty_input_message)


## Data Processing

This section extracts timestamps and monetary costs from the PySpark logs.

In [None]:
# Convert logs data to a DataFrame (one row per log record). The raw frame
# keeps its own name so that `df` only ever refers to the processed table.
logs_df = pd.DataFrame(logs_data)

# Build the processed table from the two nested fields the report needs.
# A record missing "Timestamp" or the "costs" -> "pipeline_cost" path raises
# KeyError here rather than producing a silent gap in the plot.
# (No per-row pipeline-name column is derived: it was dropped again before
# any use, and the title cells read the name from `logs_data` directly.)
df = pd.DataFrame(
    {
        # Timestamps are interpreted as milliseconds since the Unix epoch.
        "timestamp": pd.to_datetime(
            logs_df["log_metrics"].apply(lambda metrics: metrics["Timestamp"]),
            unit="ms",
        ),
        "pipeline_cost": logs_df["cost_metrics"].apply(
            lambda metrics: metrics["costs"]["pipeline_cost"],
        ),
    },
)

# Display processed DataFrame
df

## Cost Over Time

This section visualises the pipeline cost over time using a scatter plot, with one point per log record.

In [None]:
# Resolve the pipeline name for the chart title from the first record. The
# fallback is "Unknown Pipeline", the same label the report header uses, so
# the title and the chart always refer to the run by the same name.
pipeline_name = (
    logs_data[0]["log_metrics"].get("Pipeline Name", "Unknown Pipeline")
    if logs_data else "Unknown Pipeline"
)

# Sort values so the rows are in chronological order
df = df.sort_values(by="timestamp")

# Scatter plot for better visualization of discrete points
ax = df.plot(
    x="timestamp",
    y="pipeline_cost",
    kind="scatter",
    figsize=(10, 5),
    color="b",
)

# Configure the chart through the Axes returned by pandas instead of the
# pyplot "current axes" state, so the settings are tied to this figure.
ax.set_title(f"Monetary Cost Over Time - {pipeline_name}")
ax.set_xlabel("Timestamp")
ax.set_ylabel("Cost ($)")
ax.grid(True)
plt.show()