### Step 1: Read the data from the table and generate a DataFrame

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window
import matplotlib.pyplot as plt

# Fully qualified name of the articles table that this notebook analyses.
TABLE = "playplus_dev_bronze.pdp.articles_v2"

# Load the table lazily as a Spark DataFrame through the session's reader.
df = spark.read.table(TABLE)

Show the table to get an overview.

In [0]:
# Render the DataFrame in the notebook output for a quick visual overview.
display(df)

In [0]:
# List the DataFrame's column names (the bare expression is the cell output).
df.columns

In [0]:
# Fetch at most one article with the given id and print each of its fields
# on a separate line (followed by a blank line) for easy inspection.
row = df.where(F.col("id") == "urn:pdp:cms_swi:article:73104714").take(1)
if not row:
    print("No entry found with the specified id.")
else:
    for k, v in row[0].asDict().items():
        print(f"{k}: {v}\n")

In [0]:
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import pandas as pd

# Derive the month (yyyy-MM) from releaseDate and count articles per month.
# Rows without a releaseDate are excluded from the aggregation.
has_release_date = F.col("releaseDate").isNotNull()
per_month_sdf = (
    df.where(has_release_date)
      .withColumn("month", F.date_format(F.col("releaseDate"), "yyyy-MM"))
      .groupBy("month")
      .count()
      .orderBy("month")
)

# Bring the aggregate into pandas, convert the month key to real timestamps
# (unparseable values become NaT) and sort so the x-axis is a true time axis.
per_month = (
    per_month_sdf.toPandas()
    .assign(month=lambda pdf: pd.to_datetime(pdf["month"], errors="coerce"))
    .sort_values("month")
)

# Line chart: articles per month.
fig, ax = plt.subplots()
ax.plot(per_month["month"], per_month["count"])
# Title is currently disabled:
# ax.set_title("Articles over Time (per month)")
ax.set_xlabel("Month")
ax.set_ylabel("Articles")
plt.xticks(rotation=45, ha="right")
fig.tight_layout()
plt.show()
