# PDP Articles v2 (Silver) – Clean Join/Filter Blueprint

**Structure**
- [A] Config & Imports  
- [B] Load tables  
- [C] Core dedupe (latest per `article_id`)  
- [D] 1:n → 1:1 Aggregations  
- [E] Central Join Zone  
- [F] Filter Zone  
- [G] Output Zone  


## [A] Config & Imports

In [0]:
from functools import reduce
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Catalog and schema of the silver layer that all source tables live in.
DB = "swi_audience_prd.pdp_articles_v2_silver"

# Fully qualified names of the silver tables read by this notebook.
T_CORE = DB + ".core"
T_CONTENT_TEXT = DB + ".content_text"
T_RESOURCES_LINKS = DB + ".resources_links"
T_EXISTS_AS = DB + ".exists_as"
T_RESOURCES_PICTURES = DB + ".resources_pictures"
T_CONTRIBUTORS_BASE = DB + ".contributors_base"
T_TITLE_ITEMS = DB + ".title_items"

# Publisher value that the filter zone compares the `publisher` column against.
FILTER_PUBLISHER = "SWI"


## [B] Load Tables

In [0]:
# Load every silver source table as a DataFrame; the order of the names on the
# left matches the order of the table constants on the right.
(
    core_df,
    content_text_df,
    resources_links_df,
    exists_as_df,
    resources_pictures_df,
    contributors_base_df,
    title_items_df,
) = (
    spark.table(table_name)
    for table_name in (
        T_CORE,
        T_CONTENT_TEXT,
        T_RESOURCES_LINKS,
        T_EXISTS_AS,
        T_RESOURCES_PICTURES,
        T_CONTRIBUTORS_BASE,
        T_TITLE_ITEMS,
    )
)


## [C] Core Dedupe

In [0]:
# Keep exactly one row per article_id: the one with the greatest
# source_timestamp (rows with a NULL timestamp sort last).
# NOTE(review): rows sharing the same article_id and source_timestamp are
# ranked arbitrarily by row_number() - confirm whether such ties can occur.
_newest_first = F.col("source_timestamp").desc_nulls_last()
w_latest = Window.partitionBy("article_id").orderBy(_newest_first)

_ranked_core_df = core_df.withColumn("_rn_latest", F.row_number().over(w_latest))
core_latest_df = (
    _ranked_core_df
    .where(F.col("_rn_latest") == 1)
    .drop("_rn_latest")
)


## [D] Aggregations (1:n → 1:1)

In [0]:
# Collapse content_text (1:n per article) into one newline-joined string per
# article_id.
schema_fields = {f.name: f.dataType for f in content_text_df.schema.fields}

# True when the `text` column already holds an array per row.
text_is_array = (
    "text" in schema_fields and schema_fields["text"].typeName() == "array"
)

# After groupBy every non-grouping column must be wrapped in an aggregate.
# The previous array branch passed the bare column to concat_ws, which Spark
# rejects. collect_list gathers the rows of one article; for an array column
# this yields an array of arrays, so it is flattened before joining.
# NOTE(review): collect_list gives no ordering guarantee after the shuffle, so
# the text fragments of an article may be joined in arbitrary order - confirm
# whether content_text has a position column to sort by.
collected_text = F.collect_list(F.col("text"))
if text_is_array:
    collected_text = F.flatten(collected_text)

content_text_agg_df = (
    content_text_df
    .groupBy("article_id")
    .agg(F.concat_ws("\n", collected_text).alias("text"))
)


In [0]:
# One row per article: the set of distinct link URLs plus a single
# representative URL.
# NOTE(review): F.first after a groupBy depends on row order, which Spark does
# not guarantee after a shuffle - url_first may differ between runs.
_links_by_article = (
    resources_links_df
    .select("article_id", "url")
    .groupBy("article_id")
)
resources_links_agg_df = _links_by_article.agg(
    F.collect_set(F.col("url")).alias("urls"),
    F.first(F.col("url"), ignorenulls=True).alias("url_first"),
)


In [0]:
# One row per article carrying a single non-null exists_as value.
# NOTE(review): if an article has several different exists_as values, which one
# F.first returns is not guaranteed - confirm the table is effectively 1:1.
_exists_as_by_article = (
    exists_as_df
    .select("article_id", "exists_as")
    .groupBy("article_id")
)
exists_as_agg_df = _exists_as_by_article.agg(
    F.first(F.col("exists_as"), ignorenulls=True).alias("exists_as")
)


In [0]:
# One row per article: the set of distinct picture URLs plus one representative.
# The url column is renamed to resource_url before aggregating.
_pictures_by_article = (
    resources_pictures_df
    .select("article_id", F.col("url").alias("resource_url"))
    .groupBy("article_id")
)
resources_pictures_agg_df = _pictures_by_article.agg(
    F.collect_set(F.col("resource_url")).alias("picture_urls"),
    F.first(F.col("resource_url"), ignorenulls=True).alias("picture_url_first"),
)


In [0]:
# One row per article: the set of distinct contributor names plus one
# representative name.
_contributors_by_article = (
    contributors_base_df
    .select("article_id", "person_name")
    .groupBy("article_id")
)
contributors_agg_df = _contributors_by_article.agg(
    F.collect_set(F.col("person_name")).alias("contributors"),
    F.first(F.col("person_name"), ignorenulls=True).alias("contributor_first"),
)


In [0]:
# One row per article with the set of distinct languages found in title_items.
_title_items_by_article = (
    title_items_df
    .select("article_id", "language")
    .groupBy("article_id")
)
title_items_agg_df = _title_items_by_article.agg(
    F.collect_set(F.col("language")).alias("languages")
)


## [E] Central Join Zone

In [0]:
# Aggregated 1:1 frames that are attached to the deduplicated core rows.
dfs_to_join = [
    content_text_agg_df,
    resources_links_agg_df,
    exists_as_agg_df,
    resources_pictures_agg_df,
    contributors_agg_df,
    title_items_agg_df,
]

# Left-join each frame on article_id in list order, starting from the core
# rows, so every core article is kept even without a match.
joined_df = core_latest_df
for agg_df in dfs_to_join:
    joined_df = joined_df.join(agg_df, on="article_id", how="left")


## [F] Filter Zone

In [0]:
# Keep only rows whose publisher differs from FILTER_PUBLISHER. Rows with a
# NULL publisher are dropped too, because the comparison evaluates to NULL.
# NOTE(review): this excludes the configured publisher, while the result is
# later registered under a view name containing "swi" - confirm that excluding
# (rather than selecting) this publisher is intended.
_is_other_publisher = F.col("publisher") != FILTER_PUBLISHER
filtered_df = joined_df.where(_is_other_publisher)


## [G] Output Zone

In [0]:
# Newest articles first; rows without a timestamp go to the end.
filtered_df = filtered_df.sort(F.col("source_timestamp").desc_nulls_last())

# Expose the result to SQL cells under a session-scoped temp view.
filtered_df.createOrReplaceTempView("pdp_articles_swi_enriched")

# Uncomment to preview the result in the notebook.
#display(filtered_df)

# Content Factory Table

In [0]:
from pyspark.sql import Row

# Target languages and target platforms; every article is later combined with
# each (language, platform) pair.
languages = ["ar", "de", "en", "es", "fr", "it", "ja", "pt", "rm", "ru", "zh"]
platforms = ["swissinfo.ch","Facebook","Instagram","X","Whatsapp"]

# Single-column lookup frames built from the two lists above.
_language_rows = [Row(language=language_code) for language_code in languages]
_platform_rows = [Row(platform=platform_name) for platform_name in platforms]

languages_df = spark.createDataFrame(_language_rows)
platforms_df = spark.createDataFrame(_platform_rows)


In [0]:
# Narrow the enriched articles down to the columns the prompt building needs.
base_df = filtered_df.select(
    "article_id",
    "text",
    "url_first",
    "source_timestamp",
)

In [0]:
# Fan out every article to one row per (language, platform) combination.
_per_language_df = base_df.crossJoin(languages_df)
expanded_df = _per_language_df.crossJoin(platforms_df)


In [0]:
%skip
from pyspark.sql.functions import concat, lit

# Simple prompt variant (this cell is preceded by a %skip marker): one
# instruction sentence naming the row's language and platform, followed by the
# article text.
_simple_prompt = concat(
    lit("Translate the following news article into "),
    F.col("language"),
    lit(". Then adapt it stylistically for the platform "),
    F.col("platform"),
    lit(" in the tone and editorial style of swissinfo.ch.\n\nARTICLE:\n"),
    F.col("text"),
)

# Attach the prompt and sort newest articles first (NULL timestamps last).
prompt_df = (
    expanded_df
    .withColumn("prompt", _simple_prompt)
    .orderBy(F.col("source_timestamp").desc_nulls_last())
)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import concat, lit

# Build the generation prompt for every (article, language, platform) row.
#
# concat() returns NULL as soon as one of its arguments is NULL. url_first
# comes from a left join and is NULL for articles without a link row, which
# previously wiped out the whole prompt. Fall back to an empty string so the
# prompt survives.
# NOTE(review): rows whose `text` is NULL still produce a NULL prompt, since
# there is no article to send.
# NOTE(review): the style block below is the same for every platform although
# the task text refers to "PLATFORM RULES" - confirm this is intended.
source_url = F.coalesce(F.col("url_first"), lit(""))

prompt_df = (
    expanded_df
    .withColumn(
        "prompt",
        concat(
            # ROLE
            lit("You are a professional editor for swissinfo.ch.\n\n"),

            # STYLE BLOCK
            lit("Full-length neutral journalistic article in swissinfo.ch style.\n"),
            lit("At the end add this sentence in the TARGET LANGUAGE:\n"),
            lit("\"This text was created with the help of artificial intelligence based on the original article available at: "),
            source_url,
            lit("\"\n\n"),

            # TASK
            lit("TASK:\n"),
            lit("1) Translate the ARTICLE into the TARGET LANGUAGE.\n"),
            lit("2) Rewrite it according to the PLATFORM RULES.\n"),
            lit("3) The entire output MUST be in the TARGET LANGUAGE.\n"),
            lit("4) Follow the platform rules exactly.\n\n"),

            # STRICT OUTPUT RULES
            lit("STRICT OUTPUT RULES:\n"),
            lit("- Output ONLY the final platform text.\n"),
            lit("- No explanations.\n"),
            lit("- No headings.\n"),
            lit("- No meta comments.\n"),
            lit("- Wrap your output EXACTLY like this:\n"),
            lit("<FINAL>\n"),
            lit("...text...\n"),
            lit("</FINAL>\n\n"),

            # DYNAMIC CONTEXT
            lit("TARGET LANGUAGE: "),
            F.col("language"),
            lit("\n"),
            lit("PLATFORM: "),
            F.col("platform"),
            lit("\n\n"),

            # ARTICLE
            lit("ARTICLE:\n"),
            F.col("text")
        )
    )
)

# Newest articles first; rows without a timestamp go to the end.
prompt_df = prompt_df.orderBy(F.col("source_timestamp").desc_nulls_last())

In [0]:
#display(prompt_df.limit(100))

In [0]:
%skip
# ============================================================
# AI GENERATION PIPELINE WITH PLATFORM-SPECIFIC ENDING RULES
# ============================================================
# Builds one prompt per row of expanded_df using a per-platform rule text,
# sends a sample through ai_gen and keeps the text between the <FINAL> tags.

from pyspark.sql import functions as F
from pyspark.sql import Row

# ------------------------------------------------------------
# Step 1: platform rules (content + ending rules)
# ------------------------------------------------------------
# The keys are joined against the `platform` column in step 2; `{url}` is a
# placeholder that step 3 replaces with the row's url_first.

platform_rules = {
    "swissinfo.ch": """Full-length neutral journalistic article in swissinfo.ch style.
At the end add this sentence in the TARGET LANGUAGE:
"This text was created with the help of artificial intelligence based on the original article available at: {url}"
""",

    "Facebook": """Engaging but neutral Facebook post. Max 1500 characters.
At the end add this sentence in the TARGET LANGUAGE:
"Read the full article at: {url}"
""",

    "Instagram": """Short engaging Instagram caption. Max 1000 characters.
Add 3 relevant hashtags at the end.
After the hashtags add this sentence in the TARGET LANGUAGE:
"Read the full article on our website."
""",

    "X": """Very concise news post. Max 280 characters.
At the end add this sentence in the TARGET LANGUAGE:
"Read the full article at: {url}"
""",

    "Whatsapp": """Short neutral news summary. Max 800 characters.
At the end add this sentence in the TARGET LANGUAGE:
"Read the full article at: {url}"
"""
}

# One row per platform with its rule text.
rules_df = spark.createDataFrame(
    [Row(platform=k, platform_rule=v) for k, v in platform_rules.items()]
)

# ------------------------------------------------------------
# Step 2: attach the platform rules
# ------------------------------------------------------------
# Left join: rows whose platform has no rule keep a NULL platform_rule.

expanded_with_rules_df = (
    expanded_df
    .join(rules_df, on="platform", how="left")
)

# ------------------------------------------------------------
# Step 3: insert the URL dynamically
# ------------------------------------------------------------
# NOTE(review): url_first is used without a coalesce here; presumably the
# result is NULL when url_first is NULL - confirm.

expanded_with_rules_df = expanded_with_rules_df.withColumn(
    "platform_rule_with_url",
    F.expr("replace(platform_rule, '{url}', url_first)")
)

# ------------------------------------------------------------
# Step 4: build the prompt (very strict!)
# ------------------------------------------------------------
# Fixed instruction header, then the row's language, platform rule and
# article text.

prompt_df_final = (
    expanded_with_rules_df
    .withColumn(
        "prompt",
        F.concat(
            F.lit(
"""You are a professional editor for swissinfo.ch.

TASK:
1) Translate the ARTICLE into the TARGET LANGUAGE.
2) Rewrite it according to the PLATFORM RULES.
3) The entire output MUST be in the TARGET LANGUAGE.
4) Follow the platform rules exactly.

STRICT OUTPUT RULES:
- Output ONLY the final platform text.
- No explanations.
- No headings.
- No meta comments.
- Wrap your output EXACTLY like this:

<FINAL>
...text...
</FINAL>

TARGET LANGUAGE: """
            ),
            F.col("language"),
            F.lit("\nPLATFORM RULES:\n"),
            F.col("platform_rule_with_url"),
            F.lit("\n\nARTICLE:\n"),
            F.col("text")
        )
    )
)

# ------------------------------------------------------------
# Step 5: test run on 200 rows only
# ------------------------------------------------------------

test_df = prompt_df_final.limit(200)

# ------------------------------------------------------------
# Step 6: AI call
# ------------------------------------------------------------
# Calls the SQL function ai_gen on the prompt column of every sampled row.

result_df = test_df.withColumn(
    "raw_output",
    F.expr("ai_gen(prompt)")
)

# ------------------------------------------------------------
# Step 7: extract the FINAL text
# ------------------------------------------------------------
# Capture group 1 is the content between <FINAL> and </FINAL>; (?s) lets "."
# match newlines. When nothing was extracted, raw_output is used instead.
# The `text` column is overwritten with the generated text.

final_df = (
    result_df
    .withColumn(
        "extracted",
        F.regexp_extract(
            F.col("raw_output"),
            r"(?s)<FINAL>\s*(.*?)\s*</FINAL>",
            1
        )
    )
    .withColumn(
        "text",
        F.when(F.length("extracted") > 0, F.col("extracted"))
         .otherwise(F.col("raw_output"))
    )
    .select(
        "article_id",
        "language",
        "platform",
        "text"
    )
)

display(final_df)


In [0]:
%skip
# ============================================================
# AI GENERATION PIPELINE (STARTING FROM prompt_df)
# ============================================================
# Same pipeline as the expanded_df-based variant, but it starts from
# prompt_df and replaces the existing prompt column.

from pyspark.sql import functions as F
from pyspark.sql import Row

# ------------------------------------------------------------
# Step 1: platform rules (content + ending rules)
# ------------------------------------------------------------
# The keys are joined against the `platform` column in step 2; `{url}` is a
# placeholder that step 3 replaces with the row's url_first.

platform_rules = {
    "swissinfo.ch": """Full-length neutral journalistic article in swissinfo.ch style.
At the end add this sentence in the TARGET LANGUAGE:
"This text was created with the help of artificial intelligence based on the original article available at: {url}"
""",

    "Facebook": """Engaging but neutral Facebook post. Max 1500 characters.
At the end add this sentence in the TARGET LANGUAGE:
"Read the full article at: {url}"
""",

    "Instagram": """Short engaging Instagram caption. Max 1000 characters.
Add 3 relevant hashtags at the end.
After the hashtags add this sentence in the TARGET LANGUAGE:
"Read the full article on our website."
""",

    "X": """Very concise news post. Max 280 characters.
At the end add this sentence in the TARGET LANGUAGE:
"Read the full article at: {url}"
""",

    "Whatsapp": """Short neutral news summary. Max 800 characters.
At the end add this sentence in the TARGET LANGUAGE:
"Read the full article at: {url}"
"""
}

# One row per platform with its rule text.
rules_df = spark.createDataFrame(
    [Row(platform=k, platform_rule=v) for k, v in platform_rules.items()]
)

# ------------------------------------------------------------
# Step 2: attach the platform rules to prompt_df (instead of expanded_df)
# ------------------------------------------------------------

# NOTE: this rebinds the notebook-level name base_df to prompt_df.
base_df = prompt_df  # <- this is now the entry point

with_rules_df = (
    base_df
    .join(rules_df, on="platform", how="left")
)

# Optional: set a fallback rule when the platform did not match any rule
with_rules_df = with_rules_df.withColumn(
    "platform_rule",
    F.coalesce(
        F.col("platform_rule"),
        F.lit("""Neutral journalistic text.
At the end add this sentence in the TARGET LANGUAGE:
"Read the full article at: {url}"
""")
    )
)

# ------------------------------------------------------------
# Step 3: insert the URL dynamically
# ------------------------------------------------------------
# A NULL url_first is replaced by an empty string before the substitution.

with_rules_df = with_rules_df.withColumn(
    "platform_rule_with_url",
    F.expr("replace(platform_rule, '{url}', coalesce(url_first, ''))")
)

# ------------------------------------------------------------
# Step 4: build the prompt (cleanly overwrites the existing prompt column)
# ------------------------------------------------------------

prompt_df_final = (
    with_rules_df
    .withColumn(
        "prompt",
        F.concat(
            F.lit(
"""You are a professional editor for swissinfo.ch.

TASK:
1) Translate the ARTICLE into the TARGET LANGUAGE.
2) Rewrite it according to the PLATFORM RULES.
3) The entire output MUST be in the TARGET LANGUAGE.
4) Follow the platform rules exactly.

STRICT OUTPUT RULES:
- Output ONLY the final platform text.
- No explanations.
- No headings.
- No meta comments.
- Wrap your output EXACTLY like this:

<FINAL>
...text...
</FINAL>

TARGET LANGUAGE: """
            ),
            F.col("language"),
            F.lit("\nPLATFORM RULES:\n"),
            F.col("platform_rule_with_url"),
            F.lit("\n\nARTICLE:\n"),
            F.col("text")
        )
    )
)

# ------------------------------------------------------------
# Step 5: test run on 200 rows only
# ------------------------------------------------------------

test_df = prompt_df_final.limit(200)

# ------------------------------------------------------------
# Step 6: AI call
# ------------------------------------------------------------
# Calls the SQL function ai_gen on the prompt column of every sampled row.

result_df = test_df.withColumn(
    "raw_output",
    F.expr("ai_gen(prompt)")
)

# ------------------------------------------------------------
# Step 7: extract the FINAL text
# ------------------------------------------------------------
# Capture group 1 is the content between <FINAL> and </FINAL>; (?s) lets "."
# match newlines. When nothing was extracted, raw_output is used instead.
# The `text` column is overwritten with the generated text.

final_df = (
    result_df
    .withColumn(
        "extracted",
        F.regexp_extract(
            F.col("raw_output"),
            r"(?s)<FINAL>\s*(.*?)\s*</FINAL>",
            1
        )
    )
    .withColumn(
        "text",
        F.when(F.length("extracted") > 0, F.col("extracted"))
         .otherwise(F.col("raw_output"))
    )
    .select(
        "article_id",
        "language",
        "platform",
        "text"
    )
)

display(final_df)

In [0]:
from pyspark.sql import functions as F

# 1) Test run (e.g. 200 rows).
# Rows with a NULL prompt (for example articles without text) carry nothing to
# generate from. Drop them before sampling so they neither use up the sample
# nor end up as empty rows in the output.
test_df = (
    prompt_df
    .filter(F.col("prompt").isNotNull())
    .limit(200)
)

# 2) AI call (uses the existing prompt column of prompt_df)
result_df = test_df.withColumn(
    "raw_output",
    F.expr("ai_gen(prompt)")
)

# 3) Extract <FINAL>...</FINAL> when present, otherwise keep raw_output.
#    Capture group 1 is the content between the tags; (?s) lets "." match
#    newlines. The `text` column is overwritten with the generated text.
final_df = (
    result_df
    .withColumn(
        "extracted",
        F.regexp_extract(
            F.col("raw_output"),
            r"(?s)<FINAL>\s*(.*?)\s*</FINAL>",
            1
        )
    )
    .withColumn(
        "text",
        F.when(F.length("extracted") > 0, F.col("extracted"))
         .otherwise(F.col("raw_output"))
    )
    .select("article_id", "language", "platform", "text")
)

# Uncomment to preview the result in the notebook.
#display(final_df)

In [0]:
# ============================================================
# SIMPLE DELTA WRITE (Append)
# ============================================================
# Stamp every generated row with the write time and append it to the gold
# table. Because the mode is "append", each run of this cell adds another
# batch of rows; nothing is overwritten.

_stamped_df = final_df.withColumn("created_at", F.current_timestamp())

_delta_writer = (
    _stamped_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
)
_delta_writer.saveAsTable("swi_audience_prd.pdp_articles_v2_gold.ContentFactoryTable")