In [1]:
from pyspark.sql import SparkSession

# Obtain a Spark session for this notebook (getOrCreate hands back an existing one if present).
spark = (
    SparkSession.builder
    .appName("Data Quality")
    .getOrCreate()
)
# Bare last expression: lets the notebook render the session object.
spark


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import random

# Obtain the Spark session used by the cells below
spark = (
    SparkSession.builder
    .appName("FakeTrainingData")
    .getOrCreate()
)

# Reproducibility: a dedicated, seeded generator yields the same synthetic rows on
# every Restart & Run All, without touching the global `random` module state.
RANDOM_SEED = 42
N_SAMPLES = 543  # total number of rows in the synthetic training set

# Hand-written starting rows; field order matches `columns` below.
data = [
    (44, 9, 21.0),
    (10, 5, 53.0),
    (37, 6, 80.0),
    (19, 10, 11.0),
    (46, 13, 215.0)
]

# Pad with random rows until the list holds N_SAMPLES entries.
rng = random.Random(RANDOM_SEED)
for _ in range(N_SAMPLES - len(data)):
    totalAdClicks = rng.randint(1, 60)            # integer in [1, 60]
    totalBuyClicks = rng.randint(1, 15)           # integer in [1, 15]
    totalRevenue = round(rng.uniform(5, 250), 1)  # float in [5, 250], one decimal place
    data.append((totalAdClicks, totalBuyClicks, totalRevenue))

# Column names, in the same order as the tuple fields above
columns = ["SumAddClicks", "SumBuyClicks", "Revenue"]




In [6]:
# Build the Spark DataFrame from the in-memory rows, naming the columns from `columns`
trainingDF = spark.createDataFrame(data, schema=columns)

# Preview the first 5 rows
trainingDF.show(n=5)

# Dimensions as a (row count, column count) tuple
rows, cols = trainingDF.count(), len(trainingDF.columns)
print((rows, cols))

+------------+------------+-------+
|SumAddClicks|SumBuyClicks|Revenue|
+------------+------------+-------+
|          44|           9|   21.0|
|          10|           5|   53.0|
|          37|           6|   80.0|
|          19|          10|   11.0|
|          46|          13|  215.0|
+------------+------------+-------+
only showing top 5 rows

(543, 3)


In [7]:
# `rows` is a plain int (the result of `.count()`), and a PySpark DataFrame has no
# pandas-style `.shape`, so build the (rows, columns) tuple explicitly.
(trainingDF.count(), len(trainingDF.columns))

(543, 3)

In [None]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr, concat, lit
from pyspark.sql.types import StructType, StringType, DoubleType, LongType
import pyspark.sql.functions as F
from kafka import KafkaProducer

# ----------------------------
# Config
# ----------------------------
# Kafka connection and topic names
BOOTSTRAP = "broker:29094"   # inside Docker network
TXN_TOPIC, METRICS_TOPIC = "transaction", "metrics"

# Card number that the "salting" join strategy spreads across several salt values
SKEW_CARD = "4111-1111-1111-1111"

# Join strategy to run: "baseline", "broadcast", or "salting"
mode = "salting"

# Whether adaptive query execution should be switched on for the session
enable_aqe = True
# ----------------------------
# Spark Session
# ----------------------------
# NOTE(review): getOrCreate() returns an already-running session if the kernel has
# one; builder options such as spark.jars.packages may then not be applied —
# confirm this cell is run on a fresh kernel.
spark = (
    SparkSession.builder
    .appName("fraud-detection-demo")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3")
    .config("spark.sql.shuffle.partitions", "6")
    .getOrCreate()
)

print("✅ Spark ready:", spark.version)

# Write the flag in both directions. Adaptive query execution is enabled by default
# in recent Spark releases, so only setting it when enable_aqe is True would make
# enable_aqe = False a silent no-op.
spark.conf.set("spark.sql.adaptive.enabled", "true" if enable_aqe else "false")

# ----------------------------
# Input schema & Kafka source
# ----------------------------
# Fields expected in the JSON document carried by each Kafka record value
schema = (
    StructType()
    .add("txn_id", LongType())
    .add("card_number", StringType())
    .add("amount", DoubleType())
    .add("merchant", StringType())
    .add("ts", LongType())
)

# Streaming read of the transaction topic, starting from the earliest offsets
kdf = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", BOOTSTRAP)
    .option("subscribe", TXN_TOPIC)
    .option("startingOffsets", "earliest")
    .load()
)

# Cast the record value to a string, parse it against `schema`, then flatten the
# parsed struct so each field becomes a top-level column.
parsed_value = from_json(col("value").cast("string"), schema).alias("j")
txn_df = kdf.select(parsed_value).select("j.*")

# ----------------------------
# Lookup table (risk profiles)
# ----------------------------
# Static (card_number, risk_level) pairs used as the lookup side of the join
risk_rows = [
    ("4111-1111-1111-1111", "high"),
    ("4000-0000-0000-0002", "medium"),
    ("4000-0000-0000-0003", "low"),
]
risk_profiles = spark.createDataFrame(risk_rows, schema=["card_number", "risk_level"])

# ----------------------------
# Join strategies
# ----------------------------
# Each branch produces `joined`: the transaction stream left-joined to the risk
# lookup table. The branches differ only in how the join is carried out.
if mode == "baseline":
    # Plain left join on card_number
    joined = txn_df.join(risk_profiles, "card_number", "left")

elif mode == "broadcast":
    # Same join, with a broadcast hint on the lookup table
    joined = txn_df.join(F.broadcast(risk_profiles), "card_number", "left")

elif mode == "salting":
    SALT_N = 6
    salts = spark.range(0, SALT_N).selectExpr("id as salt")

    # Replicate every lookup row once per salt value, keyed "<card_number>_<salt>"
    lookup_salted = risk_profiles.crossJoin(salts) \
        .withColumn("salted_card", concat(col("card_number"), lit("_"), col("salt"))) \
        .select("salted_card", "risk_level")

    # Rows for SKEW_CARD get a random salt in [0, SALT_N); all other cards use
    # salt 0, which also exists in the replicated lookup table.
    salted_stream = txn_df.withColumn(
        "salt",
        expr(f"CASE WHEN card_number='{SKEW_CARD}' THEN floor(rand()*{SALT_N}) ELSE 0 END")
    ).withColumn("salted_card", concat(col("card_number"), lit("_"), col("salt")))

    # The helper key is dropped after the join; the `salt` column is kept.
    joined = salted_stream.join(
        lookup_salted,
        salted_stream.salted_card == lookup_salted.salted_card,
        "left"
    ).drop("salted_card")

else:
    # Fail here with a clear message instead of a NameError on `joined` further down.
    raise ValueError(
        f"Unknown mode {mode!r}; expected 'baseline', 'broadcast' or 'salting'"
    )

# ----------------------------
# Metrics sender
# ----------------------------
def send_metrics(batch_df, batch_id):
    """Count the rows of one micro-batch and publish the counts to Kafka.

    Builds a dict with the batch id, the active join ``mode``, the total row
    count, the number of rows whose ``risk_level`` is ``'high'`` and their
    ratio (rounded to 3 decimals, 0 for an empty batch). The dict is printed
    and sent as UTF-8 JSON to ``METRICS_TOPIC`` on the ``BOOTSTRAP`` broker.

    Args:
        batch_df: DataFrame for the micro-batch; must contain ``risk_level``.
        batch_id: Identifier of the micro-batch, converted with ``int()``.
    """
    # One aggregation pass instead of two separate count() actions, so the
    # batch is evaluated once for both numbers. `when` without `otherwise`
    # yields NULL for non-matching rows, and count() skips NULLs.
    counts = batch_df.agg(
        F.count(F.lit(1)).alias("total"),
        F.count(F.when(F.col("risk_level") == "high", 1)).alias("risky"),
    ).first()
    total, risky = counts["total"], counts["risky"]

    metrics = {
        "batch_id": int(batch_id),
        "mode": mode,
        "total_txns": int(total),
        "high_risk_txns": int(risky),
        "fraud_ratio": round(risky / total, 3) if total > 0 else 0
    }
    print(f"[Metrics] {metrics}")

    producer = KafkaProducer(
        bootstrap_servers=BOOTSTRAP,
        value_serializer=lambda v: json.dumps(v).encode("utf-8")
    )
    try:
        producer.send(METRICS_TOPIC, value=metrics)
        producer.flush()
    finally:
        # Always release the producer, even if send/flush raises.
        producer.close()

# ----------------------------
# Write stream
# ----------------------------
def process_batch(batch_df, batch_id):
    """Show the first 5 rows of the micro-batch, then publish its metrics."""
    batch_df.show(5, truncate=False)
    send_metrics(batch_df, batch_id)


query = (
    joined.writeStream
    .outputMode("append")
    .foreachBatch(process_batch)
    .option("checkpointLocation", f"/opt/output/fraud_checkpoint_{mode}")
    .start()
)

# Let the demo run for up to 60 seconds. The finally clause stops the query even
# if the wait is interrupted or raises, so no stream is left running in the kernel.
try:
    query.awaitTermination(60)
finally:
    query.stop()
