In [None]:
# Module 8: ML with Streaming - Environment Setup
print("Setting up Machine Learning + Streaming Environment...")

import json
import os
import pickle
import time
from datetime import datetime, timedelta
from typing import Iterator

import numpy as np
import pandas as pd

# Core PySpark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.streaming import StreamingQuery

# MLlib imports for machine learning
from pyspark.ml import Pipeline, PipelineModel, Transformer
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.clustering import KMeans
from pyspark.ml.recommendation import ALS

# Advanced ML imports
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import udf, pandas_udf, PandasUDFType

# Configure Spark for ML + Streaming workloads.
# Every session option lives in one mapping and is applied in insertion order.
# NOTE(review): "spark.ml.streaming.numPartitions" does not look like a key
# Spark documents; it is passed through unchanged -- confirm it is needed.
session_options = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.sql.streaming.checkpointLocation": "/tmp/ml-streaming-checkpoints",
    "spark.sql.streaming.forceDeleteTempCheckpointLocation": "true",
    "spark.default.parallelism": "4",
    "spark.sql.shuffle.partitions": "4",
    "spark.ml.streaming.numPartitions": "4",
}

session_builder = SparkSession.builder.appName("PySpark-ML-Streaming-Integration")
for option_name, option_value in session_options.items():
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

# Only errors are logged so the streaming console output stays readable.
spark.sparkContext.setLogLevel("ERROR")

print("ML + Streaming Session Created")
print("Spark Version: {}".format(spark.version))
print("ML streaming checkpoint location: /tmp/ml-streaming-checkpoints")

# Scratch locations for saved models and outputs (created if missing).
model_dir = "/tmp/ml_streaming_models"
output_dir = "/tmp/ml_streaming_output"
for scratch_dir in (model_dir, output_dir):
    os.makedirs(scratch_dir, exist_ok=True)

print(f"Model directory: {model_dir}")
print(f"Output directory: {output_dir}")

# Echo the effective value of a few settings; unset keys print "Not Set".
print("\nML + Streaming Configuration:")
ml_configs = [
    "spark.sql.adaptive.enabled",
    "spark.sql.streaming.checkpointLocation",
    "spark.default.parallelism",
    "spark.ml.streaming.numPartitions",
]

for config in ml_configs:
    print("   {}: {}".format(config, spark.conf.get(config, "Not Set")))

print("\nML + Streaming environment ready!")
print("Ready for real-time machine learning pipelines and inference!")

In [None]:
# Real-time Model Inference on Streaming Data
print("Setting up real-time ML inference pipeline...")

print("=== 1. Training Initial Model ===")

# Feature columns fed to the model; the label column is appended for the schema.
feature_cols = ["transaction_velocity", "age", "num_locations", "amount"]

# Tiny hand-written training set: label 0 = normal, 1 = fraudulent.
# Row layout: (transaction_velocity, age, num_locations, amount, is_fraud)
labelled_transactions = [
    (1.2, 45.0, 2, 150.0, 0),
    (15.5, 67.0, 1, 2500.0, 1),
    (0.8, 23.0, 3, 89.0, 0),
    (22.3, 45.0, 1, 5000.0, 1),
    (2.1, 34.0, 2, 200.0, 0),
    (18.7, 56.0, 1, 3200.0, 1),
    (1.5, 29.0, 4, 95.0, 0),
    (25.1, 78.0, 1, 7500.0, 1),
    (0.9, 41.0, 3, 120.0, 0),
    (19.8, 52.0, 1, 4100.0, 1),
]
training_data = spark.createDataFrame(labelled_transactions, feature_cols + ["is_fraud"])

print("Created training dataset with fraud detection features")

# Pipeline stages: assemble the feature vector -> standardise -> logistic regression.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
lr = LogisticRegression(
    featuresCol="scaled_features",
    labelCol="is_fraud",
    predictionCol="fraud_prediction",
)

pipeline = Pipeline(stages=[assembler, scaler, lr])
model = pipeline.fit(training_data)

# Persist the fitted pipeline (replacing any earlier copy) for the streaming step.
model_path = f"{model_dir}/fraud_detection_model"
model.write().overwrite().save(model_path)
print(f"Fraud detection model trained and saved to: {model_path}")

# Sanity check: score the rows the model was trained on.
predictions = model.transform(training_data)
predictions.select("amount", "is_fraud", "fraud_prediction", "probability").show()

print("=== 2. Real-time Streaming Inference ===")

# Simulated transaction feed: the built-in "rate" source emits (timestamp, value)
# rows, and random columns with the same names as the training features are
# layered on top of it.
streaming_transactions = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 3)
    .load()
    .withColumn("transaction_velocity", (rand() * 30))
    .withColumn("age", (rand() * 80 + 18))
    .withColumn("num_locations", (rand() * 5 + 1).cast("int"))
    .withColumn("amount", (rand() * 10000 + 50))
    .withColumn("transaction_id", concat(lit("txn_"), col("value").cast("string")))
    .withColumn("transaction_time", col("timestamp"))
    .select("transaction_id", "transaction_velocity", "age", "num_locations", "amount", "transaction_time")
)

print("Created streaming transaction data source")

# The artifact on disk is a *fitted* pipeline, so it has to be read back with
# PipelineModel.load; Pipeline.load expects an unfitted Pipeline.
loaded_model = PipelineModel.load(model_path)

# `probability` is an ML vector column; vectors do not support getItem(), so it
# is converted to an array first. Index 1 is the probability of class 1 (fraud).
fraud_probability = vector_to_array(col("probability")).getItem(1)

# Risk tiers. The classifier uses the default 0.5 decision threshold, so
# fraud_prediction == 1.0 already covers every row with probability > 0.5.
# Testing the stricter 0.7 cut-off first keeps all three tiers reachable:
#   HIGH   -> fraud probability above 0.7
#   MEDIUM -> predicted fraud, but probability at most 0.7
#   LOW    -> predicted legitimate
fraud_predictions = (
    loaded_model.transform(streaming_transactions)
    .withColumn(
        "fraud_risk",
        when(fraud_probability > 0.7, "HIGH")
        .when(col("fraud_prediction") == 1.0, "MEDIUM")
        .otherwise("LOW"),
    )
    .select("transaction_id", "amount", "fraud_prediction", "fraud_risk", "transaction_time")
)

# Score each micro-batch and print it to the console every 5 seconds.
inference_query = (
    fraud_predictions.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", False)
    .trigger(processingTime='5 seconds')
    .start()
)

print("Real-time fraud detection started!")
print("Processing transactions and predicting fraud risk...")

# Let the inference run for demonstration; the query is stopped by the next cell.
time.sleep(20)

print("\nReal-time ML inference demonstration complete!")
print("Showed how to apply pre-trained models to streaming data")

In [None]:
# Online Learning and Model Updates
print("Setting up online learning pipeline...")

# The inference query from the previous cell is still running; shut it down
# and give it a moment to wind down before starting new queries.
inference_query.stop()
time.sleep(2)

print("=== 3. Incremental Learning with Streaming Data ===")

# Ground-truth rule for the synthetic labels: positive only when both
# feature1 > 5 and feature2 > 10.
is_positive_example = (col("feature1") > 5) & (col("feature2") > 10)

# Labelled synthetic stream built on the "rate" source (2 rows per second).
online_learning_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 2)
    .load()
    .withColumn("feature1", rand() * 10)
    .withColumn("feature2", rand() * 20)
    .withColumn("feature3", (rand() * 100).cast("int"))
    .withColumn("true_label", when(is_positive_example, 1).otherwise(0))
    .withColumn("batch_id", (col("value") / 10).cast("int"))
    .select("feature1", "feature2", "feature3", "true_label", "batch_id", "timestamp")
)

print("Created streaming data source with ground truth labels")

# Define a function to retrain model on each batch
def update_model(batch_df, batch_id):
    """
    Fit and evaluate a logistic regression on one streaming micro-batch.

    Intended as a ``foreachBatch`` sink. Note that this is a per-batch refit:
    a brand-new LogisticRegression is trained on the current batch only, and
    nothing is carried over from earlier batches.

    Args:
        batch_df: Micro-batch DataFrame with columns ``feature1``,
            ``feature2``, ``feature3`` and ``true_label``.
        batch_id: Micro-batch identifier supplied by Structured Streaming;
            used only in log messages.

    Returns:
        None. Batch statistics, training accuracy and model coefficients are
        printed. Training errors are reported and swallowed so that a single
        bad batch does not terminate the streaming query.
    """
    print(f"\n--- Processing Batch {batch_id} for Model Update ---")

    # The batch is used by several Spark actions below; cache it so the
    # micro-batch is not recomputed for each one.
    batch_df.persist()
    try:
        batch_size = batch_df.count()
        if batch_size == 0:
            print("Empty batch, skipping model update")
            return

        # Show current batch statistics
        print(f"Batch size: {batch_size} records")
        batch_df.groupBy("true_label").count().show()

        # Prepare features for this batch
        feature_assembler = VectorAssembler(
            inputCols=["feature1", "feature2", "feature3"],
            outputCol="features"
        )
        batch_features = feature_assembler.transform(batch_df)

        lr_incremental = LogisticRegression(
            featuresCol="features",
            labelCol="true_label",
            predictionCol="prediction",
            maxIter=10
        )

        try:
            # Train model on current batch
            incremental_model = lr_incremental.fit(batch_features)

            # Score the same batch, so this is training accuracy rather than
            # a held-out estimate.
            predictions = incremental_model.transform(batch_features)
            correct_predictions = predictions.filter(col("prediction") == col("true_label")).count()
            accuracy = correct_predictions / batch_size  # batch_size > 0 here

            print(f"Batch accuracy: {accuracy:.3f}")
            # fit() on a bare estimator returns a LogisticRegressionModel, which
            # exposes coefficients directly (it has no `.stages`).
            print(f"Model coefficients: {incremental_model.coefficients}")

            # In production, you would save the updated model here
            # incremental_model.write().overwrite().save(f"{model_dir}/online_model_batch_{batch_id}")

        except Exception as e:
            # Best effort: report and keep the stream alive.
            print(f"Error training on batch {batch_id}: {e}")
    finally:
        batch_df.unpersist()

# Start online learning: update_model is invoked once per micro-batch.
online_learning_query = (
    online_learning_stream.writeStream
    .foreachBatch(update_model)
    .trigger(processingTime='8 seconds')
    .start()
)

print("Online learning pipeline started!")
print("Model will be retrained on each new batch of data...")

# Let it run to see model updates
time.sleep(30)
online_learning_query.stop()

print("\n=== 4. Concept Drift Detection ===")

# Labelling rules for the drift simulation: the decision boundary on `feature`
# moves from 50 to 30 once time_period reaches 2.
original_pattern = when(col("feature") > 50, 1).otherwise(0)
drifted_pattern = when(col("feature") > 30, 1).otherwise(0)

# Create a simple concept drift detection example.
# The whole chain is wrapped in parentheses so that trailing comments cannot
# break line continuation (a comment after the last `)` of an expression ends
# the statement, making a following indented `.select(...)` a syntax error).
concept_drift_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 4)
    .load()
    .withColumn("feature", rand() * 100)
    .withColumn("time_period", (col("value") / 20).cast("int"))
    .withColumn(
        "drifted_label",
        # Simulate concept drift: pattern changes over time
        when(col("time_period") < 2, original_pattern).otherwise(drifted_pattern),
    )
    .select("feature", "drifted_label", "time_period", "timestamp")
)

def detect_concept_drift(batch_df, batch_id):
    """
    Simple concept drift detection based on label distribution changes.

    Intended as a ``foreachBatch`` sink. Computes the share of rows whose
    ``drifted_label`` equals 1 and flags drift when that share is more than
    0.2 away from the expected 0.5.

    Args:
        batch_df: Micro-batch DataFrame with ``drifted_label`` and
            ``time_period`` columns.
        batch_id: Micro-batch identifier; used only in log messages.

    Returns:
        None. Findings are printed. Empty batches are skipped after the
        header line.
    """
    # The notebook does `from pyspark.sql.functions import *`, which rebinds
    # the global names `sum` and `abs` to Spark column functions. Those do not
    # accept plain Python lists/floats, so the real built-ins are used here.
    import builtins

    print(f"\n--- Concept Drift Analysis - Batch {batch_id} ---")

    if batch_df.count() == 0:
        return

    # Calculate label distribution for current batch: {label: row count}
    current_dist = batch_df.groupBy("drifted_label").count().collect()
    label_counts = {row['drifted_label']: row['count'] for row in current_dist}

    total = builtins.sum(label_counts.values())
    if total == 0:
        return

    # A batch containing a single class is the most extreme shift possible, so
    # it is evaluated as well (ratio 0.0 or 1.0) rather than being skipped.
    positive_ratio = label_counts.get(1, 0) / total

    print(f"Positive label ratio: {positive_ratio:.3f}")

    # Simple drift detection: if ratio changes significantly from expected 0.5
    if builtins.abs(positive_ratio - 0.5) > 0.2:
        print("⚠️  CONCEPT DRIFT DETECTED!")
        print("   Consider retraining the model with recent data")
    else:
        print("✅ No significant concept drift detected")

    # Show time period distribution to visualize drift
    batch_df.groupBy("time_period", "drifted_label").count().orderBy("time_period").show()

# Drift monitoring: detect_concept_drift is called for each 10-second micro-batch.
drift_query = (
    concept_drift_stream.writeStream
    .foreachBatch(detect_concept_drift)
    .trigger(processingTime='10 seconds')
    .start()
)

print("Concept drift detection started!")
print("Monitoring for changes in data patterns...")

# Give the monitor a couple of triggers, then shut it down.
time.sleep(25)
drift_query.stop()

print("\nOnline learning and concept drift detection complete!")
print("Demonstrated incremental model updates and drift monitoring")

In [None]:
# Real-time Feature Engineering for ML
print("Setting up real-time feature engineering pipeline...")

print("=== 5. Streaming Feature Engineering ===")

# Synthetic e-commerce events derived from the "rate" source: 100 user ids,
# 50 product ids, and an action that cycles view/click/add_to_cart/purchase.
ecommerce_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 5)
    .load()
    .withColumn("user_id", (col("value") % 100).cast("string"))
    .withColumn("product_id", concat(lit("prod_"), ((col("value") % 50) + 1).cast("string")))
    .withColumn(
        "action",
        when(col("value") % 4 == 0, "view")
        .when(col("value") % 4 == 1, "click")
        .when(col("value") % 4 == 2, "add_to_cart")
        .otherwise("purchase"),
    )
    .withColumn("price", rand() * 1000 + 10)
    .withColumn("event_time", col("timestamp"))
    .select("user_id", "product_id", "action", "price", "event_time")
)

print("Created e-commerce event stream")

# Per-user features over 2-minute windows sliding every 30 seconds.
# NOTE: `sum`, `max`, `count`, ... below are the Spark column functions brought
# in by `from pyspark.sql.functions import *`, not the Python built-ins.
feature_engineering = (
    ecommerce_stream
    .withWatermark("event_time", "1 minute")
    .groupBy(
        col("user_id"),
        window(col("event_time"), "2 minutes", "30 seconds"),
    )
    .agg(
        # Behavioral features
        count("*").alias("total_events"),
        # Exact distinct aggregations are not supported on streaming
        # DataFrames, so the approximate variant is used instead.
        approx_count_distinct("product_id").alias("unique_products_viewed"),
        sum(when(col("action") == "purchase", 1).otherwise(0)).alias("purchase_count"),
        sum(when(col("action") == "view", 1).otherwise(0)).alias("view_count"),
        sum(when(col("action") == "add_to_cart", 1).otherwise(0)).alias("cart_additions"),

        # Financial features
        avg("price").alias("avg_price_viewed"),
        max("price").alias("max_price_viewed"),
        sum(when(col("action") == "purchase", col("price")).otherwise(0)).alias("total_spent"),

        # Derived features
        (sum(when(col("action") == "purchase", 1).otherwise(0)) / count("*")).alias("conversion_rate"),
    )
    .withColumn("window_start", col("window.start"))
    .withColumn("window_end", col("window.end"))
    .withColumn(
        "engagement_score",
        col("unique_products_viewed") * 2
        + col("purchase_count") * 10
        + col("cart_additions") * 3,
    )
    .withColumn("high_value_user", when(col("total_spent") > 500, 1).otherwise(0))
    .drop("window")
)

print("Defined real-time feature engineering pipeline")

# "update" mode prints windows as they change. In "append" mode a window is
# only emitted after the watermark passes its end, which cannot happen within
# this 30-second demo (2-minute windows, 1-minute watermark), so the console
# would stay empty.
feature_query = (
    feature_engineering.writeStream
    .outputMode("update")
    .format("console")
    .option("truncate", False)
    .trigger(processingTime='10 seconds')
    .start()
)

print("Real-time feature engineering started!")
print("Computing user behavior features in sliding windows...")

time.sleep(30)
feature_query.stop()

print("=== 6. Real-time Anomaly Detection ===")

# Independent synthetic stream of server metrics (10 server ids) built on the
# "rate" source; it does not consume the features computed above.
anomaly_detection_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 3)
    .load()
    .withColumn("cpu_usage", rand() * 100)
    .withColumn("memory_usage", rand() * 100)
    .withColumn("network_io", rand() * 1000)
    .withColumn("disk_io", rand() * 500)
    .withColumn("server_id", concat(lit("server_"), (col("value") % 10).cast("string")))
    .withColumn("metric_time", col("timestamp"))
    .select("server_id", "cpu_usage", "memory_usage", "network_io", "disk_io", "metric_time")
)

print("Created system metrics stream for anomaly detection")

# Rule-based anomaly detection over 1-minute tumbling windows per server.
# Each rule contributes 0 or 1 to anomaly_score; the score maps to alert_level.
anomaly_detection = (
    anomaly_detection_stream
    .withWatermark("metric_time", "2 minutes")
    .groupBy(
        col("server_id"),
        window(col("metric_time"), "1 minute"),
    )
    .agg(
        avg("cpu_usage").alias("avg_cpu"),
        max("cpu_usage").alias("max_cpu"),
        stddev("cpu_usage").alias("stddev_cpu"),
        avg("memory_usage").alias("avg_memory"),
        max("memory_usage").alias("max_memory"),
        avg("network_io").alias("avg_network"),
        max("network_io").alias("max_network"),
        count("*").alias("metric_count"),
    )
    .withColumn(
        "cpu_anomaly",
        when((col("max_cpu") > 90) | (col("stddev_cpu") > 25), 1).otherwise(0),
    )
    .withColumn("memory_anomaly", when(col("max_memory") > 85, 1).otherwise(0))
    .withColumn("network_anomaly", when(col("max_network") > 800, 1).otherwise(0))
    .withColumn(
        "anomaly_score",
        col("cpu_anomaly") + col("memory_anomaly") + col("network_anomaly"),
    )
    .withColumn(
        "alert_level",
        when(col("anomaly_score") >= 3, "CRITICAL")
        .when(col("anomaly_score") >= 2, "HIGH")
        .when(col("anomaly_score") >= 1, "MEDIUM")
        .otherwise("LOW"),
    )
    # Only show anomalies
    .filter(col("anomaly_score") > 0)
)

# "update" mode is used so alerts appear while the demo runs. With "append",
# a window is emitted only once the 2-minute watermark has passed its end,
# which never happens in the 35 seconds this query is alive.
anomaly_query = (
    anomaly_detection.writeStream
    .outputMode("update")
    .format("console")
    .option("truncate", False)
    .trigger(processingTime='15 seconds')
    .start()
)

print("Real-time anomaly detection started!")
print("Monitoring system metrics for anomalous behavior...")

time.sleep(35)
anomaly_query.stop()

print("\nReal-time feature engineering and anomaly detection complete!")
print("Demonstrated streaming feature computation and ML-based monitoring")

In [None]:
# Advanced ML Streaming Patterns
print("Setting up advanced ML streaming patterns...")

print("=== 7. Multi-Model Ensemble Predictions ===")

# Category assignment cycles with the rate source's counter:
# value % 3 == 0 -> electronics, 1 -> books, 2 -> clothing.
category_from_value = (
    when(col("value") % 3 == 0, "electronics")
    .when(col("value") % 3 == 1, "books")
    .otherwise("clothing")
)

# Synthetic user/item interactions (50 user ids, 100 item ids) for the
# recommendation demo, generated from the "rate" source.
recommendation_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 4)
    .load()
    .withColumn("user_id", (col("value") % 50).cast("string"))
    .withColumn("item_id", (col("value") % 100).cast("string"))
    .withColumn("rating", rand() * 5)
    .withColumn("user_age", (rand() * 50 + 18).cast("int"))
    .withColumn("item_category", category_from_value)
    .withColumn("interaction_time", col("timestamp"))
    .select("user_id", "item_id", "rating", "user_age", "item_category", "interaction_time")
)

print("Created recommendation streaming data")

# Simulate ensemble predictions with multiple models
def ensemble_predictions(batch_df, batch_id):
    """
    Score one micro-batch with three toy recommenders and blend the results.

    Used as a ``foreachBatch`` sink. Adds ``content_score``, ``collab_score``
    and ``popularity_score`` columns, combines them into ``ensemble_score``
    with weights 0.4 / 0.4 / 0.2, and labels rows scoring above 0.6 as
    "RECOMMEND". Results are printed; nothing is returned.
    """
    print(f"\n--- Ensemble Prediction - Batch {batch_id} ---")

    if not batch_df.count():
        return

    # Model 1: content-based (simplified) -- score depends on category and age.
    content_expr = (
        when(col("item_category") == "electronics", col("user_age") / 100)
        .when(col("item_category") == "books", (80 - col("user_age")) / 100)
        .otherwise(col("user_age") / 150)
    )

    # Model 2: collaborative filtering (simplified) -- simulated CF score.
    collab_expr = rand() * 0.8 + 0.1

    # Model 3: popularity-based (simplified) -- item ids below 20 count as popular.
    popularity_expr = (
        when(col("item_id").cast("int") < 20, 0.9)
        .otherwise(rand() * 0.5)
    )

    # Weighted average of the three scores.
    blended_expr = (
        col("content_score") * 0.4
        + col("collab_score") * 0.4
        + col("popularity_score") * 0.2
    )

    ensemble = (
        batch_df
        .withColumn("content_score", content_expr)
        .withColumn("collab_score", collab_expr)
        .withColumn("popularity_score", popularity_expr)
        .withColumn("ensemble_score", blended_expr)
        .withColumn(
            "recommendation",
            when(col("ensemble_score") > 0.6, "RECOMMEND").otherwise("NO_RECOMMEND"),
        )
    )

    # Summary of the batch, then a handful of example rows.
    print(f"Batch size: {batch_df.count()} interactions")
    ensemble.groupBy("recommendation").count().show()
    ensemble.select("user_id", "item_id", "ensemble_score", "recommendation").show(5)

# Run the ensemble scorer on every 12-second micro-batch.
ensemble_query = (
    recommendation_stream.writeStream
    .foreachBatch(ensemble_predictions)
    .trigger(processingTime='12 seconds')
    .start()
)

print("Multi-model ensemble predictions started!")
print("Combining content-based, collaborative, and popularity models...")

# Let a few batches through, then stop the query.
time.sleep(30)
ensemble_query.stop()

print("=== 8. Time Series Forecasting with Streaming ===")

# Synthetic time series (5 metric ids): a sinusoidal pattern over the hour of
# day plus uniform noise in [-5, 5).
# The chain is parenthesised so that the trailing "Daily pattern" comment no
# longer swallows the line continuation (previously a syntax error).
timeseries_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 2)
    .load()
    .withColumn("metric_id", concat(lit("metric_"), (col("value") % 5).cast("string")))
    .withColumn("hour", hour(col("timestamp")))
    .withColumn("minute", minute(col("timestamp")))
    .withColumn("base_value", 100 + sin(col("hour") * 0.26) * 20)  # Daily pattern
    .withColumn("noise", (rand() - 0.5) * 10)
    .withColumn("observed_value", col("base_value") + col("noise"))
    .withColumn("forecast_time", col("timestamp"))
    .select("metric_id", "observed_value", "hour", "minute", "forecast_time")
)

print("Created time series data stream with seasonal patterns")

# Naive forecast: the next value is predicted as the moving average of a
# 3-minute window (sliding every minute); the band is mean +/- 2 std devs.
forecasting = (
    timeseries_stream
    .withWatermark("forecast_time", "5 minutes")
    .groupBy(
        col("metric_id"),
        window(col("forecast_time"), "3 minutes", "1 minute"),
    )
    .agg(
        avg("observed_value").alias("moving_avg"),
        stddev("observed_value").alias("moving_stddev"),
        count("*").alias("sample_count"),
        min("observed_value").alias("min_value"),
        max("observed_value").alias("max_value"),
    )
    .withColumn("forecast_next", col("moving_avg"))
    .withColumn("confidence_interval_lower", col("moving_avg") - 2 * col("moving_stddev"))
    .withColumn("confidence_interval_upper", col("moving_avg") + 2 * col("moving_stddev"))
    .withColumn(
        "trend",
        when(col("moving_avg") > 110, "INCREASING")
        .when(col("moving_avg") < 90, "DECREASING")
        .otherwise("STABLE"),
    )
    .select("metric_id", "forecast_next", "confidence_interval_lower", "confidence_interval_upper", "trend", "window")
)

# "update" mode shows each window as it is refreshed. In "append" mode nothing
# is emitted until the 5-minute watermark passes a window's end, which is far
# longer than the 35 seconds this demo runs.
forecast_query = (
    forecasting.writeStream
    .outputMode("update")
    .format("console")
    .option("truncate", False)
    .trigger(processingTime='15 seconds')
    .start()
)

print("Time series forecasting started!")
print("Generating predictions with confidence intervals...")

time.sleep(35)
forecast_query.stop()

print("\nAdvanced ML streaming patterns complete!")
print("Demonstrated ensemble models and time series forecasting")

In [None]:
# Module 8 Summary and Production Best Practices
print("=== Module 8: ML + Streaming Integration - Complete! ===")

# Stop whatever streaming queries are still registered with the session.
print("\nCleaning up remaining streaming queries...")
for stream in spark.streams.active:
    print(f"Stopping: {stream.name or 'Unnamed Query'}")
    stream.stop()

print("All streaming queries stopped")

# The recap is data-driven: each banner owns a list of (heading, lines)
# sections, and a single loop below renders all of them.
summary_outline = [
    ("MODULE 8 ML + STREAMING ACCOMPLISHMENTS", [
        ("✅ REAL-TIME ML INFERENCE", [
            "• Pre-trained model deployment for streaming predictions",
            "• Fraud detection with real-time transaction scoring",
            "• Model loading and application to streaming data",
        ]),
        ("✅ ONLINE LEARNING & MODEL UPDATES", [
            "• Incremental model training with streaming batches",
            "• Concept drift detection and monitoring",
            "• Adaptive learning pipelines for evolving data",
        ]),
        ("✅ REAL-TIME FEATURE ENGINEERING", [
            "• Streaming feature computation with windowing",
            "• Behavioral analytics and user engagement scoring",
            "• Feature stores integration patterns",
        ]),
        ("✅ ADVANCED ML STREAMING PATTERNS", [
            "• Multi-model ensemble predictions",
            "• Real-time anomaly detection with ML",
            "• Time series forecasting with streaming data",
            "• Recommendation systems with live updates",
        ]),
    ]),
    ("PRODUCTION ML STREAMING BEST PRACTICES", [
        ("🏗️ Model Deployment Strategies:", [
            "• Model versioning and A/B testing frameworks",
            "• Hot model swapping without downtime",
            "• Performance monitoring and latency optimization",
            "• Graceful fallback for model failures",
        ]),
        ("📊 Feature Engineering:", [
            "• Real-time feature computation and caching",
            "• Feature drift monitoring and validation",
            "• Cross-batch feature consistency",
            "• Feature store integration for serving",
        ]),
        ("🔍 Model Monitoring:", [
            "• Prediction accuracy tracking over time",
            "• Input data quality monitoring",
            "• Model performance degradation alerts",
            "• Business metric correlation analysis",
        ]),
        ("⚙️ Operational Excellence:", [
            "• Checkpoint management for model state",
            "• Error handling and recovery strategies",
            "• Resource scaling for ML workloads",
            "• Integration with MLOps pipelines",
        ]),
    ]),
    ("COMPREHENSIVE PYSPARK TUTORIAL STATUS", [
        ("🎓 Complete Module Mastery:", [
            "✅ Module 1: PySpark Fundamentals & DataFrames",
            "✅ Module 2: Advanced Data Operations & SQL",
            "✅ Module 3: Data Cleaning & Transformation",
            "✅ Module 4: Advanced Analytics & Window Functions",
            "✅ Module 5: Performance Optimization & Tuning",
            "✅ Module 6: Machine Learning with MLlib",
            "✅ Module 7: Structured Streaming Fundamentals",
            "✅ Module 7B: Advanced Streaming Patterns",
            "✅ Module 8: ML + Streaming Integration",
        ]),
        ("🚀 Production-Ready Skills Achieved:", [
            "• End-to-end data processing pipelines",
            "• Real-time streaming analytics systems",
            "• Machine learning model deployment",
            "• Performance optimization techniques",
            "• Production monitoring and maintenance",
        ]),
        ("🎯 Ready for Enterprise Applications:", [
            "• Real-time fraud detection systems",
            "• Recommendation engines with live updates",
            "• IoT analytics and anomaly detection",
            "• Financial trading and risk systems",
            "• Supply chain optimization platforms",
        ]),
    ]),
]

banner_rule = "=" * 70
for banner_title, banner_sections in summary_outline:
    print("\n" + banner_rule)
    print(banner_title)
    print(banner_rule)
    for section_heading, section_lines in banner_sections:
        print("\n" + section_heading)
        for section_line in section_lines:
            # Every entry is indented by three spaces under its heading.
            print("   " + section_line)

# Clean up temporary directories
import shutil

# Each directory is removed independently. A directory that is already gone is
# fine; any other failure is reported instead of being silently swallowed, and
# the success message is printed only when nothing failed.
cleanup_failures = []
for artifact_dir in (
    "/tmp/ml_streaming_models",
    "/tmp/ml_streaming_output",
    "/tmp/ml-streaming-checkpoints",
):
    try:
        shutil.rmtree(artifact_dir)
    except FileNotFoundError:
        pass  # nothing to clean up
    except OSError as cleanup_error:
        cleanup_failures.append(artifact_dir)
        print(f"\nCould not remove {artifact_dir}: {cleanup_error}")

if not cleanup_failures:
    print("\n🧹 ML streaming artifacts cleaned up")

print("\n" + "="*70)
print("🎯 COMPREHENSIVE PYSPARK TUTORIAL COMPLETE!")
print("ENTERPRISE-READY BIG DATA & ML SKILLS ACHIEVED!")
print("="*70)

print("\n🌟 Congratulations! You have mastered:")
print("   📊 Big Data Processing with PySpark")
print("   🔄 Real-time Streaming Analytics")
print("   🤖 Machine Learning Pipeline Development")
print("   ⚡ Performance Optimization Techniques")
print("   🚀 Production Deployment Strategies")

print("\nReady to tackle any big data challenge in production! 🎉")

# Module 8: Machine Learning with Streaming Integration
*Real-time ML Inference, Online Learning, and Production ML Pipelines*

## Learning Objectives
Master the integration of machine learning with streaming data:

**Real-time ML Inference**
- Loading pre-trained models for streaming predictions
- Batch vs streaming inference patterns
- Model deployment strategies for real-time systems
- Performance optimization for low-latency inference

**Online Learning & Model Updates**
- Incremental learning with streaming data
- Model retraining strategies and triggers
- Feature engineering for streaming ML
- Handling concept drift and model decay

**Production ML Pipelines**
- End-to-end ML pipelines with streaming data
- Feature stores and real-time feature serving
- Model monitoring and performance tracking
- A/B testing for ML models in production

**Advanced ML Streaming Patterns**
- Multi-model ensemble predictions
- Real-time anomaly detection with ML
- Recommendation systems with streaming updates
- Time series forecasting with streaming data

---

## Module Structure
1. **ML Environment Setup** - MLlib streaming configuration
2. **Real-time Model Inference** - Streaming predictions with pre-trained models
3. **Online Learning Pipeline** - Incremental model updates
4. **Feature Engineering Streams** - Real-time feature computation
5. **ML Model Monitoring** - Performance tracking and drift detection
6. **Production ML Pipeline** - End-to-end streaming ML system
7. **Advanced ML Patterns** - Ensemble models and anomaly detection
8. **Summary & Best Practices** - Production deployment guidelines