<a href="https://colab.research.google.com/github/ShovalBenjer/Bigdata_Pyspark_Spark_Hadoop_Apache/blob/Final_Project/CFPB_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The setup cell below includes:

All necessary imports for PySpark SQL, ML features, classification models, pipelines, evaluation metrics, and tuning components.
Configuration variables with the requested naming conventions:

File paths for CSV input and all model/pipeline storage locations
Kafka broker and topic configurations
ML pipeline settings
Database connection parameters


SparkSession initialization with performance optimizations:

Arrow enabled for improved Python-JVM data transfer
Memory configurations for driver and executors
Parallelism settings
Kafka and JDBC connector configurations
Checkpointing for streaming applications


Optional imports for deep learning models (commented out but available if needed for Task 5)

In [None]:
#!/usr/bin/env python
# Consumer Complaints PySpark ML Pipeline

# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.functions import vector_to_array

# Advanced ML imports (if using deep learning models)
# from pyspark.ml.deepspeed import DeepspeedBertClassifier, DeepspeedBertEmbeddings
# import torch
# from transformers import BertTokenizer, BertModel

# Configuration variables
# `os` is imported here because it is only needed for the credential lookups
# in DATABASE_CONNECTION_OPTIONS below.
import os

# File and storage paths (placeholders - point these at real locations before running)
CSV_FILE_PATH = "/path/to/Consumer_Complaints.csv"
TEST_DATA_PERSISTENCE_PATH = "/path/to/test_data_source.parquet"
TRAINING_PIPELINE_SAVE_PATH = "/path/to/training_pipeline"
BEST_MODEL_SAVE_PATH = "/path/to/best_model"
EMBEDDING_MODEL_SAVE_PATH = "/path/to/embedding_model"  # If using Task 5
STREAMING_CHECKPOINT_LOCATION = "/path/to/streaming_checkpoints"

# Kafka configuration
KAFKA_BROKERS = "kafka1:9092,kafka2:9092"  # comma-separated bootstrap servers
KAFKA_TOPIC_RAW = "complaints-raw"
KAFKA_TOPIC_TRAINING = "complaints-training-data"
KAFKA_TOPIC_TESTING_STREAM = "complaints-testing-stream"
KAFKA_TOPIC_PREDICTIONS = "complaint-predictions"

# ML pipeline configuration
NUM_FOLDS = 5

# Database sink configuration
DATABASE_SINK_FORMAT = "jdbc"
DATABASE_CONNECTION_OPTIONS = {
    "url": "jdbc:postgresql://dbhost:5432/complaints_db",
    "dbtable": "complaint_predictions",
    # Credentials are taken from the environment so real secrets never have to be
    # committed with the notebook; the original placeholder values remain the
    # fallback, so behavior is unchanged when the variables are not set.
    "user": os.environ.get("COMPLAINTS_DB_USER", "username"),
    "password": os.environ.get("COMPLAINTS_DB_PASSWORD", "password"),
    "driver": "org.postgresql.Driver",
}

# Initialize Spark Session with appropriate configurations
def get_spark_session():
    """
    Build (or reuse) the SparkSession used by the complaints pipeline.

    Every setting is applied on the builder. Core Spark properties such as
    executor sizing, dynamic allocation and the external shuffle service are
    fixed when the application starts; Spark 3.x rejects attempts to change
    them afterwards through ``spark.conf.set`` ("Cannot modify the value of a
    Spark config"), so they cannot be switched on once the session exists.

    If a session is already running, ``getOrCreate()`` returns it and these
    startup-only settings do not take effect on it.

    Returns:
        SparkSession: the active session.
    """
    return (SparkSession.builder
            .appName("Consumer Complaints ML Pipeline")
            .config("spark.sql.execution.arrow.pyspark.enabled", "true")
            .config("spark.executor.memory", "8g")
            .config("spark.driver.memory", "4g")
            .config("spark.executor.cores", "4")
            .config("spark.default.parallelism", "100")
            .config("spark.sql.shuffle.partitions", "100")
            # NOTE(review): this is a DStream (spark-streaming-kafka) setting; Structured
            # Streaming Kafka sources are rate-limited with the `maxOffsetsPerTrigger`
            # source option instead - confirm which API the streaming job uses.
            .config("spark.streaming.kafka.maxRatePerPartition", "10000")
            # Checkpointing for streaming
            .config("spark.sql.streaming.checkpointLocation", STREAMING_CHECKPOINT_LOCATION)
            # Dynamic allocation must be requested at startup. The external shuffle
            # service has to be running on the cluster for this combination to work.
            .config("spark.dynamicAllocation.enabled", "true")
            .config("spark.shuffle.service.enabled", "true")
            # For Delta Lake if using
            # .config("spark.jars.packages", "io.delta:delta-core_2.12:1.2.1")
            # For Kafka streaming if using
            .config("spark.jars.packages",
                   "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1,org.postgresql:postgresql:42.5.1")
            .getOrCreate())

# Create Spark session
spark = get_spark_session()

# Log the session configuration
print("Spark Configuration:")
print(f"Spark Version: {spark.version}")
print(f"Application ID: {spark.sparkContext.applicationId}")
# 'spark.executor.instances' is only present when it was set explicitly
print(f"Number of Executors: {spark.sparkContext.getConf().get('spark.executor.instances', 'Not set')}")

print("Setup complete. Ready to process consumer complaints data.")

Task 1: Schema Definition & Initial Kafka Load

Schema Definition:

Created a complete 18-column StructType schema for the Consumer Complaints dataset
All columns are defined as StringType() as requested, including the important "Consumer complaint narrative" and "Consumer consent provided?" fields
The schema properly handles dates as strings per the requirements


CSV Loading:

Read the CSV file using the defined schema with header=True
Implemented basic data inspection with sample display and record count


Kafka Writing:

Converted the DataFrame to Kafka-compatible format with "Complaint ID" as the key
Used to_json(struct(*)) to convert all columns to a JSON string as the value
Used the configured Kafka brokers and topic name



Task 2: Preprocessing & Filtering

Reading from Kafka:

Read data back from the Kafka raw topic using the earliest offset
Parsed the JSON value column using the full schema


Filtering Pipeline:

Date Parsing: Parsed the "Date received" column with the MM/dd/yyyy format into a new parsed_date_received column; values that cannot be parsed become null rather than failing the job
Narrative Filter: Removed records with null or empty complaint narratives
Date Filter: Kept only records with valid dates on or before 2017-03-31
Response Filter: Excluded records where "Company response to consumer" is "In progress"


Monitoring and Reporting:

Added record counts at each filtering stage for monitoring purposes
Displayed sample data after the complete filtering process

In [None]:
#!/usr/bin/env python
# Consumer Complaints Schema Definition, Kafka Load & Preprocessing

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Import configuration from the setup file
# In practice, you might want to import these from a common module
# NOTE: these values repeat the constants of the setup cell so this cell can
# also run on its own; keep both copies in sync when changing them.
CSV_FILE_PATH = "/path/to/Consumer_Complaints.csv"  # placeholder location of the input CSV
KAFKA_BROKERS = "kafka1:9092,kafka2:9092"  # comma-separated Kafka bootstrap servers
KAFKA_TOPIC_RAW = "complaints-raw"  # written by Task 1, read back by Task 2
KAFKA_TOPIC_TRAINING = "complaints-training-data"  # not referenced by the functions in this cell

# Initialize Spark Session (in case this is run as a standalone script)
# getOrCreate() hands back the already-active session when one exists.
spark = SparkSession.builder.appName("Consumer Complaints Schema & Kafka Load").getOrCreate()

# Task 1: Define Schema & Initial Load to Kafka
def define_schema_and_load_to_kafka():
    """
    Load the complaints CSV with an explicit all-string schema and publish
    every row to the raw Kafka topic.

    Each Kafka record uses the row's "Complaint ID" as key and the JSON
    encoding of all 18 columns as value. Reads the module-level ``spark``,
    ``CSV_FILE_PATH``, ``KAFKA_BROKERS`` and ``KAFKA_TOPIC_RAW``.

    Returns:
        DataFrame: the rows read from the CSV (18 string columns).
    """
    # The 18 columns in file order; everything (dates included) stays a string
    column_names = [
        "Date received", "Product", "Sub-product", "Issue", "Sub-issue",
        "Consumer complaint narrative", "Company public response", "Company",
        "State", "ZIP code", "Tags", "Consumer consent provided?",
        "Submitted via", "Date sent to company", "Company response to consumer",
        "Timely response?", "Consumer disputed?", "Complaint ID",
    ]
    full_schema = StructType(
        [StructField(name, StringType(), True) for name in column_names]
    )

    # Read the CSV file using the defined schema
    print(f"Reading CSV from: {CSV_FILE_PATH}")
    raw_df = spark.read.format("csv") \
                       .option("header", "true") \
                       .schema(full_schema) \
                       .load(CSV_FILE_PATH)

    # Show sample data and schema
    print("Sample data from CSV:")
    raw_df.select("Complaint ID", "Date received", "Product", "Consumer complaint narrative").show(5, truncate=True)
    print("DataFrame Schema:")
    raw_df.printSchema()

    # Count records
    count = raw_df.count()
    print(f"Total records loaded: {count}")

    # Write to Kafka topic
    print(f"Writing data to Kafka topic: {KAFKA_TOPIC_RAW}")
    # The column name contains a space, so it must be backtick-quoted inside a
    # SQL expression; unquoted, `Complaint ID AS key` does not parse.
    kafka_df = raw_df.selectExpr(
        "`Complaint ID` AS key",
        "to_json(struct(*)) AS value"
    )

    # Write to Kafka
    kafka_df.write \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
        .option("topic", KAFKA_TOPIC_RAW) \
        .save()

    print(f"Successfully wrote {count} records to Kafka topic: {KAFKA_TOPIC_RAW}")
    return raw_df


# Task 2: Preprocessing & Filtering
def preprocess_and_filter_from_kafka():
    """
    Read the raw complaints back from Kafka and apply the Task 2 filters.

    Filters, in order:
      1. keep rows whose narrative is non-null and not blank,
      2. keep rows whose "Date received" parses (MM/dd/yyyy) to a date on or
         before 2017-03-31,
      3. drop rows whose "Company response to consumer" is "In progress".

    Relies on module-level names: ``spark``, ``KAFKA_BROKERS``,
    ``KAFKA_TOPIC_RAW`` and ``full_schema``. The schema is not defined by this
    function; it must exist as a global before the call (in this cell it is
    assigned in the ``__main__`` block).

    Returns:
        DataFrame: the filtered rows, i.e. the 18 original columns plus
        ``parsed_date_received``.
    """
    # Read from Kafka topic
    print(f"Reading data from Kafka topic: {KAFKA_TOPIC_RAW}")
    kafka_raw_df = spark.read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
        .option("subscribe", KAFKA_TOPIC_RAW) \
        .option("startingOffsets", "earliest") \
        .load()

    # Parse JSON value
    parsed_df = kafka_raw_df.select(
        F.col("key").cast("string").alias("key"),
        F.from_json(F.col("value").cast("string"), full_schema).alias("data")
    ).select("data.*")

    # Apply filters sequentially:

    # 1. Parse date received
    DATE_FORMAT = "MM/dd/yyyy"
    with_parsed_date_df = parsed_df.withColumn(
        "parsed_date_received",
        F.to_date(F.col("Date received"), DATE_FORMAT)
    )

    # 2. Filter for non-null and non-empty narratives
    narrative = F.col("Consumer complaint narrative")
    narrative_filtered_df = with_parsed_date_df.filter(
        narrative.isNotNull() & (F.length(F.trim(narrative)) > 0)
    )

    # 3. Filter for valid dates before or on 2017-03-31
    # Compare date with date: the cutoff is cast explicitly instead of relying
    # on an implicit string-to-date coercion.
    cutoff_date = F.lit("2017-03-31").cast("date")
    date_filtered_df = narrative_filtered_df.filter(
        F.col("parsed_date_received").isNotNull() &
        (F.col("parsed_date_received") <= cutoff_date)
    )

    # 4. Filter out "In progress" company responses
    # `col != "x"` evaluates to NULL for a NULL response, which filter() treats
    # as false; the explicit isNull() keeps those rows so that only genuine
    # "In progress" records are excluded.
    response = F.col("Company response to consumer")
    filtered_df = date_filtered_df.filter(
        response.isNull() | (response != "In progress")
    )

    # Show sample data after filtering
    print("Sample data after filtering:")
    filtered_df.select("Complaint ID", "parsed_date_received", "Product", "Consumer complaint narrative").show(5, truncate=True)

    # Count records at each filtering stage for monitoring
    # (each count is a separate Spark job over the Kafka data)
    initial_count = parsed_df.count()
    narrative_count = narrative_filtered_df.count()
    date_count = date_filtered_df.count()
    final_count = filtered_df.count()

    print(f"Initial record count: {initial_count}")
    print(f"After narrative filter: {narrative_count}")
    print(f"After date filter: {date_count}")
    print(f"Final record count: {final_count}")

    return filtered_df


# Execute the tasks
if __name__ == "__main__":
    # Task 1: load the CSV and publish it to the raw Kafka topic
    raw_df = define_schema_and_load_to_kafka()

    # Task 2 looks up `full_schema` as a module-level name, so the 18-column
    # all-string schema is rebuilt here before the preprocessing call.
    # (In a real implementation, this would be imported from a common module)
    full_schema = StructType([
        StructField(column_name, StringType(), True)
        for column_name in (
            "Date received",
            "Product",
            "Sub-product",
            "Issue",
            "Sub-issue",
            "Consumer complaint narrative",
            "Company public response",
            "Company",
            "State",
            "ZIP code",
            "Tags",
            "Consumer consent provided?",
            "Submitted via",
            "Date sent to company",
            "Company response to consumer",
            "Timely response?",
            "Consumer disputed?",
            "Complaint ID",
        )
    ])

    # Task 2: read the raw topic back and filter it
    filtered_df = preprocess_and_filter_from_kafka()

    print("Tasks 1-2 completed successfully")

1. Data Splitting (80/20)

Performed a random split on the filtered DataFrame using a fixed seed value (42) for reproducibility
Created training_base_df (80%) and test_base_df (20%)
Added logging to verify the split proportions

2. Training Data Preparation

Created the binary target column is_target_complaint using these rules:

A complaint is labeled as a target (1) if ALL of these conditions are met:

"Consumer disputed?" is "Yes"
"Timely response?" is "No"
"Company response to consumer" is one of: "Closed with explanation", "Closed", "Closed with monetary relief", or "Closed with non-monetary relief"


Otherwise, the complaint is labeled as not a target (0)


Added analytics on the target distribution to help with potential class imbalance:

Calculated class distribution counts
Computed class weights that could be used for model training


Selected all 18 original columns plus the new is_target_complaint column
Wrote the training data to the Kafka topic KAFKA_TOPIC_TRAINING:

Used "Complaint ID" as the key
Converted all columns to JSON for the value



3. Test Data Preparation

Selected all 18 original columns from test_base_df
Wrote this data to TEST_DATA_PERSISTENCE_PATH using Parquet format with overwrite mode
This stored test data will be used by the external simulation script

In [None]:
#!/usr/bin/env python
# Data Split, Target Labeling & Kafka Preparation

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Import configuration from the setup file
# In practice, you would import these from a common module
# NOTE: these values repeat the constants of the setup cell; keep them in sync.
KAFKA_BROKERS = "kafka1:9092,kafka2:9092"  # comma-separated Kafka bootstrap servers
KAFKA_TOPIC_TRAINING = "complaints-training-data"  # destination of the labeled training rows
TEST_DATA_PERSISTENCE_PATH = "/path/to/test_data_source.parquet"  # placeholder Parquet output for the test split

# Set a seed value for reproducibility
# (passed to randomSplit in split_label_and_prepare_data)
SEED_VALUE = 42

# Initialize Spark Session (in case this is run as a standalone script)
# getOrCreate() hands back the already-active session when one exists.
spark = SparkSession.builder.appName("Consumer Complaints Data Split & Labeling").getOrCreate()

def split_label_and_prepare_data(filtered_df):
    """
    Split the filtered complaints 80/20, label the training part and persist
    both parts.

    The training split gets a binary ``is_target_complaint`` column (1 when the
    consumer disputed, the response was not timely and the company response is
    one of the four "Closed..." values; 0 otherwise) and is written to the
    ``KAFKA_TOPIC_TRAINING`` topic keyed by "Complaint ID". The test split is
    written, unlabeled, as Parquet to ``TEST_DATA_PERSISTENCE_PATH``
    (overwrite mode).

    Args:
        filtered_df: DataFrame containing at least the 18 original complaint
            columns.

    Returns:
        tuple: ``(training_final_df, test_final_df)`` - the 18 original columns
        plus ``is_target_complaint``, and the 18 original columns respectively.
    """
    print("Starting data split, target labeling, and Kafka preparation...")

    # The 18 source columns, in schema order (shared by both outputs)
    original_columns = [
        "Date received", "Product", "Sub-product", "Issue", "Sub-issue",
        "Consumer complaint narrative", "Company public response", "Company",
        "State", "ZIP code", "Tags", "Consumer consent provided?",
        "Submitted via", "Date sent to company", "Company response to consumer",
        "Timely response?", "Consumer disputed?", "Complaint ID",
    ]
    # Company responses that qualify a complaint for the target group
    closed_responses = [
        "Closed with explanation",
        "Closed",
        "Closed with monetary relief",
        "Closed with non-monetary relief",
    ]

    # Task 3.1: Perform 80/20 random split
    print(f"Performing 80/20 random split with seed {SEED_VALUE}...")
    training_base_df, test_base_df = filtered_df.randomSplit([0.8, 0.2], seed=SEED_VALUE)

    # Log the split results
    training_count = training_base_df.count()
    test_count = test_base_df.count()
    total_records = training_count + test_count
    if total_records:
        print(f"Training set size: {training_count} records ({training_count / total_records:.2%})")
        print(f"Test set size: {test_count} records ({test_count / total_records:.2%})")
    else:
        # Empty input: report the sizes without dividing by zero
        print(f"Training set size: {training_count} records (n/a)")
        print(f"Test set size: {test_count} records (n/a)")

    # Task 3.2: Prepare Training Data with target labeling
    print("Labeling target complaints in training data...")

    # A complaint is a target only if ALL three conditions hold; anything else
    # (including rows where a condition evaluates to NULL) is labeled 0.
    is_target = (
        (F.col("Consumer disputed?") == "Yes") &
        (F.col("Timely response?") == "No") &
        F.col("Company response to consumer").isin(closed_responses)
    )
    training_labeled_df = training_base_df.withColumn(
        "is_target_complaint",
        F.when(is_target, 1).otherwise(0)
    )

    # Show target distribution
    target_distribution = training_labeled_df.groupBy("is_target_complaint").count()
    print("Target distribution in training data:")
    target_distribution.show()

    # Calculate class weights for potential use in modeling
    # Adding the label column does not change the row count, so the training
    # count from above is reused instead of running another count job.
    class_weights = target_distribution.withColumn(
        "weight",
        F.round(F.lit(training_count) / F.col("count"), 2)
    )
    print("Class weights for potential use in modeling:")
    class_weights.show()

    # Select all original columns plus the target column
    training_final_df = training_labeled_df.select(
        original_columns + ["is_target_complaint"]
    )

    # Write training data to Kafka
    print(f"Writing training data to Kafka topic: {KAFKA_TOPIC_TRAINING}")
    # The key column name contains a space, so it must be backtick-quoted
    # inside a SQL expression; unquoted it does not parse.
    training_kafka_df = training_final_df.selectExpr(
        "`Complaint ID` AS key",
        "to_json(struct(*)) AS value"
    )

    training_kafka_df.write \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
        .option("topic", KAFKA_TOPIC_TRAINING) \
        .save()

    print(f"Successfully wrote {training_count} training records to Kafka topic: {KAFKA_TOPIC_TRAINING}")

    # Task 3.3: Prepare Test Data Source
    print("Preparing test data for persistence...")

    # Select all original columns from test data (no label column)
    test_final_df = test_base_df.select(original_columns)

    # Write test data to persistence path
    print(f"Writing test data to persistence path: {TEST_DATA_PERSISTENCE_PATH}")
    test_final_df.write \
        .format("parquet") \
        .mode("overwrite") \
        .save(TEST_DATA_PERSISTENCE_PATH)

    print(f"Successfully wrote {test_count} test records to: {TEST_DATA_PERSISTENCE_PATH}")

    # Return the DataFrames for potential further processing
    return training_final_df, test_final_df

# Example usage
if __name__ == "__main__":
    # For demonstration, assume filtered_df exists from previous tasks
    # In a real implementation, you would chain these functions together
    # or use a workflow orchestration tool
    # NOTE: both calls below are left commented out, so running this cell only
    # prints the completion message; no split, Kafka write or Parquet write happens.

    # filtered_df = preprocess_and_filter_from_kafka()
    # training_df, test_df = split_label_and_prepare_data(filtered_df)

    print("Task 3 implementation complete")

In [None]:
#!/usr/bin/env python
# Training Feature Engineering Pipeline Definition

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer,
    OneHotEncoder, VectorAssembler, Imputer, StandardScaler, MinMaxScaler,
    RegexTokenizer, CountVectorizer
)
from pyspark.ml.functions import vector_to_array

# Import configuration from the setup file
# NOTE: repeats the setup cell's value; placeholder directory the unfitted
# pipeline definition is saved to by this cell's __main__ block.
TRAINING_PIPELINE_SAVE_PATH = "/path/to/training_pipeline"

# Initialize Spark Session (in case this is run as a standalone script)
# getOrCreate() hands back the already-active session when one exists.
spark = SparkSession.builder.appName("Consumer Complaints Feature Engineering").getOrCreate()

def create_training_feature_pipeline():
    """
    Define the (unfitted) feature engineering pipeline for the complaints data.

    Stages, in order:
      * text: RegexTokenizer -> StopWordsRemover -> HashingTF -> IDF on
        "Consumer complaint narrative", producing ``narrative_features``;
      * categorical: StringIndexer + OneHotEncoder per categorical column,
        producing ``<column>_encoded``;
      * numeric: the first run of digits of "ZIP code" as ``zip_numeric``,
        median-imputed, assembled into a vector and min-max scaled into
        ``numeric_features_scaled``;
      * a final VectorAssembler producing ``features``.

    Every stage is a regular Spark ML Transformer/Estimator so the pipeline
    can be fitted and saved; plain Python callables are not valid pipeline
    stages.

    Returns:
        Pipeline: the unfitted pipeline.
    """
    # Local import: SQLTransformer is not part of this cell's import list.
    from pyspark.ml.feature import SQLTransformer

    print("Creating training feature engineering pipeline...")

    # Define categorical columns to be encoded
    # NOTE(review): "Company response to consumer", "Timely response?" and
    # "Consumer disputed?" are the columns the Task 3 labeling rule is computed
    # from - confirm that exposing them to the model is intended.
    categorical_columns = [
        "Product", "Sub-product", "Issue", "Sub-issue", "Company",
        "State", "Tags", "Consumer consent provided?", "Submitted via",
        "Company response to consumer", "Timely response?", "Consumer disputed?"
    ]

    # List to store pipeline stages
    stages = []

    # ---------- Text Feature Engineering ----------
    print("Adding text feature engineering stages...")

    # Split on non-word characters and lowercase the tokens
    stages.append(
        RegexTokenizer(
            inputCol="Consumer complaint narrative",
            outputCol="narrative_tokens",
            pattern="\\W+",
            toLowercase=True
        )
    )

    # Remove stop words
    stages.append(
        StopWordsRemover(
            inputCol="narrative_tokens",
            outputCol="narrative_filtered"
        )
    )

    # Generate TF-IDF features (hashed term frequencies, then IDF weighting)
    stages.append(
        HashingTF(
            inputCol="narrative_filtered",
            outputCol="narrative_tf",
            numFeatures=10000  # Adjust based on vocabulary size
        )
    )

    stages.append(
        IDF(
            inputCol="narrative_tf",
            outputCol="narrative_features"
        )
    )

    # ---------- Categorical Feature Engineering ----------
    print("Adding categorical feature engineering stages...")

    # Names of the one-hot encoded columns, for the final VectorAssembler
    encoded_columns = []

    # For each categorical column, create a StringIndexer and OneHotEncoder
    # (no cardinality threshold is applied; high-cardinality columns such as
    # "Company" may need one after inspecting the data)
    for category in categorical_columns:
        indexer_output = f"{category}_indexed"
        encoder_output = f"{category}_encoded"

        stages.append(
            StringIndexer(
                inputCol=category,
                outputCol=indexer_output,
                handleInvalid="keep"  # Unseen labels go to an extra index instead of failing
            )
        )
        stages.append(
            OneHotEncoder(
                inputCol=indexer_output,
                outputCol=encoder_output,
                dropLast=True  # Drop the last category to avoid collinearity
            )
        )
        encoded_columns.append(encoder_output)

    # ---------- Numeric Feature Engineering ----------
    print("Adding numeric feature engineering stages...")

    # Extract the first run of digits of the ZIP code as a number.
    # SQLTransformer is a real (saveable) pipeline stage; regexp_extract returns
    # NULL for a NULL ZIP and '' when there are no digits, and NULLIF turns the
    # latter into NULL so the Imputer below can fill it.
    stages.append(
        SQLTransformer(
            statement=(
                "SELECT *, "
                "CAST(NULLIF(regexp_extract(`ZIP code`, '([0-9]+)', 1), '') AS DOUBLE) "
                "AS zip_numeric "
                "FROM __THIS__"
            )
        )
    )

    # Handle missing values in numeric features
    numeric_columns = ["zip_numeric"]
    imputed_columns = [f"{name}_imputed" for name in numeric_columns]
    stages.append(
        Imputer(
            inputCols=numeric_columns,
            outputCols=imputed_columns,
            strategy="median"  # Use median for ZIP codes
        )
    )

    # MinMaxScaler operates on a vector column, so the imputed scalars are
    # assembled into one vector first and scaled together.
    stages.append(
        VectorAssembler(
            inputCols=imputed_columns,
            outputCol="numeric_features_vec"
        )
    )
    stages.append(
        MinMaxScaler(
            inputCol="numeric_features_vec",
            outputCol="numeric_features_scaled"
        )
    )

    # ---------- Final Feature Assembly ----------
    print("Adding final Vector Assembler stage...")

    # Combine all feature columns using VectorAssembler
    feature_columns = ["narrative_features"] + encoded_columns + ["numeric_features_scaled"]

    stages.append(
        VectorAssembler(
            inputCols=feature_columns,
            outputCol="features",
            handleInvalid="keep"  # Handle invalid entries
        )
    )

    # Create and return the pipeline
    training_pipeline = Pipeline(stages=stages)
    print(f"Training pipeline created with {len(stages)} stages")

    return training_pipeline

# ---------- Task 5: Advanced Pretraining (Optional - BERT Embeddings) ----------
def create_bert_embedding_pipeline():
    """
    Announce the optional Task 5 (BERT embeddings) step and print its outline.

    Placeholder only: the settings below are assigned but not used, no
    pipeline is built and nothing is returned.
    """
    # Placeholder settings for a future implementation (currently unused)
    DEEPSPEED_CONFIG_DICT = dict(
        train_batch_size=32,
        fp16=dict(enabled=True),
        zero_optimization=dict(stage=2),
    )
    EMBEDDING_MODEL_SAVE_PATH = "/path/to/embedding_model"
    TEMP_PYTORCH_DATA_PATH = "/path/to/temp_pytorch_data"

    # Outline of what a real Task 5 implementation would do
    task_outline = """
    Task 5 implementation would:
    1. Save training data to TEMP_PYTORCH_DATA_PATH
    2. Define train_embedding_model() function for fine-tuning BERT
    3. Configure DeepspeedTorchDistributor
    4. Execute distributed training
    5. Define bert_embed_udf for embedding generation
    6. Update training_pipeline to use bert_embed_udf
    """

    print("Creating BERT embedding pipeline...")
    print(task_outline)

# Example usage
if __name__ == "__main__":
    # Create the training pipeline
    # (this is the unfitted pipeline definition; nothing is fitted in this block)
    training_pipeline = create_training_feature_pipeline()

    # Save the pipeline if needed
    # overwrite() replaces whatever already exists at the save path
    training_pipeline.write().overwrite().save(TRAINING_PIPELINE_SAVE_PATH)
    print(f"Training pipeline saved to: {TRAINING_PIPELINE_SAVE_PATH}")

    # Optionally, create BERT embedding pipeline
    # Uncomment to implement Task 5
    # create_bert_embedding_pipeline()

In [None]:
#!/usr/bin/env python
# Task 5: Advanced Pretraining with BERT Embeddings

import os
import torch
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.functions import udf
from pyspark.ml.deepspeed import DeepspeedTorchDistributor

# Import necessary libraries for BERT
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

# Configuration
# NOTE: the Kafka and pipeline values repeat the constants of the setup cell;
# keep them in sync. All paths are placeholders.
KAFKA_BROKERS = "kafka1:9092,kafka2:9092"  # comma-separated Kafka bootstrap servers
KAFKA_TOPIC_TRAINING = "complaints-training-data"  # topic the narratives are read from
TEMP_PYTORCH_DATA_PATH = "/path/to/temp_pytorch_data"  # Parquet hand-off written by extract_narratives_for_pytorch
EMBEDDING_MODEL_SAVE_PATH = "/path/to/embedding_model"
TRAINING_PIPELINE_SAVE_PATH = "/path/to/training_pipeline"

# Initialize Spark Session
# getOrCreate() hands back the already-active session when one exists.
spark = SparkSession.builder.appName("Consumer Complaints BERT Embeddings").getOrCreate()

# Define a PyTorch Dataset for complaint narratives
class ComplaintDataset(Dataset):
    """
    Map-style dataset that tokenizes complaint narratives on access.

    Args:
        texts: indexable collection of narrative strings.
        tokenizer: object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_length: length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        """Return the number of narratives."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenize the narrative at ``idx``.

        Returns:
            dict with keys 'input_ids' and 'attention_mask', each the
            tokenizer's corresponding output flattened with ``.flatten()``.
        """
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(self.texts[idx], **tokenizer_options)

        # Only these two tokenizer outputs are passed on
        return {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }

# Function to extract and save narrative data from Kafka for PyTorch
def extract_narratives_for_pytorch():
    """
    Dump the non-empty complaint narratives of the training topic to Parquet.

    Reads ``KAFKA_TOPIC_TRAINING`` from the earliest offset, parses each record
    value with the module-level ``full_schema``, keeps the Kafka key (as
    ``complaint_id``) and the narrative, drops null/blank narratives and
    overwrites ``TEMP_PYTORCH_DATA_PATH`` with the result. Returns nothing.
    """
    print("Extracting narratives from Kafka for PyTorch...")

    narrative_col = "Consumer complaint narrative"

    # Batch read of the whole training topic
    reader = spark.read.format("kafka")
    reader = reader.option("kafka.bootstrap.servers", KAFKA_BROKERS)
    reader = reader.option("subscribe", KAFKA_TOPIC_TRAINING)
    reader = reader.option("startingOffsets", "earliest")
    kafka_df = reader.load()

    # Key -> complaint id, value -> parsed JSON struct; keep only the narrative
    complaint_key = F.col("key").cast("string").alias("complaint_id")
    payload = F.from_json(F.col("value").cast("string"), full_schema).alias("data")
    parsed_df = (
        kafka_df
        .select(complaint_key, payload)
        .select("complaint_id", f"data.{narrative_col}")
    )

    # Keep rows whose narrative is present and not just whitespace
    has_text = F.col(narrative_col).isNotNull() & (
        F.length(F.trim(F.col(narrative_col))) > 0
    )
    filtered_df = parsed_df.filter(has_text)

    narrative_total = filtered_df.count()
    print(f"Extracted {narrative_total} narratives for BERT training")

    # Parquet hand-off for the PyTorch side
    filtered_df.write.mode("overwrite").parquet(TEMP_PYTORCH_DATA_PATH)

    print(f"Narratives saved to {TEMP_PYTORCH_DATA_PATH}")

# Define the BERT fine-tuning function
def train_embedding_model(data_path, epochs=2, batch_size=32, learning_rate=2e-5):
    """
    Fine-tune a BERT model on complaint narratives.

    Loads the narratives from parquet, fine-tunes ``bert-base-uncased`` with
    an unsupervised in-batch objective on the [CLS] embeddings, and saves the
    model and tokenizer to EMBEDDING_MODEL_SAVE_PATH.

    Args:
        data_path: Parquet location containing a
            "Consumer complaint narrative" column.
        epochs: Number of passes over the data.
        batch_size: Number of narratives per optimisation step.
        learning_rate: AdamW learning rate.

    Returns:
        Tuple ``(model, tokenizer)`` of the fine-tuned model and its tokenizer.

    Raises:
        ValueError: If ``data_path`` contains no narratives.
    """
    print("Starting BERT model fine-tuning...")

    # Load data from parquet
    data_df = spark.read.parquet(data_path)

    # Convert to pandas for easier PyTorch integration
    pd_df = data_df.toPandas()
    texts = pd_df["Consumer complaint narrative"].tolist()

    # Fail early with a clear message; an empty dataset would otherwise end
    # in a ZeroDivisionError when the average loss is computed.
    if not texts:
        raise ValueError(f"No narratives found at {data_path}; nothing to fine-tune on")

    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Create dataset and dataloader
    dataset = ComplaintDataset(texts, tokenizer)
    dataloader = DataLoader(
        dataset,
        sampler=RandomSampler(dataset),
        batch_size=batch_size
    )
    num_batches = len(dataloader)

    # Setup optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    total_steps = num_batches * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Training loop
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # The loss module is stateless, so build it once instead of every step
    loss_fct = torch.nn.MSELoss()

    for epoch in range(epochs):
        print(f"Starting epoch {epoch+1}/{epochs}")
        model.train()
        total_loss = 0

        for step, batch in enumerate(dataloader):
            # Get inputs
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Clear gradients
            model.zero_grad()

            # Forward pass. Only last_hidden_state is used below, so the
            # per-layer hidden states are not requested.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Use the [CLS] token representation as the sentence embedding
            hidden_states = outputs.last_hidden_state
            cls_embeddings = hidden_states[:, 0, :]

            # In-batch objective: regress the matrix of pairwise dot products
            # onto the identity, i.e. push each embedding's self-product
            # towards 1 and products between different narratives towards 0.
            # NOTE(review): this is not masked-language-modelling and it does
            # not "maximize similarity" as earlier comments stated.
            similarity_matrix = torch.matmul(cls_embeddings, cls_embeddings.T)

            # Targets are rebuilt per step because the last batch may be smaller
            targets = torch.eye(similarity_matrix.size(0)).to(device)

            # Calculate loss
            loss = loss_fct(similarity_matrix, targets)

            # Backward pass
            loss.backward()

            # Update parameters
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

            if step % 100 == 0:
                print(f"  Batch {step}/{num_batches} - Loss: {loss.item():.4f}")

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}")

    # Save the fine-tuned model and tokenizer
    os.makedirs(EMBEDDING_MODEL_SAVE_PATH, exist_ok=True)
    model.save_pretrained(EMBEDDING_MODEL_SAVE_PATH)
    tokenizer.save_pretrained(EMBEDDING_MODEL_SAVE_PATH)

    print(f"BERT model fine-tuned and saved to {EMBEDDING_MODEL_SAVE_PATH}")

    return model, tokenizer

# Define a Spark UDF for BERT embeddings
def create_bert_embedding_udf():
    """
    Build a Spark UDF that turns text into a BERT [CLS] embedding.

    The tokenizer and model are loaded from EMBEDDING_MODEL_SAVE_PATH, and the
    model is put on CPU in eval mode. The returned UDF yields an array of
    floats per row; null or blank text yields a 768-element zero vector.
    """
    print("Creating BERT embedding UDF...")

    # Restore the fine-tuned artefacts
    bert_tokenizer = BertTokenizer.from_pretrained(EMBEDDING_MODEL_SAVE_PATH)
    bert_model = BertModel.from_pretrained(EMBEDDING_MODEL_SAVE_PATH)

    # The UDF runs on Spark workers, so keep the model on CPU in inference mode
    bert_model.to("cpu")
    bert_model.eval()

    def bert_embed(text):
        # Null / blank narratives map to an all-zero vector
        if not (text and text.strip()):
            return [0.0] * 768  # BERT base hidden size is 768

        encoded = bert_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding="max_length"
        )

        # Inference only: no gradients needed
        with torch.no_grad():
            model_output = bert_model(**encoded)

        # First token ([CLS]) of the last layer is the sentence embedding
        cls_vector = model_output.last_hidden_state[:, 0, :]
        return cls_vector.squeeze().tolist()

    # Wrap the Python function as a Spark UDF returning array<float>
    bert_embed_udf = udf(bert_embed, ArrayType(FloatType()))

    print("BERT embedding UDF created")
    return bert_embed_udf

# Main execution for Task 5
def execute_bert_embedding_task():
    """
    Run Task 5 end to end and return the BERT embedding UDF.

    Extracts the narratives to TEMP_PYTORCH_DATA_PATH, runs
    ``train_embedding_model`` through a DeepspeedTorchDistributor, builds the
    embedding UDF from the saved model, and prints the remaining manual steps.
    """
    print("Executing Task 5: BERT embedding generation...")

    # Step 1: dump the narratives where the training function can read them
    extract_narratives_for_pytorch()

    # Step 2: distributor configuration (adjust numGpus to available resources)
    deepspeed_settings = {
        "train_batch_size": 32,
        "fp16": {"enabled": True},
        "zero_optimization": {"stage": 2},
    }
    distributor = DeepspeedTorchDistributor(
        numGpus=4,
        nnodes=1,
        localMode=False,
        useGpu=True,
        deepspeedConfig=deepspeed_settings,
    )

    # Step 3: distributed fine-tuning
    print("Starting distributed BERT fine-tuning...")
    training_kwargs = {"epochs": 3, "batch_size": 32, "learning_rate": 2e-5}
    distributor.run(
        train_embedding_model,
        TEMP_PYTORCH_DATA_PATH,
        **training_kwargs
    )

    # Step 4: UDF backed by the freshly saved model
    embedding_udf = create_bert_embedding_udf()

    # Step 5: wiring the UDF into the Task 4 pipeline is left to the caller
    print("""
    To complete Task 5:
    1. Replace the basic text feature stages (tokenizer, stopwords, etc.) with the BERT UDF
    2. The UDF would be applied directly to 'Consumer complaint narrative'
    3. The output would be aliased as 'narrative_features'
    4. The resulting vector would then be used in the VectorAssembler stage
    """)

    return embedding_udf

# Example usage
if __name__ == "__main__":
    # Schema used to parse the Kafka JSON payload: every field is a nullable string
    full_schema = StructType([
        StructField(field_name, StringType(), True)
        for field_name in (
            "Date received",
            "Product",
            "Sub-product",
            "Issue",
            "Sub-issue",
            "Consumer complaint narrative",
            "Company public response",
            "Company",
            "State",
            "ZIP code",
            "Tags",
            "Consumer consent provided?",
            "Submitted via",
            "Date sent to company",
            "Company response to consumer",
            "Timely response?",
            "Consumer disputed?",
            "Complaint ID",
        )
    ])

    # Run the whole Task 5 flow; the UDF is kept for later pipeline wiring
    bert_embed_udf = execute_bert_embedding_task()

    print("Task 5 (BERT Embeddings) completed")

In [None]:
#!/usr/bin/env python
# Task 6: Classifier Training, Comparison & Cross-Validation

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import time
import json

# Configuration for Task 6. The values are defined inline in this cell
# (nothing is actually imported from a setup file here).
KAFKA_BROKERS = "kafka1:9092,kafka2:9092"
KAFKA_TOPIC_TRAINING = "complaints-training-data"
TRAINING_PIPELINE_SAVE_PATH = "/path/to/training_pipeline"
BEST_MODEL_SAVE_PATH = "/path/to/best_model"
NUM_FOLDS = 5  # number of cross-validation folds passed to CrossValidator

# Initialize Spark Session (getOrCreate reuses an already-running session)
spark = SparkSession.builder.appName("Consumer Complaints Classifier Training").getOrCreate()

def train_and_evaluate_classifiers(training_pipeline):
    """
    Train and evaluate multiple classifiers using cross-validation.

    Reads the labelled training set from the Kafka training topic,
    cross-validates a Random Forest, a Gradient-Boosted Trees and a Logistic
    Regression classifier over their parameter grids (area under ROC), refits
    the overall winner on the full training set, and saves the fitted feature
    pipeline, the fitted classifier and a JSON metrics summary.

    Args:
        training_pipeline: Feature-engineering stage placed in front of every
            classifier. It is expected to produce the "features" column the
            classifiers read (assumption: it is built elsewhere).

    Returns:
        Tuple ``(final_model, results)``: the fitted two-stage pipeline model
        and a dict mapping model name to its best score, best parameters (as
        a string) and training time in seconds.

    Raises:
        RuntimeError: If no candidate produced a usable cross-validation score.
    """
    print("Starting classifier training and evaluation...")

    # Step 1: Read training data from Kafka
    print(f"Reading training data from Kafka topic: {KAFKA_TOPIC_TRAINING}")

    # Define the full schema including is_target_complaint:
    # all source fields are nullable strings, the label is a nullable integer
    string_fields = [
        "Date received",
        "Product",
        "Sub-product",
        "Issue",
        "Sub-issue",
        "Consumer complaint narrative",
        "Company public response",
        "Company",
        "State",
        "ZIP code",
        "Tags",
        "Consumer consent provided?",
        "Submitted via",
        "Date sent to company",
        "Company response to consumer",
        "Timely response?",
        "Consumer disputed?",
        "Complaint ID",
    ]
    full_schema_with_target = StructType(
        [StructField(name, StringType(), True) for name in string_fields]
        + [StructField("is_target_complaint", IntegerType(), True)]
    )

    # Read from Kafka and parse JSON values
    kafka_df = spark.read \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
        .option("subscribe", KAFKA_TOPIC_TRAINING) \
        .option("startingOffsets", "earliest") \
        .load()

    # Parse the JSON values. The frame is cached because it is scanned by
    # count(), the groupBy, every cross-validation fit and the final refit;
    # caching avoids re-reading Kafka each time and keeps all of those steps
    # working on the same snapshot of the topic.
    training_df = kafka_df.select(
        F.from_json(F.col("value").cast("string"), full_schema_with_target).alias("data")
    ).select("data.*").cache()

    try:
        # Display training data overview
        print(f"Training data loaded: {training_df.count()} records")
        print("Target distribution:")
        training_df.groupBy("is_target_complaint").count().show()

        # Step 2: Define classifier instances
        print("Defining classifier instances and parameter grids...")

        # Random Forest Classifier
        rf = RandomForestClassifier(
            featuresCol="features",
            labelCol="is_target_complaint",
            predictionCol="prediction",
            probabilityCol="probability"
        )

        # Gradient-Boosted Trees Classifier
        gbt = GBTClassifier(
            featuresCol="features",
            labelCol="is_target_complaint",
            predictionCol="prediction",
            maxIter=10  # starting value only; the grid below overrides maxIter
        )

        # Logistic Regression Classifier
        lr = LogisticRegression(
            featuresCol="features",
            labelCol="is_target_complaint",
            predictionCol="prediction",
            probabilityCol="probability"
        )

        # Step 3: Define parameter grids for hyperparameter tuning
        rf_param_grid = ParamGridBuilder() \
            .addGrid(rf.numTrees, [10, 50, 100]) \
            .addGrid(rf.maxDepth, [5, 10, 15]) \
            .addGrid(rf.impurity, ["gini", "entropy"]) \
            .build()

        gbt_param_grid = ParamGridBuilder() \
            .addGrid(gbt.maxDepth, [5, 10]) \
            .addGrid(gbt.stepSize, [0.1, 0.05]) \
            .addGrid(gbt.maxIter, [10, 20]) \
            .build()

        lr_param_grid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
            .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
            .addGrid(lr.maxIter, [10, 50, 100]) \
            .build()

        # Store models and their parameter grids in a dictionary
        models_and_params = {
            "RandomForest": (rf, rf_param_grid),
            "GradientBoostedTrees": (gbt, gbt_param_grid),
            "LogisticRegression": (lr, lr_param_grid)
        }

        # Step 4: Define the evaluator
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="probability",
            labelCol="is_target_complaint",
            metricName="areaUnderROC"
        )

        # Step 5: Train and evaluate each model
        results = {}
        # Start below any attainable score so the first valid result always wins
        best_cv_score = float("-inf")
        best_model_name = None
        best_model_params = None

        for model_name, (classifier, param_grid) in models_and_params.items():
            print(f"Training and evaluating {model_name}...")
            start_time = time.time()

            # Create a pipeline with feature engineering and the classifier
            model_pipeline = Pipeline(stages=[training_pipeline, classifier])

            # Create a cross-validator
            cv = CrossValidator(
                estimator=model_pipeline,
                estimatorParamMaps=param_grid,
                evaluator=evaluator,
                numFolds=NUM_FOLDS,
                parallelism=4  # Adjust based on cluster capacity
            )

            # Fit the cross-validator to find the best model
            cv_model = cv.fit(training_df)

            # avgMetrics is aligned with param_grid: one fold-averaged score per entry
            avg_metrics = cv_model.avgMetrics
            best_metric_idx = avg_metrics.index(max(avg_metrics))
            best_metric = avg_metrics[best_metric_idx]
            best_params = param_grid[best_metric_idx]

            # Record the results
            results[model_name] = {
                "best_score": best_metric,
                "best_params": str(best_params),
                "training_time": time.time() - start_time
            }

            print(f"{model_name} best CV score: {best_metric:.4f}")

            # Check if this is the best model overall
            if best_metric > best_cv_score:
                best_cv_score = best_metric
                best_model_name = model_name
                best_model_params = best_params

        # Step 6: Display comparison of models
        print("\nModel Comparison Results:")
        for model_name, metrics in results.items():
            print(f"{model_name}:")
            print(f"  Best AUC-ROC: {metrics['best_score']:.4f}")
            print(f"  Best Parameters: {metrics['best_params']}")
            print(f"  Training Time: {metrics['training_time']:.2f} seconds")

        # Only reachable when every score was NaN (comparisons with NaN are False)
        if best_model_name is None:
            raise RuntimeError("No classifier produced a valid cross-validation score")

        print(f"\nOverall Best Model: {best_model_name}")
        print(f"Best AUC-ROC: {best_cv_score:.4f}")
        print(f"Best Parameters: {str(best_model_params)}")

        # Step 7: Retrain the best model on the entire training dataset
        print(f"\nRetraining {best_model_name} with optimal parameters on entire training dataset...")

        # A grid entry is a plain {Param: value} dict, which Params.copy()
        # accepts directly as its `extra` argument. No ParamMap wrapper exists
        # in PySpark, and the previous call to one raised NameError.
        best_classifier_base = models_and_params[best_model_name][0]
        best_classifier = best_classifier_base.copy(best_model_params)

        # Create a pipeline with the training pipeline and the best classifier
        final_pipeline = Pipeline(stages=[training_pipeline, best_classifier])

        # Fit the pipeline on the entire training dataset
        final_model = final_pipeline.fit(training_df)

        # Save the fitted training pipeline (feature engineering part)
        fitted_training_pipeline = final_model.stages[0]
        fitted_training_pipeline.write().overwrite().save(TRAINING_PIPELINE_SAVE_PATH)
        print(f"Fitted training pipeline saved to: {TRAINING_PIPELINE_SAVE_PATH}")

        # Save the trained classifier model
        trained_classifier = final_model.stages[1]
        trained_classifier.write().overwrite().save(BEST_MODEL_SAVE_PATH)
        print(f"Trained classifier model saved to: {BEST_MODEL_SAVE_PATH}")

        # Save model performance metrics to file
        metrics_file = f"{BEST_MODEL_SAVE_PATH}_metrics.json"
        with open(metrics_file, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"Model performance metrics saved to: {metrics_file}")

        return final_model, results
    finally:
        # Release the cached training data whether or not training succeeded
        training_df.unpersist()

# Example usage
if __name__ == "__main__":
    # Load the training pipeline from Task 4 (or Task 5 if BERT embeddings were used)
    try:
        training_pipeline = Pipeline.load(TRAINING_PIPELINE_SAVE_PATH)
        print(f"Loaded training pipeline from: {TRAINING_PIPELINE_SAVE_PATH}")
    except Exception as load_error:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit still propagate, and report why the
        # load failed before falling back.
        print(f"Could not load training pipeline from {TRAINING_PIPELINE_SAVE_PATH}: {load_error}")

        # If not available, import it from the training_pipeline module
        from training_pipeline import create_training_feature_pipeline
        training_pipeline = create_training_feature_pipeline()
        print("Created new training pipeline instance")

    # Train and evaluate classifiers
    final_model, results = train_and_evaluate_classifiers(training_pipeline)

    print("Task 6: Classifier Training, Comparison & Cross-Validation completed")

In [None]:
#!/usr/bin/env python
# Task 7: Simulation Script for Streaming Test Data to Kafka

import json
import time
import argparse
import pandas as pd
from kafka import KafkaProducer
from pyspark.sql import SparkSession

# Default configuration for the Task 7 streaming simulator
# (these values can be overridden via command-line arguments)
TEST_DATA_PERSISTENCE_PATH = "/path/to/test_data_source.parquet"  # placeholder; file extension selects the pandas reader
KAFKA_BROKERS = "kafka1:9092,kafka2:9092"  # comma-separated broker list
KAFKA_TOPIC_TESTING_STREAM = "complaints-testing-stream"
MESSAGES_PER_MINUTE = 10000  # Target throughput
BATCH_SIZE = 100  # Send messages in batches for efficiency

def load_data_with_pandas(path=None):
    """
    Load test data using pandas for smaller datasets.

    Args:
        path: File to load; must end in .parquet, .json or .csv. Defaults to
            the module-level TEST_DATA_PERSISTENCE_PATH (read at call time).

    Returns:
        A pandas DataFrame with all records.

    Raises:
        ValueError: If the file extension is not one of the supported formats.
    """
    if path is None:
        path = TEST_DATA_PERSISTENCE_PATH

    print(f"Loading test data from {path} using pandas...")

    # Extension -> reader; checked in the same order as before
    readers = {
        '.parquet': pd.read_parquet,
        '.json': pd.read_json,
        '.csv': pd.read_csv,
    }
    for suffix, reader in readers.items():
        if path.endswith(suffix):
            df = reader(path)
            break
    else:
        raise ValueError(f"Unsupported file format: {path}")

    print(f"Loaded {len(df)} records")
    return df

def load_data_with_spark(path=None):
    """
    Load test data using Spark for larger datasets.

    The data is read as parquet and then collected to the driver.

    Args:
        path: Parquet location to read. Defaults to the module-level
            TEST_DATA_PERSISTENCE_PATH (read at call time).

    Returns:
        A pandas DataFrame with all records.
    """
    if path is None:
        path = TEST_DATA_PERSISTENCE_PATH

    print(f"Loading test data from {path} using Spark...")

    spark = SparkSession.builder \
        .appName("Test Data Reader") \
        .master("local[*]") \
        .getOrCreate()

    try:
        # Read the test data
        test_df = spark.read.format("parquet").load(path)

        # Convert to pandas to make it easier to iterate
        pd_df = test_df.toPandas()

        print(f"Loaded {len(pd_df)} records")
    finally:
        # Stop the session even when the read fails
        spark.stop()

    return pd_df

def prepare_messages(df):
    """
    Prepare messages for Kafka from the test DataFrame.

    Keeps only the seven columns the streaming side expects and turns each
    row into a dict. Missing values (NaN/NaT/None) are emitted as ``None`` so
    they serialise to JSON ``null`` rather than the non-standard ``NaN`` token.

    Args:
        df: pandas DataFrame containing at least the seven required columns.

    Returns:
        A list with one ``{column: value}`` dict per row.

    Raises:
        ValueError: If any required column is absent from ``df``.
    """
    print("Preparing messages for Kafka...")

    # Select only the 7 specified columns
    required_columns = [
        "Date received",
        "Complaint ID",
        "Company",
        "State",
        "ZIP code",
        "Submitted via",
        "Consumer complaint narrative"
    ]

    # Ensure all required columns are in the DataFrame
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Select only the required columns
    df_subset = df[required_columns]

    # Cast to object first so None is kept as-is instead of being coerced
    # back to NaN by a float column
    df_subset = df_subset.astype(object).where(df_subset.notna(), None)

    # Convert to list of dictionaries for easier processing
    messages = df_subset.to_dict('records')

    print(f"Prepared {len(messages)} messages")
    return messages

def send_messages_to_kafka(messages, kafka_brokers, topic, messages_per_minute, batch_size=None):
    """
    Send messages to Kafka at the specified rate.

    Messages are sent in batches; after each batch the producer is flushed
    and the function sleeps for whatever is left of the batch's time budget.
    Errors while sending are printed and end the run early; a summary is
    always printed and the producer is always closed.

    Args:
        messages: List of dicts, each containing a "Complaint ID" entry that
            is used as the Kafka message key.
        kafka_brokers: Comma-separated list of broker addresses.
        topic: Kafka topic to publish to.
        messages_per_minute: Target throughput; must be positive.
        batch_size: Messages per batch; defaults to the module-level
            BATCH_SIZE. Must be positive.

    Raises:
        ValueError: If ``messages_per_minute`` or ``batch_size`` is not positive.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    # Validate before the producer exists so a bad argument cannot leak a connection
    if messages_per_minute <= 0:
        raise ValueError(f"messages_per_minute must be positive, got {messages_per_minute}")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    print(f"Connecting to Kafka brokers: {kafka_brokers}")

    # Create Kafka producer
    producer = KafkaProducer(
        bootstrap_servers=kafka_brokers.split(','),
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        # Keys may be ints (numeric Complaint ID) or None, so stringify defensively
        key_serializer=lambda k: None if k is None else str(k).encode('utf-8'),
        batch_size=16384,  # Adjust based on message size
        linger_ms=5,       # Small delay to allow batching
        buffer_memory=33554432  # 32MB buffer
    )

    total_messages = len(messages)
    print(f"Sending {total_messages} messages to topic '{topic}' at {messages_per_minute} messages/minute...")

    # Time budget per message needed to achieve the target rate
    delay_between_messages = 60.0 / messages_per_minute

    start_time = time.time()
    messages_sent = 0
    batches_sent = 0

    try:
        # Process messages in batches for efficiency
        for offset in range(0, total_messages, batch_size):
            batch_start_time = time.time()
            batch = messages[offset:offset + batch_size]

            for msg in batch:
                # Use Complaint ID as the key; send() is asynchronous
                producer.send(topic, key=msg["Complaint ID"], value=msg)
                messages_sent += 1

            # Make sure all messages are sent
            producer.flush()
            batches_sent += 1

            # Sleep for whatever remains of this batch's time budget
            batch_elapsed = time.time() - batch_start_time
            sleep_time = len(batch) * delay_between_messages - batch_elapsed
            if sleep_time > 0:
                time.sleep(sleep_time)

            # Print progress
            if batches_sent % 10 == 0:
                elapsed = time.time() - start_time
                rate = messages_sent / elapsed * 60 if elapsed > 0 else 0
                percent_complete = messages_sent / total_messages * 100
                remaining = total_messages - messages_sent
                eta_minutes = remaining / rate if rate > 0 else 0

                print(f"Progress: {messages_sent}/{total_messages} messages "
                      f"({percent_complete:.1f}%) @ {rate:.1f} msgs/min, "
                      f"ETA: {eta_minutes:.1f} minutes")

    except KeyboardInterrupt:
        print("\nInterrupted by user. Stopping...")
    except Exception as e:
        # Deliberate best-effort: report and fall through to the summary
        print(f"Error sending messages: {e}")
    finally:
        # Close the producer
        producer.close()

        # Print final statistics
        elapsed = time.time() - start_time
        rate = messages_sent / elapsed * 60 if elapsed > 0 else 0

        print(f"\nSummary:")
        print(f"- Total messages sent: {messages_sent}/{total_messages}")
        print(f"- Total time: {elapsed:.2f} seconds")
        print(f"- Average rate: {rate:.1f} messages/minute")
        print(f"- Number of batches: {batches_sent}")

In [None]:
#!/usr/bin/env python
# Task 7: Simulation Script for Streaming Test Data to Kafka

import json
import time
import argparse
import pandas as pd
from kafka import KafkaProducer
from pyspark.sql import SparkSession

# Default configuration for the Task 7 streaming simulator
# (these values can be overridden via command-line arguments)
TEST_DATA_PERSISTENCE_PATH = "/path/to/test_data_source.parquet"  # placeholder; file extension selects the pandas reader
KAFKA_BROKERS = "kafka1:9092,kafka2:9092"  # comma-separated broker list
KAFKA_TOPIC_TESTING_STREAM = "complaints-testing-stream"
MESSAGES_PER_MINUTE = 10000  # Target throughput
BATCH_SIZE = 100  # Send messages in batches for efficiency

def load_data_with_pandas(path=None):
    """
    Load test data using pandas for smaller datasets.

    Args:
        path: File to load; must end in .parquet, .json or .csv. Defaults to
            the module-level TEST_DATA_PERSISTENCE_PATH (read at call time).

    Returns:
        A pandas DataFrame with all records.

    Raises:
        ValueError: If the file extension is not one of the supported formats.
    """
    if path is None:
        path = TEST_DATA_PERSISTENCE_PATH

    print(f"Loading test data from {path} using pandas...")

    # Extension -> reader; checked in the same order as before
    readers = {
        '.parquet': pd.read_parquet,
        '.json': pd.read_json,
        '.csv': pd.read_csv,
    }
    for suffix, reader in readers.items():
        if path.endswith(suffix):
            df = reader(path)
            break
    else:
        raise ValueError(f"Unsupported file format: {path}")

    print(f"Loaded {len(df)} records")
    return df

def load_data_with_spark(path=None):
    """
    Load test data using Spark for larger datasets.

    The data is read as parquet and then collected to the driver.

    Args:
        path: Parquet location to read. Defaults to the module-level
            TEST_DATA_PERSISTENCE_PATH (read at call time).

    Returns:
        A pandas DataFrame with all records.
    """
    if path is None:
        path = TEST_DATA_PERSISTENCE_PATH

    print(f"Loading test data from {path} using Spark...")

    spark = SparkSession.builder \
        .appName("Test Data Reader") \
        .master("local[*]") \
        .getOrCreate()

    try:
        # Read the test data
        test_df = spark.read.format("parquet").load(path)

        # Convert to pandas to make it easier to iterate
        pd_df = test_df.toPandas()

        print(f"Loaded {len(pd_df)} records")
    finally:
        # Stop the session even when the read fails
        spark.stop()

    return pd_df

def prepare_messages(df):
    """
    Prepare messages for Kafka from the test DataFrame.

    Keeps only the seven columns the streaming side expects and turns each
    row into a dict. Missing values (NaN/NaT/None) are emitted as ``None`` so
    they serialise to JSON ``null`` rather than the non-standard ``NaN`` token.

    Args:
        df: pandas DataFrame containing at least the seven required columns.

    Returns:
        A list with one ``{column: value}`` dict per row.

    Raises:
        ValueError: If any required column is absent from ``df``.
    """
    print("Preparing messages for Kafka...")

    # Select only the 7 specified columns
    required_columns = [
        "Date received",
        "Complaint ID",
        "Company",
        "State",
        "ZIP code",
        "Submitted via",
        "Consumer complaint narrative"
    ]

    # Ensure all required columns are in the DataFrame
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Select only the required columns
    df_subset = df[required_columns]

    # Cast to object first so None is kept as-is instead of being coerced
    # back to NaN by a float column
    df_subset = df_subset.astype(object).where(df_subset.notna(), None)

    # Convert to list of dictionaries for easier processing
    messages = df_subset.to_dict('records')

    print(f"Prepared {len(messages)} messages")
    return messages

def send_messages_to_kafka(messages, kafka_brokers, topic, messages_per_minute, batch_size=None):
    """
    Send messages to Kafka at the specified rate.

    Messages are sent in batches; after each batch the producer is flushed
    and the function sleeps for whatever is left of the batch's time budget.
    Errors while sending are printed and end the run early; a summary is
    always printed and the producer is always closed.

    Args:
        messages: List of dicts, each containing a "Complaint ID" entry that
            is used as the Kafka message key.
        kafka_brokers: Comma-separated list of broker addresses.
        topic: Kafka topic to publish to.
        messages_per_minute: Target throughput; must be positive.
        batch_size: Messages per batch; defaults to the module-level
            BATCH_SIZE. Must be positive.

    Raises:
        ValueError: If ``messages_per_minute`` or ``batch_size`` is not positive.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    # Validate before the producer exists so a bad argument cannot leak a connection
    if messages_per_minute <= 0:
        raise ValueError(f"messages_per_minute must be positive, got {messages_per_minute}")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    print(f"Connecting to Kafka brokers: {kafka_brokers}")

    # Create Kafka producer
    producer = KafkaProducer(
        bootstrap_servers=kafka_brokers.split(','),
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        # Keys may be ints (numeric Complaint ID) or None, so stringify defensively
        key_serializer=lambda k: None if k is None else str(k).encode('utf-8'),
        batch_size=16384,  # Adjust based on message size
        linger_ms=5,       # Small delay to allow batching
        buffer_memory=33554432  # 32MB buffer
    )

    total_messages = len(messages)
    print(f"Sending {total_messages} messages to topic '{topic}' at {messages_per_minute} messages/minute...")

    # Time budget per message needed to achieve the target rate
    delay_between_messages = 60.0 / messages_per_minute

    start_time = time.time()
    messages_sent = 0
    batches_sent = 0

    try:
        # Process messages in batches for efficiency
        for offset in range(0, total_messages, batch_size):
            batch_start_time = time.time()
            batch = messages[offset:offset + batch_size]

            for msg in batch:
                # Use Complaint ID as the key; send() is asynchronous
                producer.send(topic, key=msg["Complaint ID"], value=msg)
                messages_sent += 1

            # Make sure all messages are sent
            producer.flush()
            batches_sent += 1

            # Sleep for whatever remains of this batch's time budget
            batch_elapsed = time.time() - batch_start_time
            sleep_time = len(batch) * delay_between_messages - batch_elapsed
            if sleep_time > 0:
                time.sleep(sleep_time)

            # Print progress
            if batches_sent % 10 == 0:
                elapsed = time.time() - start_time
                rate = messages_sent / elapsed * 60 if elapsed > 0 else 0
                percent_complete = messages_sent / total_messages * 100
                remaining = total_messages - messages_sent
                eta_minutes = remaining / rate if rate > 0 else 0

                print(f"Progress: {messages_sent}/{total_messages} messages "
                      f"({percent_complete:.1f}%) @ {rate:.1f} msgs/min, "
                      f"ETA: {eta_minutes:.1f} minutes")

    except KeyboardInterrupt:
        print("\nInterrupted by user. Stopping...")
    except Exception as e:
        # Deliberate best-effort: report and fall through to the summary
        print(f"Error sending messages: {e}")
    finally:
        # Close the producer
        producer.close()

        # Print final statistics
        elapsed = time.time() - start_time
        rate = messages_sent / elapsed * 60 if elapsed > 0 else 0

        print(f"\nSummary:")
        print(f"- Total messages sent: {messages_sent}/{total_messages}")
        print(f"- Total time: {elapsed:.2f} seconds")
        print(f"- Average rate: {rate:.1f} messages/minute")
        print(f"- Number of batches: {batches_sent}")

def main(argv=None):
    """
    Command-line entry point: load the test data and stream it to Kafka.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]`` (the argparse default when ``None``).
    """
    # The global declarations must come before any use of these names in the
    # function. They are read below as argparse defaults, and declaring them
    # afterwards is a SyntaxError ("used prior to global declaration").
    global TEST_DATA_PERSISTENCE_PATH, KAFKA_BROKERS, KAFKA_TOPIC_TESTING_STREAM
    global MESSAGES_PER_MINUTE, BATCH_SIZE

    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Stream test data to Kafka")
    parser.add_argument("--data-path", default=TEST_DATA_PERSISTENCE_PATH,
                        help="Path to test data (parquet, json, or csv)")
    parser.add_argument("--brokers", default=KAFKA_BROKERS,
                        help="Comma-separated list of Kafka broker addresses")
    parser.add_argument("--topic", default=KAFKA_TOPIC_TESTING_STREAM,
                        help="Kafka topic to send messages to")
    parser.add_argument("--rate", type=int, default=MESSAGES_PER_MINUTE,
                        help="Number of messages per minute to send")
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE,
                        help="Number of messages to send in each batch")
    parser.add_argument("--use-spark", action="store_true",
                        help="Use Spark to load data (for very large datasets)")

    args = parser.parse_args(argv)

    # Publish the parsed values through the module globals, which the loader
    # and sender functions read
    TEST_DATA_PERSISTENCE_PATH = args.data_path
    KAFKA_BROKERS = args.brokers
    KAFKA_TOPIC_TESTING_STREAM = args.topic
    MESSAGES_PER_MINUTE = args.rate
    BATCH_SIZE = args.batch_size

    # Load data
    if args.use_spark:
        df = load_data_with_spark()
    else:
        df = load_data_with_pandas()

    # Prepare messages
    messages = prepare_messages(df)

    # Send messages to Kafka
    send_messages_to_kafka(messages, KAFKA_BROKERS, KAFKA_TOPIC_TESTING_STREAM, MESSAGES_PER_MINUTE)

if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python
# Task 8: Define Streaming Inference Pipeline

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer,
    OneHotEncoder, VectorAssembler, Imputer, StandardScaler, MinMaxScaler,
    RegexTokenizer, CountVectorizer
)
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.functions import vector_to_array
import os

# Configuration for Task 8. The values are defined inline in this cell
# (nothing is actually imported here).
TRAINING_PIPELINE_SAVE_PATH = "/path/to/training_pipeline"
BEST_MODEL_SAVE_PATH = "/path/to/best_model"

# Initialize Spark Session (getOrCreate reuses an already-running session)
spark = SparkSession.builder.appName("Consumer Complaints Streaming Inference").getOrCreate()

def create_streaming_inference_pipeline(use_bert_embeddings=False):
    """
    Build the (unfitted) feature pipeline used for streaming inference.

    Only the seven columns present in the streaming records are consumed:
    "Date received", "Complaint ID", "Company", "State", "ZIP code",
    "Submitted via" and "Consumer complaint narrative".

    Column-level expressions (date parsing, ZIP extraction, BERT UDF) are
    expressed as ``SQLTransformer`` stages, because a ``Pipeline`` only accepts
    ``Transformer``/``Estimator`` instances; plain Python callables are
    rejected when the pipeline is fitted.

    Args:
        use_bert_embeddings: Whether to use BERT embeddings (from Task 5) or
            basic TF-IDF text features. Falls back to TF-IDF when the
            ``bert_embeddings`` module cannot be imported.

    Returns:
        inference_pipeline: An unfitted ``Pipeline``. It contains estimators
            (IDF, StringIndexer, OneHotEncoder, Imputer, MinMaxScaler), so it
            must be fitted before it can transform data.
    """
    # Local import: SQLTransformer is not part of this cell's import list.
    from pyspark.ml.feature import SQLTransformer

    print("Creating streaming inference pipeline...")

    # Categorical columns to be indexed and one-hot encoded
    categorical_columns = [
        "Company",
        "State",
        "Submitted via"
    ]

    # Pipeline stages, in execution order
    stages = []

    # Parse date received (similar to Task 2) so dates are handled consistently
    DATE_FORMAT = "MM/dd/yyyy"
    stages.append(
        SQLTransformer(
            statement=(
                f"SELECT *, to_date(`Date received`, '{DATE_FORMAT}') "
                "AS parsed_date_received FROM __THIS__"
            )
        )
    )

    # ---------- Text Feature Engineering ----------
    print("Adding text feature engineering stages...")

    if use_bert_embeddings:
        # BERT Embedding approach (if Task 5 was implemented)
        print("Using BERT embeddings for text features")

        # Note: bert_embeddings is a placeholder module and needs an actual
        # implementation.
        try:
            from bert_embeddings import create_bert_embedding_udf
            bert_embed_udf = create_bert_embedding_udf()

            # Register the UDF under a SQL name so a SQLTransformer can call it.
            # NOTE(review): the UDF must be re-registered in any session that
            # loads a saved copy of this pipeline. Assumes the UDF returns an
            # ML vector usable by VectorAssembler -- TODO confirm.
            SparkSession.builder.getOrCreate().udf.register(
                "bert_embed", bert_embed_udf
            )
            stages.append(
                SQLTransformer(
                    statement=(
                        "SELECT *, bert_embed(`Consumer complaint narrative`) "
                        "AS narrative_features FROM __THIS__"
                    )
                )
            )
        except ImportError:
            print("BERT embedding module not found, falling back to basic text features")
            use_bert_embeddings = False

    if not use_bert_embeddings:
        # Basic text feature engineering (TF-IDF)
        print("Using basic TF-IDF for text features")

        # Replace NULL narratives with an empty string first: streaming records
        # may lack a narrative and the tokenizer does not accept NULL input.
        stages.append(
            SQLTransformer(
                statement=(
                    "SELECT *, coalesce(`Consumer complaint narrative`, '') "
                    "AS narrative_clean FROM __THIS__"
                )
            )
        )

        # Text cleaning and tokenization
        stages.append(
            RegexTokenizer(
                inputCol="narrative_clean",
                outputCol="narrative_tokens",
                pattern="\\W+",
                toLowercase=True
            )
        )

        # Remove stop words
        stages.append(
            StopWordsRemover(
                inputCol="narrative_tokens",
                outputCol="narrative_filtered"
            )
        )

        # Generate TF-IDF features
        stages.append(
            HashingTF(
                inputCol="narrative_filtered",
                outputCol="narrative_tf",
                numFeatures=10000  # Must match training pipeline
            )
        )

        stages.append(
            IDF(
                inputCol="narrative_tf",
                outputCol="narrative_features"
            )
        )

    # ---------- Categorical Feature Engineering ----------
    print("Adding categorical feature engineering stages...")

    # One-hot encoded column names, collected for the final VectorAssembler
    encoded_columns = []

    for category in categorical_columns:
        indexer_output = f"{category}_indexed"
        stages.append(
            StringIndexer(
                inputCol=category,
                outputCol=indexer_output,
                handleInvalid="keep"  # Handle unseen labels as specified
            )
        )

        encoder_output = f"{category}_encoded"
        stages.append(
            OneHotEncoder(
                inputCol=indexer_output,
                outputCol=encoder_output,
                dropLast=True  # Drop last category to avoid collinearity
            )
        )
        encoded_columns.append(encoder_output)

    # ---------- Numeric Feature Engineering ----------
    print("Adding numeric feature engineering stages...")

    # Extract the first run of digits from the ZIP code. NULLIF turns the
    # "no match" empty string into NULL so the Imputer below can fill it.
    stages.append(
        SQLTransformer(
            statement=(
                "SELECT *, NULLIF(regexp_extract(`ZIP code`, '([0-9]+)', 1), '') "
                "AS zip_numeric_str FROM __THIS__"
            )
        )
    )

    # Convert the ZIP digit string to a floating-point column for the Imputer
    stages.append(
        SQLTransformer(
            statement=(
                "SELECT *, CAST(zip_numeric_str AS DOUBLE) "
                "AS zip_numeric FROM __THIS__"
            )
        )
    )

    # Handle missing values in numeric features
    numeric_columns = ["zip_numeric"]
    stages.append(
        Imputer(
            inputCols=numeric_columns,
            outputCols=[f"{name}_imputed" for name in numeric_columns],
            strategy="median"  # Use median for ZIP codes
        )
    )

    # Scale numeric features. MinMaxScaler operates on Vector columns, so each
    # imputed scalar is first wrapped into a one-element vector.
    for name in numeric_columns:
        stages.append(
            VectorAssembler(
                inputCols=[f"{name}_imputed"],
                outputCol=f"{name}_vec"
            )
        )
        stages.append(
            MinMaxScaler(
                inputCol=f"{name}_vec",
                outputCol=f"{name}_scaled"
            )
        )

    scaled_numeric_columns = [f"{name}_scaled" for name in numeric_columns]

    # ---------- Final Feature Assembly ----------
    print("Adding final Vector Assembler stage...")

    # Combine all feature columns; this must match the subset of features
    # used by the trained model.
    feature_columns = ["narrative_features"] + encoded_columns + scaled_numeric_columns

    stages.append(
        VectorAssembler(
            inputCols=feature_columns,
            outputCol="features",
            handleInvalid="keep"  # Handle invalid entries
        )
    )

    # Create the pipeline
    inference_pipeline = Pipeline(stages=stages)
    print(f"Streaming inference pipeline created with {len(stages)} stages")

    return inference_pipeline

def load_saved_training_pipeline():
    """
    Probe the saved training pipeline and build the streaming inference pipeline.

    The fitted training pipeline at ``TRAINING_PIPELINE_SAVE_PATH`` is loaded
    only to report whether it is available; its stages are NOT reused. In both
    the success and failure case a fresh pipeline is built with
    ``create_streaming_inference_pipeline(use_bert_embeddings=False)``.
    Extracting and adapting specific saved stages is left as future work.

    Returns:
        inference_pipeline: The unfitted Pipeline returned by
            ``create_streaming_inference_pipeline``.
    """
    print(f"Attempting to load saved training pipeline from: {TRAINING_PIPELINE_SAVE_PATH}")

    # Keep the try body limited to the load itself so that an error raised
    # while building the new pipeline is not misreported as a load failure
    # (and the build is not attempted a second time).
    try:
        saved_pipeline = PipelineModel.load(TRAINING_PIPELINE_SAVE_PATH)
    except Exception as e:  # best-effort probe: any load failure falls through
        print(f"Error loading saved pipeline: {e}")
        print("Creating new streaming inference pipeline from scratch")
    else:
        print("Successfully loaded saved training pipeline")
        # Report the stage count as a starting point for analysing which
        # saved stages could be reused for streaming.
        print(f"Saved training pipeline has {len(saved_pipeline.stages)} stages")

    # For simplicity, a new streaming pipeline that mimics the relevant parts
    # of the training pipeline is created in either case.
    return create_streaming_inference_pipeline(use_bert_embeddings=False)

# Example usage: build the Task 8 inference pipeline when run as a script
if __name__ == "__main__":
    # Probe the saved training pipeline and build the streaming pipeline
    inference_pipeline = load_saved_training_pipeline()

    # NOTE: the object returned above is an unfitted Pipeline. It still has to
    # be fitted on a sample dataset before it can transform data, because
    # stages such as StringIndexer are estimators.

    print("Task 8: Streaming Inference Pipeline created")

In [None]:
#!/usr/bin/env python
# Task 9: Streaming Inference Job

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import PipelineModel
from pyspark.ml.classification import RandomForestClassificationModel, GBTClassificationModel, LogisticRegressionModel
from pyspark.ml.functions import vector_to_array
import time
import os

# Configuration for Task 9.
# NOTE: these are hard-coded placeholder values (broker hosts, paths and
# database credentials), not values imported from a shared config module.
# Replace them before running; do not commit real credentials here.
KAFKA_BROKERS = "kafka1:9092,kafka2:9092"
KAFKA_TOPIC_TESTING_STREAM = "complaints-testing-stream"
KAFKA_TOPIC_PREDICTIONS = "complaint-predictions"
BEST_MODEL_SAVE_PATH = "/path/to/best_model"
STREAMING_CHECKPOINT_LOCATION = "/path/to/streaming_checkpoints"
DATABASE_SINK_FORMAT = "jdbc"
# JDBC connection options for the PostgreSQL predictions table
DATABASE_CONNECTION_OPTIONS = {
    "url": "jdbc:postgresql://dbhost:5432/complaints_db",
    "dbtable": "complaint_predictions",
    "user": "username",
    "password": "password",
    "driver": "org.postgresql.Driver"
}

# Initialize Spark Session with a default checkpoint location for streaming queries
spark = SparkSession.builder \
    .appName("Consumer Complaints Streaming Inference") \
    .config("spark.sql.streaming.checkpointLocation", STREAMING_CHECKPOINT_LOCATION) \
    .getOrCreate()

def run_streaming_inference_job(inference_pipeline, use_kafka_sink=False):
    """
    Execute the streaming inference job:
    1. Load the trained classifier model
    2. Read streaming data from Kafka
    3. Apply the inference pipeline and model
    4. Write predictions to database or Kafka

    Args:
        inference_pipeline: The prepared inference pipeline from Task 8
        use_kafka_sink: Whether to write results to Kafka (True) or database (False)
    """
    print("Starting streaming inference job...")

    # Step 1: Load the saved best classifier model
    print(f"Loading best model from: {BEST_MODEL_SAVE_PATH}")
    try:
        # Try loading as RandomForestClassificationModel first
        loaded_model = RandomForestClassificationModel.load(BEST_MODEL_SAVE_PATH)
        print("Loaded RandomForestClassificationModel")
    except:
        try:
            # Try loading as GBTClassificationModel
            loaded_model = GBTClassificationModel.load(BEST_MODEL_SAVE_PATH)
            print("Loaded GBTClassificationModel")
        except:
            try:
                # Try loading as LogisticRegressionModel
                loaded_model = LogisticRegressionModel.load(BEST_MODEL_SAVE_PATH)
                print("Loaded LogisticRegressionModel")
            except Exception as e:
                raise Exception(f"Failed to load model: {e}")

    # Step 2: Define the schema for streaming data
    stream_schema = StructType([
        StructField("Date received", StringType(), True),
        StructField("Complaint ID", StringType(), True),
        StructField("Company", StringType(), True),
        StructField("State", StringType(), True),
        StructField("ZIP code", StringType(), True),
        StructField("Submitted via", StringType(), True),
        StructField("Consumer complaint narrative", StringType(), True)
    ])

    # Step 3: Create a streaming DataFrame from Kafka
    print(f"Setting up Kafka streaming source from topic: {KAFKA_TOPIC_TESTING_STREAM}")
    streaming_df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
        .option("subscribe", KAFKA_TOPIC_TESTING_STREAM) \
        .option("startingOffsets", "latest") \
        .load()

    # Parse the Kafka JSON value
    parsed_df = streaming_df.select(
        F.col("key").cast("string").alias("message_key"),
        F.from_json(F.col("value").cast("string"), stream_schema).alias("data")
    ).select(
        "message_key",
        "data.*"
    )

    # Step 4: Apply the inference pipeline
    print("Applying inference pipeline to streaming data")
    # Fit the pipeline if it's not already fitted
    # In a production environment, you might want to fit this on a sample dataset first
    try:
        # Try to transform directly (if already fitted)
        processed_df = inference_pipeline.transform(parsed_df)
    except:
        # If not fitted, we need to fit it first
        # This should ideally be done before starting the stream with a sample dataset
        print("Pipeline not fitted, fitting on first batch...")
        inference_pipeline_model = inference_pipeline.fit(parsed_df)
        processed_df = inference_pipeline_model.transform(parsed_df)

    # Step 5: Apply the model to get predictions
    print("Applying model to get predictions")
    predictions_df = loaded_model.transform(processed_df)

    # Step 6: Select and format output columns
    output_df = predictions_df.select(
        F.col("Complaint ID").alias("complaint_id"),
        F.col("prediction").cast("double").alias("prediction"),
        # Extract probability of positive class (1)
        F.when(
            F.size(F.col("probability")) > 1,
            F.element_at(vector_to_array(F.col("probability")), 2)
        ).otherwise(
            F.when(F.col("prediction") > 0.5, 1.0).otherwise(0.0)
        ).alias("probability_1"),
        F.col("State").alias("state"),
        F.col("ZIP code").alias("zip_code"),
        F.col("Submitted via").alias("submitted_via"),
        F.col("parsed_date_received").alias("complaint_date"),
        F.current_timestamp().alias("inference_time")
    )

    # Add a timestamp string for easier querying
    final_df = output_df.withColumn(
        "complaint_date_str",