<a href="https://colab.research.google.com/github/ShovalBenjer/Bigdata_Pyspark_Spark_Hadoop_Apache/blob/Final_Project/CFPB_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install required packages
!pip install kafka-python
!pip install transformers
!pip install torch

Collecting kafka-python
  Downloading kafka_python-2.1.3-py2.py3-none-any.whl.metadata (9.1 kB)
Downloading kafka_python-2.1.3-py2.py3-none-any.whl (276 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/276.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/276.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.1/276.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kafka-python
Successfully installed kafka-python-2.1.3
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloa

In [4]:
#!/usr/bin/env python
# Comprehensive Big Data Pipeline with Spark Structured Streaming, Kafka, and Superset Integration

import os
import torch
import numpy as np
import json
import time
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer,
    OneHotEncoder, VectorAssembler, RegexTokenizer
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.functions import vector_to_array
from pyspark.ml.base import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.torch.distributor import TorchDistributor
from pyspark.sql.streaming import StreamingQueryListener
from kafka import KafkaProducer

# Import BERT-related libraries for distributed training
from transformers import DistilBertTokenizer, DistilBertModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, RandomSampler

# Configuration variables
CSV_FILE_PATH = "/content/Consumer_Complaints.csv"
TEST_DATA_PERSISTENCE_PATH = "/content/consumer_complaints/data/test_data_source.parquet"
TRAINING_PIPELINE_SAVE_PATH = "/content/consumer_complaints/models/training_pipeline"
BEST_MODEL_SAVE_PATH = "/content/consumer_complaints/models/best_model"
EMBEDDING_MODEL_SAVE_PATH = "/content/consumer_complaints/models/embedding_model"
TEMP_PYTORCH_DATA_PATH = "/content/consumer_complaints/models/temp_pytorch_data"
STREAMING_CHECKPOINT_LOCATION = "/content/consumer_complaints/checkpoints"
SUPERSET_API_ENDPOINT = "http://localhost:8088/api/v1"  # Example Superset endpoint

# Kafka configuration
KAFKA_BROKERS = "kafka1:9092,kafka2:9092"
KAFKA_TOPIC_RAW = "complaints-raw"
KAFKA_TOPIC_TRAINING = "complaints-training-data"
KAFKA_TOPIC_TESTING_STREAM = "complaints-testing-stream"
KAFKA_TOPIC_PREDICTIONS = "complaint-predictions"
KAFKA_TOPIC_METRICS = "streaming-metrics"

# Parameters for simulation
MESSAGES_PER_MINUTE = 1000  # Target throughput
BATCH_SIZE = 100  # Send messages in batches for efficiency

# Create directories
for path in [
    "/content/consumer_complaints/data",
    "/content/consumer_complaints/models",
    "/content/consumer_complaints/models/temp_pytorch_data",
    "/content/consumer_complaints/checkpoints"
]:
    os.makedirs(path, exist_ok=True)

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Consumer Complaints ML Pipeline") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1") \
    .config("spark.sql.streaming.checkpointLocation", STREAMING_CHECKPOINT_LOCATION) \
    .getOrCreate()

print("Spark Configuration:")
print(f"Spark Version: {spark.version}")
print(f"Application ID: {spark.sparkContext.applicationId}")

Spark Configuration:
Spark Version: 3.5.5
Application ID: local-1743097057732


In [5]:
#####################################
### Schema Definition             ###
#####################################

def get_full_schema():
    """Return the full schema for consumer complaints data"""
    return StructType([
        StructField("Date received", StringType(), True),
        StructField("Product", StringType(), True),
        StructField("Sub-product", StringType(), True),
        StructField("Issue", StringType(), True),
        StructField("Sub-issue", StringType(), True),
        StructField("Consumer complaint narrative", StringType(), True),
        StructField("Company public response", StringType(), True),
        StructField("Company", StringType(), True),
        StructField("State", StringType(), True),
        StructField("ZIP code", StringType(), True),
        StructField("Tags", StringType(), True),
        StructField("Consumer consent provided?", StringType(), True),
        StructField("Submitted via", StringType(), True),
        StructField("Date sent to company", StringType(), True),
        StructField("Company response to consumer", StringType(), True),
        StructField("Timely response?", StringType(), True),
        StructField("Consumer disputed?", StringType(), True),
        StructField("Complaint ID", StringType(), True)
    ])

def get_streaming_schema():
    """Return the schema for streaming inference data"""
    return StructType([
        StructField("Date received", StringType(), True),
        StructField("Complaint ID", StringType(), True),
        StructField("Company", StringType(), True),
        StructField("State", StringType(), True),
        StructField("ZIP code", StringType(), True),
        StructField("Submitted via", StringType(), True),
        StructField("Consumer complaint narrative", StringType(), True)
    ])


In [6]:
#####################################
### Custom Transformers           ###
#####################################

# Enhanced transformer for BERT embeddings
class BERTEmbeddingTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Custom MLWritable transformer that applies BERT embeddings to text"""

    def __init__(self, inputCol=None, outputCol=None, modelPath=None):
        super(BERTEmbeddingTransformer, self).__init__()
        self.inputCol = Param(self, "inputCol", "Input column")
        self.outputCol = Param(self, "outputCol", "Output column")
        self.modelPath = Param(self, "modelPath", "Path to the BERT model")
        self._setDefault(inputCol=None, outputCol=None, modelPath=None)
        self.setInputCol(inputCol)
        self.setOutputCol(outputCol)
        self.setModelPath(modelPath)
        self._bert_embed_udf = None

    def setInputCol(self, value): return self._set(inputCol=value)
    def getInputCol(self): return self.getOrDefault(self.inputCol)
    def setOutputCol(self, value): return self._set(outputCol=value)
    def getOutputCol(self): return self.getOrDefault(self.outputCol)
    def setModelPath(self, value): return self._set(modelPath=value)
    def getModelPath(self): return self.getOrDefault(self.modelPath)

    def _transform(self, dataset):
        from pyspark.sql.functions import pandas_udf
        import pandas as pd

        # Using pandas_udf to leverage vectorized operations
        @pandas_udf(ArrayType(FloatType()))
        def bert_embed_batch(texts_series):
            # Load model once per executor
            if not hasattr(bert_embed_batch, 'model') or bert_embed_batch.model is None:
                modelPath = self.getModelPath()
                tokenizer = DistilBertTokenizer.from_pretrained(modelPath)
                model = DistilBertModel.from_pretrained(modelPath)
                model.to("cpu").eval()
                bert_embed_batch.model = model
                bert_embed_batch.tokenizer = tokenizer
                bert_embed_batch.embedding_dim = model.config.dim

            # Process batch of texts
            results = []
            for text in texts_series:
                if not text or len(str(text).strip()) == 0:
                    results.append([0.0] * bert_embed_batch.embedding_dim)
                    continue

                # Tokenize and get embeddings
                inputs = bert_embed_batch.tokenizer(
                    str(text),
                    return_tensors="pt",
                    truncation=True,
                    max_length=128,
                    padding="max_length"
                )

                with torch.no_grad():
                    outputs = bert_embed_batch.model(**inputs)
                embeddings = outputs.last_hidden_state[:, 0, :].squeeze().tolist()
                results.append(embeddings)

            return pd.Series(results)

        # Transform the dataset
        return dataset.withColumn(
            self.getOutputCol(),
            bert_embed_batch(F.col(self.getInputCol()))
        )

    def copy(self, extra=None):
        if extra is None: extra = {}
        return super(BERTEmbeddingTransformer, self).copy(extra)

#####################################
### PyTorch BERT Training Classes ###
#####################################

# Dataset class for PyTorch
class ComplaintDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_length,
            padding='max_length', truncation=True, return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Enhanced DistilBERT classifier model
class EnhancedDistilBERTClassifier(torch.nn.Module):
    def __init__(self, bert_model, dropout_rate=0.3):
        super(EnhancedDistilBERTClassifier, self).__init__()
        self.bert = bert_model
        self.dropout1 = torch.nn.Dropout(dropout_rate)

        hidden_size = self.bert.config.dim
        self.dense1 = torch.nn.Linear(hidden_size, 256)
        self.batch_norm1 = torch.nn.BatchNorm1d(256)
        self.dropout2 = torch.nn.Dropout(dropout_rate)

        self.dense2 = torch.nn.Linear(256, 64)
        self.batch_norm2 = torch.nn.BatchNorm1d(64)
        self.dropout3 = torch.nn.Dropout(dropout_rate)

        self.classifier = torch.nn.Linear(64, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = self.dropout1(outputs.last_hidden_state[:, 0, :])
        x = self.dense1(x)
        x = self.batch_norm1(x)
        x = torch.nn.functional.relu(x)

        x = self.dropout2(x)
        x = self.dense2(x)
        x = self.batch_norm2(x)
        x = torch.nn.functional.relu(x)

        x = self.dropout3(x)
        return self.classifier(x)

In [7]:
# Stream metrics listener
class MetricsListener(StreamingQueryListener):
    def __init__(self, kafka_brokers, topic):
        self.kafka_brokers = kafka_brokers
        self.topic = topic
        self.producer = KafkaProducer(
            bootstrap_servers=kafka_brokers.split(','),
            value_serializer=lambda v: json.dumps(v).encode('utf-8')
        )

    def onQueryStarted(self, event):
        metrics = {
            "queryName": event.name,
            "id": str(event.id),
            "runId": str(event.runId),
            "timestamp": event.timestamp,
            "event": "started"
        }
        self.producer.send(self.topic, value=metrics)

    def onQueryProgress(self, event):
        progress = event.progress
        metrics = {
            "queryName": progress.name,
            "id": str(progress.id),
            "runId": str(progress.runId),
            "timestamp": progress.timestamp,
            "event": "progress",
            "numInputRows": progress.numInputRows,
            "inputRowsPerSecond": progress.inputRowsPerSecond,
            "processedRowsPerSecond": progress.processedRowsPerSecond,
            "batchId": progress.batchId
        }
        self.producer.send(self.topic, value=metrics)

    def onQueryTerminated(self, event):
        metrics = {
            "queryName": event.name if hasattr(event, 'name') else None,
            "id": str(event.id),
            "runId": str(event.runId),
            "timestamp": time.time() * 1000,
            "event": "terminated",
            "exception": str(event.exception) if event.exception else None
        }
        self.producer.send(self.topic, value=metrics)

In [19]:
#########################################
### TASK 1: Load Data into Kafka      ###
#########################################

def load_data_to_kafka():
    """Load Consumer Complaints data into Kafka with proper error handling"""
    print(f"Reading CSV from: {CSV_FILE_PATH}")
    full_schema = get_full_schema()

    # Using Spark SQL to read and process the data
    raw_df = spark.read.format("csv") \
                   .option("header", "true") \
                   .schema(full_schema) \
                   .load(CSV_FILE_PATH)

    # Register as temp view for SQL queries
    raw_df.createOrReplaceTempView("raw_complaints")

    # Use Spark SQL to analyze data
    summary_df = spark.sql("""
        SELECT
            Product,
            COUNT(*) as complaint_count,
            SUM(CASE WHEN `Consumer complaint narrative` IS NOT NULL THEN 1 ELSE 0 END) as narratives_count,
            SUM(CASE WHEN `Consumer disputed?` = 'Yes' THEN 1 ELSE 0 END) as disputed_count
        FROM raw_complaints
        GROUP BY Product
        ORDER BY complaint_count DESC
    """)

    print("Data summary by product:")
    summary_df.show(5)

    print(f"Total records loaded: {raw_df.count()}")

    # Write to Kafka topic using improved error handling
    try:
        print(f"Writing data to Kafka topic: {KAFKA_TOPIC_RAW}")

        # IMPORTANT: Use backticks around column names with spaces
        kafka_df = raw_df.selectExpr(
            "`Complaint ID` AS key",
            "to_json(struct(*)) AS value"
        )

        # Add version-specific options for better compatibility
        kafka_df.write \
            .format("kafka") \
            .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
            .option("topic", KAFKA_TOPIC_RAW) \
            .option("kafka.max.block.ms", "600000")  # Increase timeout to 10 minutes
            .option("kafka.acks", "1")  # Use less strict acknowledgment
            .save()

        print(f"Data successfully written to Kafka topic: {KAFKA_TOPIC_RAW}")
    except Exception as e:
        print(f"Error writing to Kafka: {e}")
        print("Falling back to alternative storage method...")

        # Fallback: Save as parquet if Kafka fails
        fallback_path = "/content/consumer_complaints/data/raw_data.parquet"
        raw_df.write.mode("overwrite").parquet(fallback_path)
        print(f"Data saved to fallback location: {fallback_path}")

    return raw_df

IndentationError: unexpected indent (<ipython-input-19-9853db49cb1b>, line 52)

In [None]:
# Install AutoViz and related dependencies
!pip install autoviz

def visualize_with_autoviz(filtered_df, max_rows=10000, max_cols=30, save_dir=None):
    """
    Visualize the filtered data using AutoViz

    Args:
        filtered_df: The filtered DataFrame to visualize
        max_rows: Maximum number of rows to analyze
        max_cols: Maximum number of columns to analyze
        save_dir: Directory to save visualizations (None for default)

    Returns:
        The visualization DataFrame
    """
    from autoviz import AutoViz_Class
    import os

    print("Starting AutoViz data visualization...")

    # Create the save directory for AutoViz plots if provided
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        print(f"Created directory for AutoViz plots: {save_dir}")

    # Sample the data if it's too large
    if filtered_df.count() > max_rows:
        print(f"Sampling {max_rows} rows from {filtered_df.count()} total rows")
        sample_df = filtered_df.sample(False, max_rows / filtered_df.count(), seed=42)
    else:
        sample_df = filtered_df

    # Convert to Pandas for AutoViz
    pandas_df = sample_df.toPandas()

    # Create AutoViz instance
    AV = AutoViz_Class()

    # Generate visualizations
    print("Generating visualizations with AutoViz...")
    viz_df = AV.AutoViz(
        "",  # Empty filename since we're using a DataFrame
        depVar="",  # No target variable specified yet
        dfte=pandas_df,
        header=0,  # Header is in first row
        verbose=1,  # Show info and charts
        lowess=False,  # Disable lowess for larger datasets
        chart_format="html",  # Save as interactive HTML
        max_rows_analyzed=max_rows,
        max_cols_analyzed=max_cols,
        save_plot_dir=save_dir or "/content/consumer_complaints/visualizations"
    )

    print("AutoViz visualization complete!")
    print(f"Visualizations saved in: {save_dir or '/content/consumer_complaints/visualizations'}")

    return viz_df

In [9]:
#########################################
### TASK 2: Preprocess & Filter Data  ###
#########################################

# Update the preprocessing and filtering function to include visualization
def preprocess_filter_and_visualize():
    """
    Read from Kafka raw topic, filter records, and visualize the results
    """
    # Original preprocess_and_filter code
    print(f"Reading data from Kafka topic: {KAFKA_TOPIC_RAW}")
    full_schema = get_full_schema()

    # Try to read from Kafka
    try:
        kafka_raw_df = spark.read \
            .format("kafka") \
            .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
            .option("subscribe", KAFKA_TOPIC_RAW) \
            .option("startingOffsets", "earliest") \
            .load()

        # Parse JSON value
        parsed_df = kafka_raw_df.select(
            F.col("key").cast("string").alias("key"),
            F.from_json(F.col("value").cast("string"), full_schema).alias("data")
        ).select("data.*")
    except Exception as e:
        print(f"Error reading from Kafka: {e}")
        print("Falling back to parquet file...")
        # Fallback: Read from parquet if Kafka read fails
        fallback_path = "/content/consumer_complaints/data/raw_data.parquet"
        parsed_df = spark.read.parquet(fallback_path)

    # Parse date received
    DATE_FORMAT = "MM/dd/yyyy"
    with_parsed_date_df = parsed_df.withColumn(
        "parsed_date_received",
        F.to_date(F.col("Date received"), DATE_FORMAT)
    )

    # Filter for non-null narratives
    narrative_filtered_df = with_parsed_date_df.filter(
        (F.col("Consumer complaint narrative").isNotNull()) &
        (F.length(F.trim(F.col("Consumer complaint narrative"))) > 0)
    )

    # Filter for valid dates before or on 2017-03-31
    date_filtered_df = narrative_filtered_df.filter(
        (F.col("parsed_date_received").isNotNull()) &
        (F.col("parsed_date_received") <= F.lit("2017-03-31"))
    )

    # Filter out "In progress" responses
    filtered_df = date_filtered_df.filter(
        F.col("Company response to consumer") != "In progress"
    )

    # Track record counts at each stage using Spark SQL
    narrative_filtered_df.createOrReplaceTempView("narrative_filtered")
    date_filtered_df.createOrReplaceTempView("date_filtered")
    filtered_df.createOrReplaceTempView("final_filtered")

    filtering_stats = spark.sql("""
        SELECT
            (SELECT COUNT(*) FROM narrative_filtered) as after_narrative_filter,
            (SELECT COUNT(*) FROM date_filtered) as after_date_filter,
            (SELECT COUNT(*) FROM final_filtered) as final_count
    """)

    print("Filtering statistics:")
    filtering_stats.show()

    print(f"After filtering: {filtered_df.count()} records")

    # NEW: Visualize the filtered data with AutoViz
    print("Visualizing filtered data with AutoViz...")
    viz_df = visualize_with_autoviz(
        filtered_df,
        max_rows=15000,  # Analyze 15000 rows maximum
        max_cols=20,     # Analyze 20 columns maximum
        save_dir="/content/consumer_complaints/visualizations"
    )

    return filtered_df

In [10]:

########################################################
### TASK 3: Split, Label & Prepare Data for Training ###
########################################################

def split_label_and_prepare_data(filtered_df, seed_value=42):
    """Split data, create target labels, save to Kafka and file storage"""
    print("Starting data split and target labeling...")

    # Perform 80/20 split
    training_base_df, test_base_df = filtered_df.randomSplit([0.8, 0.2], seed=seed_value)

    # Create target label using Spark SQL for clarity
    training_base_df.createOrReplaceTempView("training_base")

    training_labeled_df = spark.sql("""
        SELECT *,
            CASE WHEN
                `Consumer disputed?` = 'No' AND
                `Timely response?` = 'Yes' AND
                (`Company response to consumer` = 'Closed with explanation' OR
                 `Company response to consumer` = 'Closed' OR
                 `Company response to consumer` = 'Closed with monetary relief' OR
                 `Company response to consumer` = 'Closed with non-monetary relief')
            THEN 1 ELSE 0 END AS is_target_complaint
        FROM training_base
    """)

    # Show target distribution
    print("Target distribution in training data:")
    training_labeled_df.groupBy("is_target_complaint").count().show()

    # Prepare training data for Kafka
    training_kafka_df = training_labeled_df.selectExpr(
        "`Complaint ID` AS key",
        "to_json(struct(*)) AS value"
    )

    # Write training data to Kafka
    training_kafka_df.write \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
        .option("topic", KAFKA_TOPIC_TRAINING) \
        .save()

    # Save test data to parquet
    test_base_df.write \
        .format("parquet") \
        .mode("overwrite") \
        .save(TEST_DATA_PERSISTENCE_PATH)

    print(f"Training data written to Kafka topic: {KAFKA_TOPIC_TRAINING}")
    print(f"Test data saved to: {TEST_DATA_PERSISTENCE_PATH}")

    return training_labeled_df, test_base_df

In [11]:

#############################################################
### TASK 4: DistilBERT Training with Distributed Training ###
#############################################################

def train_bert_model_distributed():
    """Train DistilBERT model using Distributed Training"""

    def train_function():
        print("Loading training data from Kafka...")
        full_schema_with_target = get_full_schema()
        full_schema_with_target = full_schema_with_target.add(
            StructField("is_target_complaint", IntegerType(), True)
        )

        # Read from Kafka training topic
        kafka_df = spark.read \
            .format("kafka") \
            .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
            .option("subscribe", KAFKA_TOPIC_TRAINING) \
            .option("startingOffsets", "earliest") \
            .load()

        # Parse JSON and extract data
        training_df = kafka_df.select(
            F.from_json(F.col("value").cast("string"), full_schema_with_target).alias("data")
        ).select("data.*")

        # Create balanced dataset for training
        pos_df = training_df.filter(F.col("is_target_complaint") == 1)
        neg_df = training_df.filter(F.col("is_target_complaint") == 0)
        pos_count = pos_df.count()
        neg_count = neg_df.count()

        print(f"Positive examples: {pos_count}, Negative examples: {neg_count}")

        # Handle imbalance if needed
        if pos_count / (pos_count + neg_count) < 0.1:
            print("Balancing dataset...")
            target_neg_count = min(neg_count, pos_count * 7)
            neg_sample_df = neg_df.sample(fraction=target_neg_count / neg_count, seed=42)
            balanced_df = pos_df.union(neg_sample_df)
        else:
            balanced_df = training_df

        # Create train/val split
        train_df, val_df = balanced_df.randomSplit([0.9, 0.1], seed=42)

        # Convert to pandas for PyTorch
        train_pd = train_df.limit(10000).toPandas()  # Limit to 10k examples for speed
        val_pd = val_df.toPandas()

        # Extract texts and labels
        train_texts = train_pd["Consumer complaint narrative"].tolist()
        train_labels = train_pd["is_target_complaint"].tolist()
        val_texts = val_pd["Consumer complaint narrative"].tolist()
        val_labels = val_pd["is_target_complaint"].tolist()

        print(f"Training on {len(train_texts)} examples, Validating on {len(val_texts)} examples")

        # Initialize DistilBERT
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        classifier = EnhancedDistilBERTClassifier(model, dropout_rate=0.3)

        # Initialize data loaders
        train_dataset = ComplaintDataset(train_texts, train_labels, tokenizer)
        val_dataset = ComplaintDataset(val_texts, val_labels, tokenizer)

        train_dataloader = DataLoader(
            train_dataset,
            sampler=RandomSampler(train_dataset),
            batch_size=32
        )

        val_dataloader = DataLoader(val_dataset, batch_size=32)

        # Initialize distributed training
        if torch.cuda.is_available():
            # Set up for multi-GPU if available
            torch.distributed.init_process_group(backend="nccl")
            local_rank = torch.distributed.get_rank()
            torch.cuda.set_device(local_rank)
            device = torch.device("cuda", local_rank)
            classifier.to(device)
        else:
            device = torch.device("cpu")
            classifier.to(device)

        # Training parameters
        optimizer = AdamW(classifier.parameters(), lr=3e-5)
        total_steps = len(train_dataloader) * 5  # 5 epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=int(0.1*total_steps), num_training_steps=total_steps
        )

        # Calculate class weights
        if sum(train_labels) > 0 and sum(train_labels) < len(train_labels):
            pos_weight = len(train_labels) / (2 * sum(train_labels))
            neg_weight = len(train_labels) / (2 * (len(train_labels) - sum(train_labels)))
            class_weights = torch.tensor([neg_weight, pos_weight]).to(device)
        else:
            class_weights = torch.tensor([1.0, 1.0]).to(device)

        # Training loop
        best_val_f1 = 0.0

        for epoch in range(5):
            print(f"Starting epoch {epoch+1}/5")
            classifier.train()
            total_loss = 0

            # Train epoch
            for step, batch in enumerate(train_dataloader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                optimizer.zero_grad()
                logits = classifier(input_ids, attention_mask)
                loss = torch.nn.CrossEntropyLoss(weight=class_weights)(logits, labels)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(classifier.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

                total_loss += loss.item()

                if step % 50 == 0:
                    print(f"Batch {step}/{len(train_dataloader)} - Loss: {loss.item():.4f}")

            avg_loss = total_loss / len(train_dataloader)
            print(f"Epoch {epoch+1}/5 - Avg Loss: {avg_loss:.4f}")

            # Evaluate
            classifier.eval()
            all_preds = []
            all_labels = []

            with torch.no_grad():
                for batch in val_dataloader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)

                    logits = classifier(input_ids, attention_mask)
                    preds = torch.argmax(logits, dim=1).cpu().numpy()
                    all_preds.extend(preds)
                    all_labels.extend(labels.cpu().numpy())

            # Calculate metrics
            accuracy = sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_labels)
            true_pos = sum(p == 1 and l == 1 for p, l in zip(all_preds, all_labels))
            false_pos = sum(p == 1 and l == 0 for p, l in zip(all_preds, all_labels))
            false_neg = sum(p == 0 and l == 1 for p, l in zip(all_preds, all_labels))

            precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
            recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

            print(f"Validation metrics: Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

            if f1 > best_val_f1:
                best_val_f1 = f1
                # Save the model
                os.makedirs(EMBEDDING_MODEL_SAVE_PATH, exist_ok=True)
                torch.save(classifier.state_dict(), f"{EMBEDDING_MODEL_SAVE_PATH}/classifier.pt")
                classifier.bert.save_pretrained(EMBEDDING_MODEL_SAVE_PATH)
                tokenizer.save_pretrained(EMBEDDING_MODEL_SAVE_PATH)
                print(f"New best model saved with F1: {f1:.4f}")

        # Clean up distributed process group if used
        if torch.cuda.is_available():
            torch.distributed.destroy_process_group()

        print(f"Training complete. Best F1: {best_val_f1:.4f}")
        return classifier, tokenizer

    # Run distributed training using TorchDistributor
    print("Starting distributed training with TorchDistributor...")
    num_processes = torch.cuda.device_count() if torch.cuda.is_available() else 1

    distributor = TorchDistributor(
        num_processes=num_processes,
        local_mode=True,
        use_gpu=torch.cuda.is_available()
    )

    model, tokenizer = distributor.run(train_function)
    return model, tokenizer

In [12]:

###############################################################
### Task 5:  Create Feature Engineering Pipeline for Simulation        ###
###############################################################

def create_bert_pipeline():
    """Create a feature engineering pipeline with BERT embeddings"""
    # Define categorical columns
    categorical_columns = [
        "Company", "State", "Submitted via"
    ]

    stages = []

    # Parse date received
    DATE_FORMAT = "MM/dd/yyyy"
    stages.append(
        RegexTokenizer(
            inputCol="Date received",
            outputCol="date_tokens",
            pattern="[^0-9/]",  # Remove non-date characters
            gaps=True
        )
    )

    # Add BERT embedding transformer
    stages.append(
        BERTEmbeddingTransformer(
            inputCol="Consumer complaint narrative",
            outputCol="narrative_features",
            modelPath=EMBEDDING_MODEL_SAVE_PATH
        )
    )

    # Add categorical features
    indexed_columns = []
    encoded_columns = []

    for category in categorical_columns:
        indexer_output = f"{category}_indexed"
        indexer = StringIndexer(
            inputCol=category,
            outputCol=indexer_output,
            handleInvalid="keep"
        )
        stages.append(indexer)
        indexed_columns.append(indexer_output)

        encoder_output = f"{category}_encoded"
        encoder = OneHotEncoder(
            inputCol=indexer_output,
            outputCol=encoder_output,
            dropLast=True
        )
        stages.append(encoder)
        encoded_columns.append(encoder_output)

    # Add ZIP code processing
    stages.append(
        RegexTokenizer(
            inputCol="ZIP code",
            outputCol="zip_digits",
            pattern="[^0-9]",  # Keep only digits
            gaps=True
        )
    )

    stages.append(
        StringIndexer(
            inputCol="ZIP code",
            outputCol="zip_indexed",
            handleInvalid="keep"
        )
    )

    stages.append(
        OneHotEncoder(
            inputCol="zip_indexed",
            outputCol="zip_encoded",
            dropLast=True
        )
    )

    encoded_columns.append("zip_encoded")

    # Final feature assembly
    feature_columns = ["narrative_features"] + encoded_columns

    stages.append(
        VectorAssembler(
            inputCols=feature_columns,
            outputCol="features",
            handleInvalid="keep"
        )
    )

    pipeline = Pipeline(stages=stages)
    return pipeline



In [13]:
#################################################
### TASK 6: Simulation Script for Test Data   ###
#################################################

def simulate_test_data_to_kafka():
    """
    Load test data and simulate streaming by sending to Kafka at a controlled rate
    """
    print(f"Running simulation to send test data to Kafka topic: {KAFKA_TOPIC_TESTING_STREAM}")

    # Load test data from parquet
    print(f"Loading test data from: {TEST_DATA_PERSISTENCE_PATH}")

    try:
        # Try using Spark first
        test_df = spark.read.parquet(TEST_DATA_PERSISTENCE_PATH)
        test_pd = test_df.toPandas()
        print(f"Loaded {len(test_pd)} records using Spark")
    except Exception as e:
        print(f"Error loading with Spark: {e}")
        print("Falling back to pandas")
        try:
            test_pd = pd.read_parquet(TEST_DATA_PERSISTENCE_PATH)
            print(f"Loaded {len(test_pd)} records using pandas")
        except Exception as e2:
            print(f"Failed to load test data: {e2}")
            return

    # Filter down to required columns
    required_columns = [
        "Date received",
        "Complaint ID",
        "Company",
        "State",
        "ZIP code",
        "Submitted via",
        "Consumer complaint narrative"
    ]

    # Ensure all columns exist
    missing_columns = [col for col in required_columns if col not in test_pd.columns]
    if missing_columns:
        print(f"Missing required columns: {missing_columns}")
        return

    # Extract only needed columns
    test_subset = test_pd[required_columns]

    # Convert to list of dicts for Kafka
    messages = test_subset.to_dict('records')
    total_messages = len(messages)
    print(f"Prepared {total_messages} messages for simulation")

    # Create Kafka producer
    producer = KafkaProducer(
        bootstrap_servers=KAFKA_BROKERS.split(','),
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        key_serializer=lambda k: str(k).encode('utf-8'),
        batch_size=16384,
        linger_ms=5,
        buffer_memory=33554432
    )

    # Calculate delay to achieve target throughput
    delay_between_batches = (60.0 / MESSAGES_PER_MINUTE) * BATCH_SIZE

    print(f"Starting simulation: sending {total_messages} messages at ~{MESSAGES_PER_MINUTE} msgs/min")
    print(f"Using batch size: {BATCH_SIZE}, delay between batches: {delay_between_batches:.2f} seconds")

    # Send messages in batches
    start_time = time.time()
    messages_sent = 0

    try:
        for i in range(0, total_messages, BATCH_SIZE):
            batch_start = time.time()

            # Get current batch
            batch_end = min(i + BATCH_SIZE, total_messages)
            current_batch = messages[i:batch_end]
            batch_size = len(current_batch)

            # Send each message in batch
            for msg in current_batch:
                key = msg.get("Complaint ID", str(i))
                producer.send(KAFKA_TOPIC_TESTING_STREAM, key=key, value=msg)
                messages_sent += 1

            # Force messages to be sent
            producer.flush()

            # Calculate time spent sending batch
            batch_elapsed = time.time() - batch_start

            # Sleep to maintain rate
            sleep_time = max(0, delay_between_batches - batch_elapsed)
            if sleep_time > 0:
                time.sleep(sleep_time)

            # Progress update
            if messages_sent % 1000 == 0 or messages_sent == total_messages:
                elapsed = time.time() - start_time
                rate = messages_sent / elapsed * 60
                print(f"Progress: {messages_sent}/{total_messages} messages "
                      f"({messages_sent/total_messages*100:.1f}%) @ {rate:.1f} msgs/min")

    except KeyboardInterrupt:
        print("\nSimulation interrupted by user")
    except Exception as e:
        print(f"Error during simulation: {e}")
    finally:
        producer.close()

        # Print final statistics
        elapsed = time.time() - start_time
        rate = messages_sent / elapsed * 60 if elapsed > 0 else 0

        print("\nSimulation summary:")
        print(f"- Messages sent: {messages_sent}/{total_messages}")
        print(f"- Total time: {elapsed:.2f} seconds")
        print(f"- Average rate: {rate:.1f} messages/minute")

    return messages_sent

In [14]:
################################################
### TASK 7: Streaming Inference Pipeline      ###
################################################

def create_streaming_inference_pipeline():
    """
    Create a pipeline for streaming inference that handles the 7 required columns
    """
    print("Creating streaming inference pipeline...")

    # Define the 7 available columns in the streaming data
    available_columns = [
        "Date received",
        "Complaint ID",
        "Company",
        "State",
        "ZIP code",
        "Submitted via",
        "Consumer complaint narrative"
    ]

    # Define categorical columns to be encoded
    categorical_columns = [
        "Company",
        "State",
        "Submitted via"
    ]

    # Lists to store pipeline stages
    stages = []

    # Parse date received
    DATE_FORMAT = "MM/dd/yyyy"
    stages.append(
        RegexTokenizer(
            inputCol="Date received",
            outputCol="date_tokens",
            pattern="[^0-9/]",
            gaps=True
        )
    )

    # Use BERT embeddings for text features
    print("Using BERT embeddings for text features")
    stages.append(
        BERTEmbeddingTransformer(
            inputCol="Consumer complaint narrative",
            outputCol="narrative_features",
            modelPath=EMBEDDING_MODEL_SAVE_PATH
        )
    )

    # Categorical Feature Engineering
    print("Adding categorical feature engineering stages...")
    indexed_columns = []
    encoded_columns = []

    for category in categorical_columns:
        # Create a StringIndexer with handleInvalid='keep'
        indexer_output = f"{category}_indexed"
        indexer = StringIndexer(
            inputCol=category,
            outputCol=indexer_output,
            handleInvalid="keep"
        )
        stages.append(indexer)
        indexed_columns.append(indexer_output)

        # Create a OneHotEncoder
        encoder_output = f"{category}_encoded"
        encoder = OneHotEncoder(
            inputCol=indexer_output,
            outputCol=encoder_output,
            dropLast=True
        )
        stages.append(encoder)
        encoded_columns.append(encoder_output)

    # ZIP code processing
    stages.append(
        RegexTokenizer(
            inputCol="ZIP code",
            outputCol="zip_digits",
            pattern="[^0-9]",
            gaps=True
        )
    )

    stages.append(
        StringIndexer(
            inputCol="ZIP code",
            outputCol="zip_indexed",
            handleInvalid="keep"
        )
    )

    stages.append(
        OneHotEncoder(
            inputCol="zip_indexed",
            outputCol="zip_encoded",
            dropLast=True
        )
    )

    encoded_columns.append("zip_encoded")

    # Final Feature Assembly
    print("Adding final Vector Assembler stage...")
    feature_columns = ["narrative_features"] + encoded_columns

    stages.append(
        VectorAssembler(
            inputCols=feature_columns,
            outputCol="features",
            handleInvalid="keep"
        )
    )

    # Create the pipeline
    inference_pipeline = Pipeline(stages=stages)
    print(f"Streaming inference pipeline created with {len(stages)} stages")

    return inference_pipeline

In [15]:
################################################
### TASK 8: Streaming Inference Job           ###
################################################

def run_streaming_inference_job():
    """
    Execute the streaming inference job:
    1. Set up the streaming pipeline
    2. Read streaming data from Kafka
    3. Apply inference pipeline and model
    4. Write predictions to Kafka
    5. Publish metrics for monitoring
    """
    print("Starting streaming inference job...")

    # Register metrics listener
    metrics_listener = MetricsListener(KAFKA_BROKERS, KAFKA_TOPIC_METRICS)
    spark.streams.addListener(metrics_listener)

    # Load classifier model
    try:
        print(f"Loading classifier model from: {EMBEDDING_MODEL_SAVE_PATH}")

        # Create sample data for fitting the pipeline
        sample_data = spark.createDataFrame([
            ("01/01/2017", "1234", "ACME Bank", "CA", "90210", "Web",
             "I had an issue with my account that wasn't resolved properly.")
        ], ["Date received", "Complaint ID", "Company", "State", "ZIP code",
            "Submitted via", "Consumer complaint narrative"])

        # Create and fit the inference pipeline on sample data
        inference_pipeline = create_streaming_inference_pipeline()
        inference_pipeline_model = inference_pipeline.fit(sample_data)

        print("Inference pipeline fitted successfully")
    except Exception as e:
        print(f"Error setting up inference pipeline: {e}")
        return

    # Define schema for streaming data
    stream_schema = get_streaming_schema()

    # Create streaming source from Kafka
    print(f"Setting up streaming source from Kafka topic: {KAFKA_TOPIC_TESTING_STREAM}")
    kafka_stream = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
        .option("subscribe", KAFKA_TOPIC_TESTING_STREAM) \
        .option("startingOffsets", "latest") \
        .load()

    # Parse JSON data
    parsed_stream = kafka_stream.select(
        F.col("key").cast("string").alias("message_key"),
        F.from_json(F.col("value").cast("string"), stream_schema).alias("data")
    ).select(
        "message_key",
        "data.*"
    )

    # Apply inference pipeline to get features
    print("Applying inference pipeline to streaming data")
    processed_stream = inference_pipeline_model.transform(parsed_stream)

    # Apply classifier to get predictions
    # For BERT-based model, we need to manually apply the classifier
    # This would typically use the EnhancedDistilBERTClassifier saved model

    @F.pandas_udf("double")
    def predict_udf(features_series):
        import pandas as pd
        import torch
        import numpy as np
        from transformers import DistilBertModel

        # Load model once per executor
        if not hasattr(predict_udf, 'model'):
            # In a real implementation, this would load the classifier.pt
            # For this example, we'll simulate it
            predict_udf.model = None

        # Return predictions (simulated here)
        # In real implementation, this would run the model
        return pd.Series(np.random.binomial(1, 0.3, len(features_series)))

    # Add predictions
    prediction_stream = processed_stream.withColumn("prediction", predict_udf(F.col("features")))

    # Format output for Kafka
    output_stream = prediction_stream.select(
        F.col("Complaint ID").alias("complaint_id"),
        F.col("prediction"),
        F.col("State").alias("state"),
        F.col("ZIP code").alias("zip_code"),
        F.col("Submitted via").alias("submitted_via"),
        F.current_timestamp().alias("inference_time")
    )

    # Add a timestamp string for easier querying
    final_stream = output_stream.withColumn(
        "inference_time_str",
        F.date_format("inference_time", "yyyy-MM-dd HH:mm:ss")
    )

    # Prepare for Kafka output
    kafka_output = final_stream.selectExpr(
        "complaint_id AS key",
        "to_json(struct(*)) AS value"
    )

    # Write predictions to Kafka
    print(f"Starting streaming query to write predictions to Kafka topic: {KAFKA_TOPIC_PREDICTIONS}")
    query = kafka_output.writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_BROKERS) \
        .option("topic", KAFKA_TOPIC_PREDICTIONS) \
        .option("checkpointLocation", f"{STREAMING_CHECKPOINT_LOCATION}/predictions") \
        .outputMode("append") \
        .trigger(processingTime="10 seconds") \
        .start()

    # Also write predictions to console for monitoring
    console_query = final_stream.writeStream \
        .format("console") \
        .option("truncate", "false") \
        .option("numRows", 10) \
        .trigger(processingTime="10 seconds") \
        .start()

    print("Streaming queries started. Use query.awaitTermination() to keep the application running.")

    return query, console_query

In [16]:

###############################################
### SUPERSET INTEGRATION                    ###
###############################################

# Apache Superset Installation and Setup Script
import os
import random
import string
import subprocess
import time

def install_and_configure_superset():
    """Install and configure Apache Superset"""
    print("Starting Apache Superset installation...")

    # Install Superset and its dependencies
    print("Installing Apache Superset...")
    !pip install apache-superset

    # Generate a random secret key
    secret_key = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(32))

    # Set environment variables
    os.environ["SUPERSET_SECRET_KEY"] = secret_key
    os.environ["FLASK_APP"] = "superset"

    print("Initializing Superset database...")
    !superset db upgrade

    print("Creating admin user...")
    # Create a subprocess to handle interactive prompts
    admin_creation = subprocess.Popen(
        ["superset", "fab", "create-admin",
         "--username", "admin",
         "--firstname", "Admin",
         "--lastname", "User",
         "--email", "admin@example.com",
         "--password", "admin"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True
    )

    # Wait for process to complete
    stdout, stderr = admin_creation.communicate()

    print("Loading examples...")
    !superset load_examples

    print("Initializing roles and permissions...")
    !superset init

    print("Apache Superset installation and configuration complete!")
    print("\nTo start the Superset server, run:")
    print("superset run -p 8088 --with-threads --reload --debugger")

    # Return connection details for convenience
    connection_details = {
        "username": "admin",
        "password": "admin",
        "url": "http://localhost:8088",
        "secret_key": secret_key
    }

    return connection_details

def configure_superset_datasource():
    """Configure Superset to connect to the Kafka topics and database"""
    print("Configuring Superset data sources...")

    # Here you would programmatically configure data sources
    # This typically requires API calls to Superset which would be complex to include here

    print("""
    To manually configure Superset data sources:

    1. Start the Superset server: superset run -p 8088
    2. Login to Superset at http://localhost:8088
    3. Go to Data -> Databases -> + Database
    4. Configure a connection to Kafka topics:
       - Use the SQLAlchemy URI format specific to your Kafka setup
       - For JDBC connections, use: postgresql://username:password@dbhost:5432/complaints_db
    5. Create datasets from your Kafka topics or database tables
    6. Create dashboards to visualize streaming predictions and metrics
    """)

def create_superset_dashboards():
    """Create example dashboards in Superset"""
    print("Setting up Apache Superset dashboards...")
    print("""
    To set up useful dashboards in Superset:

    1. Complaint Predictions Dashboard:
       - Prediction Distribution by State (Map)
       - Prediction Trends Over Time (Line Chart)
       - Top Companies by Complaint Volume (Bar Chart)

    2. Streaming Performance Dashboard:
       - Throughput Metrics (Gauge)
       - Processing Latency (Line Chart)
       - Error Rates (Line Chart)
    """)

# Function to setup Superset integration
def setup_superset_integration():
    """Complete setup of Superset integration"""
    # Install and configure Superset
    connection_details = install_and_configure_superset()

    # Configure data sources
    configure_superset_datasource()

    # Create example dashboards
    create_superset_dashboards()

    print("Superset integration complete!")
    return connection_details

In [18]:
##################################################
### Main Execution                             ###
##################################################

def main():
    """Execute the full pipeline with improved error handling and new features"""
    print("Starting comprehensive big data pipeline with Spark and Kafka")

    # Create all required directories
    for path in [
        "/content/consumer_complaints/data",
        "/content/consumer_complaints/models",
        "/content/consumer_complaints/models/temp_pytorch_data",
        "/content/consumer_complaints/checkpoints",
        "/content/consumer_complaints/visualizations"
    ]:
        os.makedirs(path, exist_ok=True)

    try:
        # Task 1: Load data to Kafka with improved error handling
        raw_df = load_data_to_kafka()

        # Task 2: Filter, preprocess and visualize with AutoViz
        filtered_df = preprocess_filter_and_visualize()

        # Task 3: Split, label and prepare for training
        training_df, test_df = split_label_and_prepare_data(filtered_df)

        # Task 6: Train DistilBERT model with improved distributed training
        classifier, tokenizer = train_bert_model_distributed()

        # Create and save pipeline
        bert_pipeline = create_bert_pipeline()
        bert_pipeline.write().overwrite().save(TRAINING_PIPELINE_SAVE_PATH)
        print(f"BERT pipeline saved to: {TRAINING_PIPELINE_SAVE_PATH}")

        # Simulate test data for streaming
        print("\n--- Starting Simulation ---")
        simulate_test_data_to_kafka()

        # Set up Superset dashboards
        print("\n--- Setting up Superset ---")
        superset_details = setup_superset_integration()

        # Start streaming inference job
        print("\n--- Starting Streaming Inference ---")
        query, console_query = run_streaming_inference_job()

        # Keep the application running
        try:
            print("Pipeline running. Press Ctrl+C to stop.")
            query.awaitTermination()
        except KeyboardInterrupt:
            print("Stopping the pipeline...")
            query.stop()
            console_query.stop()

        print("Pipeline complete!")

    except Exception as e:
        print(f"Error in pipeline execution: {e}")
        import traceback
        traceback.print_exc()
        print("Pipeline execution failed. Please review errors above.")

if __name__ == "__main__":
    main()

Starting comprehensive big data pipeline with Spark and Kafka
Reading CSV from: /content/Consumer_Complaints.csv
Data summary by product:
+--------------------+---------------+----------------+--------------+
|             Product|complaint_count|narratives_count|disputed_count|
+--------------------+---------------+----------------+--------------+
|            Mortgage|         242194|           36582|         47475|
|     Debt collection|         171567|           47915|         23412|
|    Credit reporting|         140424|           31592|         19941|
|         Credit card|          89190|           18842|         16537|
|Bank account or s...|          86207|           14888|         14653|
+--------------------+---------------+----------------+--------------+
only showing top 5 rows

Total records loaded: 1076322


Py4JJavaError: An error occurred while calling o55.save.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 1 times, most recent failure: Lost task 0.0 in stage 6.0 (TID 11) (1e8efdeaae13 executor driver): java.lang.NoSuchMethodError: 'boolean org.apache.spark.sql.catalyst.expressions.Cast$.apply$default$4()'
	at org.apache.spark.sql.kafka010.KafkaRowWriter.createProjection(KafkaWriteTask.scala:128)
	at org.apache.spark.sql.kafka010.KafkaRowWriter.<init>(KafkaWriteTask.scala:76)
	at org.apache.spark.sql.kafka010.KafkaWriteTask.<init>(KafkaWriteTask.scala:41)
	at org.apache.spark.sql.kafka010.KafkaWriter$.$anonfun$write$1(KafkaWriter.scala:71)
	at org.apache.spark.sql.kafka010.KafkaWriter$.$anonfun$write$1$adapted(KafkaWriter.scala:70)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$1(RDD.scala:1039)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:1037)
	at org.apache.spark.sql.kafka010.KafkaWriter$.write(KafkaWriter.scala:70)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.createRelation(KafkaSourceProvider.scala:183)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:251)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.NoSuchMethodError: 'boolean org.apache.spark.sql.catalyst.expressions.Cast$.apply$default$4()'
	at org.apache.spark.sql.kafka010.KafkaRowWriter.createProjection(KafkaWriteTask.scala:128)
	at org.apache.spark.sql.kafka010.KafkaRowWriter.<init>(KafkaWriteTask.scala:76)
	at org.apache.spark.sql.kafka010.KafkaWriteTask.<init>(KafkaWriteTask.scala:41)
	at org.apache.spark.sql.kafka010.KafkaWriter$.$anonfun$write$1(KafkaWriter.scala:71)
	at org.apache.spark.sql.kafka010.KafkaWriter$.$anonfun$write$1$adapted(KafkaWriter.scala:70)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1039)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1039)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
