# DTSA 5508: Software Architecture Patterns for Big Data

## Course Overview and Quick Reference Guide

This notebook serves as a comprehensive overview and quick reference guide for the key concepts, techniques, and implementations covered in this course.

### Course Objectives
- Understanding big data architecture patterns
- Implementing distributed systems
- Working with data processing frameworks
- Applying scalable design patterns

In [None]:
# Import common libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings
%matplotlib inline
plt.style.use('seaborn')
pd.set_option('display.max_columns', None)

## Week 1: Introduction to Big Data Architecture

### Key Concepts
- 

### Important Terms
- 

### Code Examples

In [None]:
def initialize_spark_session():
    """Build (or reuse) the Spark session used throughout this notebook.

    The session is named "BigDataProcessing" and is configured with off-heap
    memory enabled and sized at "10g". A short summary (version, master and
    application id) is printed before the session is returned.

    Returns:
        The object produced by ``SparkSession.builder...getOrCreate()``.
    """
    # Settings are applied in the listed order, one ``config`` call each.
    off_heap_settings = (
        ("spark.memory.offHeap.enabled", True),
        ("spark.memory.offHeap.size", "10g"),
    )

    builder = SparkSession.builder.appName("BigDataProcessing")
    for option_name, option_value in off_heap_settings:
        builder = builder.config(option_name, option_value)
    session = builder.getOrCreate()

    # Report the basics of the session that was obtained.
    print("Spark Configuration:")
    print(f"Spark Version: {session.version}")
    print(f"Master: {session.sparkContext.master}")
    print(f"Application ID: {session.sparkContext.applicationId}")

    return session

def demonstrate_distributed_processing(spark, data_path):
    """Load a CSV through Spark and print a short summary of the result.

    Args:
        spark: Session object whose ``read.csv`` is used to load the data.
        data_path: Location of the CSV input; the first row is treated as a
            header and column types are inferred.

    Returns:
        The DataFrame that was read.
    """
    csv_reader = spark.read
    frame = csv_reader.csv(data_path, header=True, inferSchema=True)

    # Partitioning and size of the loaded dataset.
    print("\nDataset Info:")
    partition_count = frame.rdd.getNumPartitions()
    print(f"Number of partitions: {partition_count}")
    record_count = frame.count()
    print(f"Number of records: {record_count}")

    # Column names and inferred types.
    print("\nSchema:")
    frame.printSchema()

    return frame

## Week 2: Data Processing Patterns

### Key Concepts
- 

### Important Patterns
- 

### Code Examples

In [None]:
def implement_data_pipeline(spark, input_data):
    """Implement a typical big data processing pipeline.

    The pipeline ingests ``input_data`` into a DataFrame, adds two derived
    columns, aggregates per category, prints the aggregate and returns it.

    Args:
        spark: Session object whose ``createDataFrame`` is used for ingestion.
        input_data: Anything ``spark.createDataFrame`` accepts. The resulting
            DataFrame must contain a ``value`` column and a ``category``
            column, because both are referenced below.

    Returns:
        A DataFrame with one row per ``category`` holding ``avg_value`` and
        ``count``.
    """
    # 1. Data ingestion
    df = spark.createDataFrame(input_data)

    # 2. Data transformation
    # NOTE: no watermark is declared here. Watermarks only influence streaming
    # queries; on a batch DataFrame they do nothing except force the input to
    # carry a "timestamp" column that this pipeline never uses.
    transformed_df = (
        df.withColumn("processed_date", F.current_date())
        .withColumn("value_squared", F.col("value") * F.col("value"))
    )

    # 3. Data aggregation
    aggregated_df = transformed_df.groupBy("category").agg(
        F.avg("value").alias("avg_value"),
        F.count("*").alias("count"),
    )

    # 4. Show results
    print("Pipeline Results:")
    aggregated_df.show()

    return aggregated_df

## Week 3: Scalable System Design

### Key Concepts
- 

### Important Components
- 

### Code Examples

In [None]:
class DataProcessor:
    """Example of a scalable data processor.

    Splits an in-memory sequence into fixed-size batches, converts each batch
    to a DataFrame through the supplied session, processes it, and unions the
    processed batches into a single DataFrame.
    """

    def __init__(self, spark_session, batch_size=1000):
        """Store the session and the maximum number of records per batch."""
        self.spark = spark_session
        self.batch_size = batch_size

    def process_in_batches(self, data):
        """Process ``data`` in batches and return the combined result.

        Args:
            data: A sliceable sequence supporting ``len()``; each slice is
                passed to ``createDataFrame`` on the stored session.

        Returns:
            The union (by column position) of every processed batch.

        Raises:
            ValueError: If ``batch_size`` is not positive or ``data`` is empty
                (an empty input yields no batch to build a result from).
        """
        if self.batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer, got {self.batch_size!r}"
            )
        # len() rather than truthiness so array-like inputs are handled too.
        if len(data) == 0:
            raise ValueError("data must contain at least one record")

        results = []
        for start_idx in range(0, len(data), self.batch_size):
            batch = data[start_idx:start_idx + self.batch_size]

            # Process batch
            batch_df = self.spark.createDataFrame(batch)
            results.append(self._process_batch(batch_df))

        # SparkSession has no ``union`` method; DataFrames are combined
        # pairwise with DataFrame.union instead.
        combined = results[0]
        for processed in results[1:]:
            combined = combined.union(processed)
        return combined

    def _process_batch(self, batch_df):
        """Process a single batch by adding a literal ``processed`` = True column."""
        return batch_df.withColumn("processed", F.lit(True))

## Week 4: System Integration and Deployment

### Key Concepts
- 

### Important Practices
- 

### Code Examples

In [None]:
def deploy_streaming_pipeline(spark, schema=None):
    """Example of a streaming data pipeline.

    Reads JSON messages from the Kafka topic ``input-topic`` on
    ``localhost:9092``, counts records per ``category`` in 5-minute event-time
    windows, and writes the running counts to the console.

    Args:
        spark: Session object whose ``readStream`` is used to create the source.
        schema: Schema used to parse the JSON payload, passed straight to
            ``F.from_json``. When omitted, a DDL schema with the two columns
            this query needs (``timestamp`` and ``category``) is used.

    Returns:
        The streaming query handle returned by ``writeStream...start()``.
    """
    if schema is None:
        # Previously ``schema`` was an undefined name here. Default to the
        # minimal schema required by the watermark/window/groupBy below.
        schema = "timestamp TIMESTAMP, category STRING"

    # Create streaming DataFrame
    streaming_df = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("subscribe", "input-topic")
        .load()
    )

    # Process streaming data: decode the Kafka value as JSON, flatten it,
    # then aggregate per event-time window and category.
    processed_df = (
        streaming_df
        .select(F.from_json(F.col("value").cast("string"), schema).alias("data"))
        .select("data.*")
        .withWatermark("timestamp", "1 minute")
        .groupBy(F.window("timestamp", "5 minutes"), "category")
        .agg(F.count("*").alias("count"))
    )

    # Write stream
    # NOTE(review): "complete" mode re-emits the whole result table on every
    # trigger; confirm this is intended alongside the watermark above.
    query = (
        processed_df.writeStream
        .outputMode("complete")
        .format("console")
        .start()
    )

    return query