In [1]:
from pyspark.sql import SparkSession

# Create the Spark session (or reuse an already-running one, via getOrCreate)
# that the following cells access through the global name `spark`.
spark = SparkSession.builder.appName("BroadCast").getOrCreate()

# Bare expression as the last line of the cell so the notebook displays the session.
spark

In [7]:
# NOTE(review): when the notebook is run top to bottom, this stops the session
# created in the first cell, yet the next cell reads `spark.sparkContext` without
# recreating it. The execution count (In [7]) suggests this cell was actually run
# later than its position in the file -- confirm, and consider moving it to the end.
spark.stop()


In [4]:
import logging

# Log to both a file and the notebook output.
# force=True matters in a notebook: basicConfig() does nothing when the root
# logger already has handlers, so on a re-run the configuration below would be
# ignored while the newly constructed FileHandler stayed open. With force=True
# the previous root handlers are removed and closed before these are installed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("simple_broadcast.log"),
        logging.StreamHandler()
    ],
    force=True,
)

logger = logging.getLogger(__name__)

# Reuses the `spark` session created in an earlier cell; only the context handle
# is obtained here.
logger.info("Initializing Spark session")
sc = spark.sparkContext
logger.info("Spark session initialized successfully")

# Create a broadcast variable holding the origins of interest
focus_origins = ["Delhi", "Hyderabad", "Mumbai"]
logger.info(f"Broadcasting focus origins: {focus_origins}")
broadcast_origins = sc.broadcast(focus_origins)
logger.info("Broadcast variable created successfully")

# Load the dataset. The path is defined once so the log message and the actual
# read can never disagree.
dataset_path = "/opt/notebooks/Flight_Schedule.csv"
logger.info(f"Loading dataset from {dataset_path}")
df = spark.read.csv(dataset_path, header=True, inferSchema=True)
logger.info("Dataset loaded successfully")

# Convert to RDD so the filter below can run a plain Python function per row
logger.info("Converting DataFrame to RDD")
rdd = df.rdd
logger.info("DataFrame converted to RDD successfully")

# Filter flights using broadcast variable
def filter_origins(row):
    """Return True when the row's "origin" is one of the broadcast focus origins.

    Reads the list of origins from the global ``broadcast_origins`` broadcast
    variable; intended to be used as the predicate of ``rdd.filter``.
    """
    return row["origin"] in broadcast_origins.value

# rdd.filter() only declares the transformation; the rows are actually filtered
# when collect() runs below.
logger.info("Filtering RDD using broadcast variable")
filtered_rdd = rdd.filter(filter_origins)
logger.info("RDD filtering completed")

# Collect and print results (limited to first 10 rows)
logger.info("Collecting filtered results")
filtered_flights = filtered_rdd.collect()
logger.info(f"Found {len(filtered_flights)} flights from focus origins")
logger.info("Printing first 10 filtered flights (or fewer if less exist)")
for flight in filtered_flights[:10]:  # Limit to first 10 rows
    print(flight)

# The session is deliberately NOT stopped here: the cells that follow keep using
# `spark`. The previous message claimed the session was being stopped although
# no stop() call was made, so the log now states what really happens.
logger.info("Leaving Spark session running for subsequent cells")

2025-04-19 14:41:49,507 - INFO - Initializing Spark session
2025-04-19 14:41:49,547 - INFO - Spark session initialized successfully
2025-04-19 14:41:49,554 - INFO - Broadcasting focus origins: ['Delhi', 'Hyderabad', 'Mumbai']
2025-04-19 14:41:49,613 - INFO - Broadcast variable created successfully
2025-04-19 14:41:49,625 - INFO - Loading dataset from /opt/notebooks/Flight_Schedule.csv
2025-04-19 14:41:50,704 - INFO - Dataset loaded successfully
2025-04-19 14:41:50,706 - INFO - Converting DataFrame to RDD
2025-04-19 14:41:50,779 - INFO - DataFrame converted to RDD successfully
2025-04-19 14:41:50,781 - INFO - Filtering RDD using broadcast variable
2025-04-19 14:41:50,786 - INFO - RDD filtering completed
2025-04-19 14:41:50,788 - INFO - Collecting filtered results
2025-04-19 14:41:52,109 - INFO - Found 10929 flights from focus origins
2025-04-19 14:41:52,113 - INFO - Printing first 10 filtered flights (or fewer if less exist)
2025-04-19 14:41:52,116 - INFO - Stopping Spark session


Row(flightNumber='425', airline='GoAir', origin='Delhi', destination='Hyderabad', dayOfWeek='Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday', scheduledDepartureTime=datetime.datetime(2025, 4, 19, 5, 45), scheduledArrivalTime=None, validFrom='28-10-2018', validTo='30-03-2019')
Row(flightNumber='423', airline='GoAir', origin='Delhi', destination='Hyderabad', dayOfWeek='Saturday', scheduledDepartureTime=datetime.datetime(2025, 4, 19, 7, 30), scheduledArrivalTime=None, validFrom='28-10-2018', validTo='28-10-2018')
Row(flightNumber='423', airline='GoAir', origin='Delhi', destination='Hyderabad', dayOfWeek='Friday', scheduledDepartureTime=datetime.datetime(2025, 4, 19, 7, 30), scheduledArrivalTime=None, validFrom='03-11-2018', validTo='01-12-2018')
Row(flightNumber='423', airline='GoAir', origin='Delhi', destination='Hyderabad', dayOfWeek='Friday', scheduledDepartureTime=datetime.datetime(2025, 4, 19, 7, 30), scheduledArrivalTime=None, validFrom='02-02-2019', validTo='30-03-2019')


In [5]:
# Four sample records distributed as an RDD; two of them have a None location.
data = spark.sparkContext.parallelize([
    {"id": 1, "location": "Warehouse1"},
    {"id": 2, "location": None},
    {"id": 3, "location": "Warehouse2"},
    {"id": 4, "location": None}
])

# Naive approach: count the nulls with an ordinary Python global on the driver.
null_count = 0

def count_null(record):
    """Increment the module-level ``null_count`` when the record's location is None."""
    global null_count
    if record["location"] is not None:
        return
    null_count += 1

# foreach() executes count_null on the Spark workers, so the increments are
# applied to the workers' own copies of `null_count`. The driver-side variable
# printed here is not updated: the recorded output is 0 although two records
# have a None location.
data.foreach(count_null)
print(f"Number of null locations: {null_count}")

Number of null locations: 0


In [6]:
# Same four sample records as in the naive example (two None locations).
data = spark.sparkContext.parallelize([
    {"id": 1, "location": "Warehouse1"},
    {"id": 2, "location": None},
    {"id": 3, "location": "Warehouse2"},
    {"id": 4, "location": None}
])

# Initialize an integer accumulator starting at 0; tasks add to it and the
# driver reads the total through `.value`.
null_count_acc = spark.sparkContext.accumulator(0)

def count_null(record):
    """Add 1 to the ``null_count_acc`` accumulator when the record's location is None."""
    location_missing = record["location"] is None
    if location_missing:
        null_count_acc.add(1)  # Update accumulator

# foreach() is an action, so count_null runs for every record here; the
# accumulator's total is then read on the driver via `.value`
# (the recorded output is 2).
data.foreach(count_null)
print(f"Number of null locations: {null_count_acc.value}")

Number of null locations: 2


In [7]:
# from pyspark.sql import SparkSession

# spark = SparkSession.builder.appName("NoBroadcastExample").getOrCreate()
# Lookup dictionary mapping location codes to display names. It stands in for
# a much larger mapping (only a handful of entries are spelled out here).
location_map = {
    "WH1": "Warehouse 1 - New York",
    "WH2": "Warehouse 2 - Chicago",
    "ST1": "Site 1 - Miami",
    # ... imagine thousands of entries
    "WH1000": "Warehouse 1000 - Seattle"
}

# Sample equipment data: three records with a known code and one with no code.
data = spark.sparkContext.parallelize([
    {"id": 1, "location_code": "WH1"},
    {"id": 2, "location_code": "WH2"},
    {"id": 3, "location_code": "ST1"},
    {"id": 4, "location_code": None}
])

# Naive approach: Use the dictionary directly
def enrich_location(record):
    """Build ``{"id", "location_name"}`` for a record using the global ``location_map``.

    A falsy ``location_code`` (None, empty string, ...) yields "Uninstalled";
    a code that is missing from ``location_map`` yields "Unknown".
    """
    location_code = record["location_code"]
    enriched = {"id": record["id"]}
    if location_code:
        enriched["location_name"] = location_map.get(location_code, "Unknown")
    else:
        enriched["location_name"] = "Uninstalled"
    return enriched

# map() is lazy; collect() triggers the job and returns the enriched dicts to
# the driver. enrich_location refers to the plain global `location_map`
# (no broadcast variable is used in this version).
enriched_data = data.map(enrich_location)
print(enriched_data.collect())

[{'id': 1, 'location_name': 'Warehouse 1 - New York'}, {'id': 2, 'location_name': 'Warehouse 2 - Chicago'}, {'id': 3, 'location_name': 'Site 1 - Miami'}, {'id': 4, 'location_name': 'Uninstalled'}]


In [None]:
# from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# spark = SparkSession.builder.appName("MBSIntegration").getOrCreate()
# Sample equipment data: four rows, one of which has no location code.
equipment_data = spark.createDataFrame([
    (1, "Printer", "WH1"),
    (2, "Scanner", "WH2"),
    (3, "Monitor", None),
    (4, "Laptop", "ST1")
], ["equipment_id", "type", "location_code"])

# Lookup dictionary mapping location codes to display names; stands in for a
# much larger mapping. NOTE(review): this re-defines the `location_map` global
# from the previous cell with the same contents.
location_map = {
    "WH1": "Warehouse 1 - New York",
    "WH2": "Warehouse 2 - Chicago",
    "ST1": "Site 1 - Miami",
    # ... thousands of entries
    "WH1000": "Warehouse 1000 - Seattle"
}

# Broadcast the dictionary; the UDF below reads it through `broadcast_map.value`.
broadcast_map = spark.sparkContext.broadcast(location_map)

# UDF to use broadcast variable
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def get_location_name(code):
    """Translate a location code into its display name via ``broadcast_map``.

    Returns "Uninstalled" for a None code and "Unknown" for a code that is not
    present in the broadcast dictionary.
    """
    return "Uninstalled" if code is None else broadcast_map.value.get(code, "Unknown")

# Wrap the Python function as a column UDF that returns a string.
# (udf() only creates the callable; it is not registered under a SQL name.)
location_udf = udf(get_location_name, StringType())

# Enrich DataFrame: add a "location_name" column computed from "location_code".
enriched_df = equipment_data.withColumn(
    "location_name",
    location_udf(col("location_code"))
)

# truncate=False prints the full column values instead of shortening long strings.
enriched_df.show(truncate=False)

In [8]:
import logging
from pyspark.sql import SparkSession

# Configure logging with explicit style.
# force=True is required for this call to take effect in a notebook:
# basicConfig() does nothing when the root logger already has handlers (an
# earlier cell configures logging too), which leaves whatever formatter was
# installed before in place. With force=True the existing root handlers are
# removed and closed, and the format below is the one actually applied.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    style='%',  # Explicitly set style to % to match the format string
    handlers=[
        logging.FileHandler("simple_broadcast.log"),
        logging.StreamHandler()
    ],
    force=True,
)
logger = logging.getLogger(__name__)

# Initialize Spark session (getOrCreate reuses a running session if there is one)
logger.info("Initializing Spark session")
spark = SparkSession.builder.appName("SimpleBroadcast").getOrCreate()
sc = spark.sparkContext
logger.info("Spark session initialized successfully")

# Create a broadcast variable holding the origins of interest
focus_origins = ["Delhi", "Hyderabad", "Mumbai"]
logger.info(f"Broadcasting focus origins: {focus_origins}")
broadcast_origins = sc.broadcast(focus_origins)
logger.info("Broadcast variable created successfully")

# Load the dataset. The path is defined once so the log message reports the
# path that is really read (previously the message named an absolute path while
# the read used this relative one).
dataset_path = "Flight_Schedule.csv"
logger.info(f"Loading dataset from {dataset_path}")
df = spark.read.csv(dataset_path, header=True, inferSchema=True)
logger.info("Dataset loaded successfully")

# Convert to RDD so the filter below can run a plain Python function per row
logger.info("Converting DataFrame to RDD")
rdd = df.rdd
logger.info("DataFrame converted to RDD successfully")

# Filter flights using broadcast variable
def filter_origins(row):
    """Predicate for ``rdd.filter``: is the row's "origin" a broadcast focus origin?

    The candidate origins are read from the global ``broadcast_origins``
    broadcast variable.
    """
    origin_is_in_focus = row["origin"] in broadcast_origins.value
    return origin_is_in_focus

# rdd.filter() only declares the transformation; the rows are actually filtered
# when collect() runs below.
logger.info("Filtering RDD using broadcast variable")
filtered_rdd = rdd.filter(filter_origins)
logger.info("RDD filtering completed")

# Collect and print results (limited to first 10 rows to avoid IOPub error)
# collect() brings every matching row to the driver; only the printing is
# limited to 10 rows.
logger.info("Collecting filtered results")
filtered_flights = filtered_rdd.collect()
logger.info(f"Found {len(filtered_flights)} flights from focus origins")
logger.info("Printing first 10 filtered flights (or fewer if less exist)")
for flight in filtered_flights[:10]:  # Limit to first 10 rows
    print(flight)

# Stop Spark session. After this point `spark` and `sc` refer to a stopped
# session, so any cell run afterwards has to create a new one.
logger.info("Stopping Spark session")
spark.stop()
logger.info("Spark session stopped successfully")

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.9/logging/__init__.py", line 1079, in emit
    msg = self.format(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 923, in format
    return fmt.format(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 662, in format
    s = self.formatMessage(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 631, in formatMessage
    return self._style.format(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 430, in format
    return self._format(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 426, in _format
    return self._fmt % record.__dict__
TypeError: not enough arguments for format string
Call stack:
  File "/usr/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.9/dist-packages/ipy

Row(flightNumber='425', airline='GoAir', origin='Delhi', destination='Hyderabad', dayOfWeek='Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday', scheduledDepartureTime=datetime.datetime(2025, 4, 20, 5, 45), scheduledArrivalTime=None, validFrom='28-10-2018', validTo='30-03-2019')
Row(flightNumber='423', airline='GoAir', origin='Delhi', destination='Hyderabad', dayOfWeek='Saturday', scheduledDepartureTime=datetime.datetime(2025, 4, 20, 7, 30), scheduledArrivalTime=None, validFrom='28-10-2018', validTo='28-10-2018')
Row(flightNumber='423', airline='GoAir', origin='Delhi', destination='Hyderabad', dayOfWeek='Friday', scheduledDepartureTime=datetime.datetime(2025, 4, 20, 7, 30), scheduledArrivalTime=None, validFrom='03-11-2018', validTo='01-12-2018')
Row(flightNumber='423', airline='GoAir', origin='Delhi', destination='Hyderabad', dayOfWeek='Friday', scheduledDepartureTime=datetime.datetime(2025, 4, 20, 7, 30), scheduledArrivalTime=None, validFrom='02-02-2019', validTo='30-03-2019')


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.9/logging/__init__.py", line 1079, in emit
    msg = self.format(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 923, in format
    return fmt.format(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 662, in format
    s = self.formatMessage(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 631, in formatMessage
    return self._style.format(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 430, in format
    return self._format(record)
  File "/usr/lib/python3.9/logging/__init__.py", line 426, in _format
    return self._fmt % record.__dict__
TypeError: not enough arguments for format string
Call stack:
  File "/usr/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.9/dist-packages/ipy