In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("AdvancedSpark").getOrCreate()

spark

In [9]:
sc = spark.sparkContext

In [10]:

from pyspark import AccumulatorParam

class MultiCounterAccumulator(AccumulatorParam):
    def zero(self, initialValue):
        return {"null_arrival": 0, "weekend_flights": 0}

    def addInPlace(self, v1,v2):
        v1["null_arrival"] += v2["null_arrival"]
        v1["weekend_flights"] += v2["weekend_flights"]
        return v1

multi_counter = sc.accumulator(
    {"null_arrival":0, "weekend_flights": 0},
    MultiCounterAccumulator()
)

df = spark.read.csv("Flight_Schedule.csv", header = True, inferSchema = True)

rdd = df.rdd

def update_counters(row):
    counts = {"null_arrival": 0, "weekend_flights": 0}
    # Check for null arrival time
    if row["scheduledArrivalTime"] is None:
        counts["null_arrival"] += 1
    # Check if the flight operates on a weekend day
    days = row["dayOfWeek"].split(",") if row["dayOfWeek"] else []
    if "Saturday" in days or "Sunday" in days:
        counts["weekend_flights"] += 1
    return counts

rdd.foreach(lambda row: multi_counter.add(update_counters(row)))
results = multi_counter.value
print(f"Null Arrival Times: {results['null_arrival']}")
print(f"Weekend Flights: {results['weekend_flights']}")

Null Arrival Times: 10774
Weekend Flights: 28354


In [11]:
import logging
from pyspark import AccumulatorParam

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("flight_schedule_analysis.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class MultiCounterAccumulator(AccumulatorParam):
    def zero(self, initialValue):
        logger.info("Setting initial accumulator value")
        return {"null_arrival": 0, "weekend_flights": 0}
    
    def addInPlace(self, v1, v2):
        logger.debug(f"Adding accumulator values: {v1} + {v2}")
        v1["null_arrival"] += v2["null_arrival"]
        v1["weekend_flights"] += v2["weekend_flights"]
        return v1

# Create an accumulator
logger.info("Creating accumulator with initial values")
multi_counter = sc.accumulator(
    {"null_arrival": 0, "weekend_flights": 0},
    MultiCounterAccumulator()
)
logger.info("Accumulator created successfully")

# Load the dataset
logger.info("Flight_Schedule.csv")
df = spark.read.csv("Flight_Schedule.csv", header=True, inferSchema=True)
logger.info("Dataset loaded successfully")

# Convert to RDD for lower-level control
logger.info("Converting DataFrame to RDD")
rdd = df.rdd
logger.info("DataFrame converted to RDD successfully")

# Function to update accumulator for each row
def update_counters(row):
    counts = {"null_arrival": 0, "weekend_flights": 0}
    logger.debug(f"Processing row: {row}")
    
    # Check for null arrival time
    if row["scheduledArrivalTime"] is None:
        counts["null_arrival"] += 1
        logger.debug(f"Found null scheduledArrivalTime in row: {row}")
    else:
        logger.debug(f"scheduledArrivalTime is not null in row: {row}")
    
    # Check if the flight operates on a weekend day
    days = row["dayOfWeek"].split(",") if row["dayOfWeek"] else []
    if "Saturday" in days or "Sunday" in days:
        counts["weekend_flights"] += 1
        logger.debug(f"Found weekend flight (Saturday/Sunday) in row: {row}")
    else:
        logger.debug(f"No weekend flight in row: {row}")
    
    return counts

# Process RDD and update accumulator
logger.info("Processing RDD and updating accumulator")
rdd.foreach(lambda row: multi_counter.add(update_counters(row)))
logger.info("RDD processing completed")

# Print results
results = multi_counter.value
logger.info(f"Final accumulator results: {results}")
print(f"Null Arrival Times: {results['null_arrival']}")
print(f"Weekend Flights: {results['weekend_flights']}")

# Stop Spark session
logger.info("Stopping Spark session")
# spark.stop()
logger.info("Spark session stopped successfully")

2025-04-19 12:02:47,559 - INFO - Creating accumulator with initial values
2025-04-19 12:02:47,567 - INFO - Accumulator created successfully
2025-04-19 12:02:47,581 - INFO - Flight_Schedule.csv
2025-04-19 12:02:48,551 - INFO - Dataset loaded successfully
2025-04-19 12:02:48,553 - INFO - Converting DataFrame to RDD
2025-04-19 12:02:48,651 - INFO - DataFrame converted to RDD successfully
2025-04-19 12:02:48,654 - INFO - Processing RDD and updating accumulator
2025-04-19 12:02:48,665 - INFO - Setting initial accumulator value
2025-04-19 12:02:50,672 - INFO - RDD processing completed
2025-04-19 12:02:50,674 - INFO - Final accumulator results: {'null_arrival': 10774, 'weekend_flights': 28354}
2025-04-19 12:02:50,677 - INFO - Stopping Spark session
2025-04-19 12:02:50,680 - INFO - Spark session stopped successfully


Null Arrival Times: 10774
Weekend Flights: 28354


In [12]:
logging.basicConfig(
    level = logging.INFO,
    format = '%(asctime)s - %(levelname)s - %(message)s',
    handlers = [
        logging.FileHandler("simple_accumulator.log"),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)

logger.info("Initializing Spark session")
logger.info("Spark session initialized successfully")

logger.info("Creating accumulator for null arrival times")
null_arrival_counter = sc.accumulator(0)
logger.info("Accumulator created with initial value: 0")
logger.info("Flight_Schedule.csv")
df = spark.read.csv("Flight_Schedule.csv", header=True, inferSchema=True)
logger.info("Dataset loaded successfully")

logger.info("Converting DataFrame to RDD")
rdd = df.rdd
logger.info("DataFrame converted to RDD to successfully")

def check_null_arrival(row):
    if row["scheduledArrivalTime"] is None:
        logger.debug(f"Found null scheduledArrivalTime in row: {row}")
        null_arrival_counter.add(1)
    else:
        logger.debug(f"scheduledArrivalTime is not null in row: {row}")

logger.info("Processing RDD to count null arrival times")
rdd.foreach(check_null_arrival)
logger.info("RDD processing completed")

logger.info(f"Total null arrival times: {null_arrival_counter.value}")
print(f"Total Null Arrival Times: {null_arrival_counter.value}")

logger.info("Stopping Spark session")

2025-04-19 13:07:15,679 - INFO - Initializing Spark session
2025-04-19 13:07:15,682 - INFO - Spark session initialized successfully
2025-04-19 13:07:15,683 - INFO - Creating accumulator for null arrival times
2025-04-19 13:07:15,685 - INFO - Accumulator created with initial value: 0
2025-04-19 13:07:15,686 - INFO - Flight_Schedule.csv
2025-04-19 13:07:16,408 - INFO - Dataset loaded successfully
2025-04-19 13:07:16,410 - INFO - Converting DataFrame to RDD
2025-04-19 13:07:16,470 - INFO - DataFrame converted to RDD to successfully
2025-04-19 13:07:16,472 - INFO - Processing RDD to count null arrival times
2025-04-19 13:07:17,428 - INFO - RDD processing completed
2025-04-19 13:07:17,430 - INFO - Total null arrival times: 10774
2025-04-19 13:07:17,432 - INFO - Stopping Spark session


Total Null Arrival Times: 10774


In [13]:
import logging

logging.basicConfig(
    level=logging.DEBUG,  # Changed from INFO to DEBUG
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("simple_accumulator.log"),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)

logger.info("Creating accumulator for null arrival times")
null_arrival_counter = sc.accumulator(0)
logger.info("Accumulator created with initial value: 0")
logger.info("Flight_Schedule.csv")
df = spark.read.csv("Flight_Schedule.csv", header=True, inferSchema=True)
logger.info("Dataset loaded successfully")

logger.info("Converting DataFrame to RDD")
rdd = df.rdd
logger.info("DataFrame converted to RDD successfully")  # Fixed typo

# Function to check for null arrival times
def check_null_arrival(row):
    if row["scheduledArrivalTime"] is None:
        logger.debug(f"Found null scheduledArrivalTime in row: {row}")
        null_arrival_counter.add(1)
    else:
        logger.debug(f"scheduledArrivalTime is not null in row: {row}")

# Process RDD
logger.info("Processing RDD to count null arrival times")
rdd.foreach(check_null_arrival)
logger.info("RDD processing completed")

# Print result
logger.info(f"Total null arrival times: {null_arrival_counter.value}")
print(f"Total Null Arrival Times: {null_arrival_counter.value}")

# Stop Spark session
logger.info("Stopping Spark session")

2025-04-19 13:16:29,438 - INFO - Creating accumulator for null arrival times
2025-04-19 13:16:29,440 - INFO - Accumulator created with initial value: 0
2025-04-19 13:16:29,442 - INFO - Flight_Schedule.csv
2025-04-19 13:16:30,010 - INFO - Dataset loaded successfully
2025-04-19 13:16:30,012 - INFO - Converting DataFrame to RDD
2025-04-19 13:16:30,103 - INFO - DataFrame converted to RDD successfully
2025-04-19 13:16:30,105 - INFO - Processing RDD to count null arrival times
2025-04-19 13:16:30,922 - INFO - RDD processing completed
2025-04-19 13:16:30,923 - INFO - Total null arrival times: 10774
2025-04-19 13:16:30,924 - INFO - Stopping Spark session


Total Null Arrival Times: 10774


In [14]:
import logging
# from pyspark.sql import SparkSession

# Configure logging with DEBUG level
logging.basicConfig(
    level=logging.DEBUG,  # Changed from INFO to DEBUG
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("simple_accumulator.log"),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)

# Initialize Spark session
# logger.info("Initializing Spark session")
# spark = SparkSession.builder.appName("SimpleAccumulator").getOrCreate()
# sc = spark.sparkContext
# logger.info("Spark session initialized successfully")

# Create a simple accumulator
logger.info("Creating accumulator for null arrival times")
null_arrival_counter = sc.accumulator(0)
logger.info("Accumulator created with initial value: 0")

# Load the dataset
logger.info("Loading dataset from /opt/notebooks/Flight_Schedule.csv")
df = spark.read.csv("Flight_Schedule.csv", header=True, inferSchema=True)
logger.info("Dataset loaded successfully")

# Convert to RDD
logger.info("Converting DataFrame to RDD")
rdd = df.rdd
logger.info("DataFrame converted to RDD successfully")  # Fixed typo

# Function to check for null arrival times
def check_null_arrival(row):
    if row["scheduledArrivalTime"] is None:
        logger.debug(f"Found null scheduledArrivalTime in row: {row}")
        null_arrival_counter.add(1)
    else:
        logger.debug(f"scheduledArrivalTime is not null in row: {row}")

# Process RDD
logger.info("Processing RDD to count null arrival times")
rdd.foreach(check_null_arrival)
logger.info("RDD processing completed")

# Print result
logger.info(f"Total null arrival times: {null_arrival_counter.value}")
print(f"Total Null Arrival Times: {null_arrival_counter.value}")

# Stop Spark session
logger.info("Stopping Spark session")
spark.stop()
logger.info("Spark session stopped successfully")

2025-04-19 13:18:42,720 - INFO - Creating accumulator for null arrival times
2025-04-19 13:18:42,722 - INFO - Accumulator created with initial value: 0
2025-04-19 13:18:42,724 - INFO - Loading dataset from /opt/notebooks/Flight_Schedule.csv
2025-04-19 13:18:43,173 - INFO - Dataset loaded successfully
2025-04-19 13:18:43,174 - INFO - Converting DataFrame to RDD
2025-04-19 13:18:43,202 - INFO - DataFrame converted to RDD successfully
2025-04-19 13:18:43,204 - INFO - Processing RDD to count null arrival times
2025-04-19 13:18:43,996 - INFO - RDD processing completed
2025-04-19 13:18:43,998 - INFO - Total null arrival times: 10774
2025-04-19 13:18:43,999 - INFO - Stopping Spark session


Total Null Arrival Times: 10774


2025-04-19 13:18:45,003 - INFO - Spark session stopped successfully


In [None]:
import logging
from pyspark.sql import SparkSession
from pyspark import AccumulatorParam

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # We can keep this at INFO since we'll log debug info differently
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("simple_accumulator.log"),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)

# Initialize Spark session
logger.info("Initializing Spark session")
spark = SparkSession.builder.appName("SimpleAccumulator").getOrCreate()
sc = spark.sparkContext
logger.info("Spark session initialized successfully")

# Create a simple accumulator for counting null arrival times
logger.info("Creating accumulator for null arrival times")
null_arrival_counter = sc.accumulator(0)
logger.info("Accumulator created with initial value: 0")

# Create a custom accumulator to collect debug information
class ListAccumulator(AccumulatorParam):
    def zero(self, initialValue):
        return []
    
    def addInPlace(self, v1, v2):
        v1.extend(v2)
        return v1

# Create an accumulator to collect debug messages
logger.info("Creating accumulator for debug messages")
debug_messages = sc.accumulator([], ListAccumulator())
logger.info("Debug messages accumulator created successfully")

# Load the dataset
logger.info("Loading dataset from /opt/notebooks/Flight_Schedule.csv")
df = spark.read.csv("/opt/notebooks/Flight_Schedule.csv", header=True, inferSchema=True)
logger.info("Dataset loaded successfully")

# Convert to RDD
logger.info("Converting DataFrame to RDD")
rdd = df.rdd
logger.info("DataFrame converted to RDD successfully")

# Function to check for null arrival times and collect debug info
def check_null_arrival(row):
    if row["scheduledArrivalTime"] is None:
        debug_messages.add([f"Found null scheduledArrivalTime in row: {row}"])
        null_arrival_counter.add(1)
    else:
        debug_messages.add([f"scheduledArrivalTime is not null in row: {row}"])

# Process RDD
logger.info("Processing RDD to count null arrival times and collect debug info")
rdd.foreach(check_null_arrival)
logger.info("RDD processing completed")

# Log the collected debug messages
logger.info("Logging collected debug messages")
for msg in debug_messages.value:
    logger.info(msg)

# Print result
logger.info(f"Total null arrival times: {null_arrival_counter.value}")
print(f"Total Null Arrival Times: {null_arrival_counter.value}")

# Stop Spark session
logger.info("Stopping Spark session")
spark.stop()
logger.info("Spark session stopped successfully")

2025-04-19 13:23:03,803 - INFO - Initializing Spark session
2025-04-19 13:23:03,965 - INFO - Spark session initialized successfully
2025-04-19 13:23:03,966 - INFO - Creating accumulator for null arrival times
2025-04-19 13:23:03,967 - INFO - Accumulator created with initial value: 0
2025-04-19 13:23:03,969 - INFO - Creating accumulator for debug messages
2025-04-19 13:23:03,970 - INFO - Debug messages accumulator created successfully
2025-04-19 13:23:03,972 - INFO - Loading dataset from /opt/notebooks/Flight_Schedule.csv
2025-04-19 13:23:04,492 - INFO - Dataset loaded successfully
2025-04-19 13:23:04,493 - INFO - Converting DataFrame to RDD
2025-04-19 13:23:04,544 - INFO - DataFrame converted to RDD successfully
2025-04-19 13:23:04,546 - INFO - Processing RDD to count null arrival times and collect debug info
2025-04-19 13:23:06,392 - INFO - RDD processing completed
2025-04-19 13:23:06,394 - INFO - Logging collected debug messages
2025-04-19 13:23:06,395 - INFO - Found null scheduledAr