In [11]:
# Comparison: Data Processing with Spark vs. Pandas
# -------------------------------------------------
#
# Goal of this Notebook:
# - Compare the data processing speed between Apache Spark and Pandas.
# - Analyze the data size limits where Pandas reaches its boundaries and Spark shows its advantages.
#
# Overview:
# - The first code block implements data processing with Spark.
# - The second code block implements the same logic using Pandas.
# - Both approaches are compared in terms of runtime and memory usage.
#
# Prerequisites:
# - Python version: 3.8 or higher
# - Apache Spark: 3.5.0
# - Installed libraries: pyspark, pandas, requests, folium
#
# Workflow:
# 1. Download and prepare the data.
# 2. Process the data using both approaches.
# 3. Measure and compare the runtime.
#
# Let's start by importing the necessary libraries and setting up the Spark environment.

In [12]:
from pyspark.sql import SparkSession
import requests
from io import BytesIO
import zipfile
from concurrent.futures import ThreadPoolExecutor
import tempfile
import os
from pyspark.sql.functions import col, to_timestamp, count, lit
import folium  # Import for map visualization
import time  # Import for time measurement

In [13]:
# --- Notebook-wide settings ---
# Directory for temporary files; change the value here to relocate it.
temp_storage_path = "./data/temp"

# Ensure the directory is present (no error if it already exists).
os.makedirs(name=temp_storage_path, exist_ok=True)

In [15]:
# ZIP archives (each wrapping a CSV) that the notebook downloads.
# Additional archives can be enabled by uncommenting the entries below.
csv_urls = [
    "https://web.ais.dk/aisdata/aisdk-2024-03-01.zip",
    "https://web.ais.dk/aisdata/aisdk-2024-03-02.zip",
    # "https://web.ais.dk/aisdata/aisdk-2024-03-03.zip",
    # "https://web.ais.dk/aisdata/aisdk-2024-03-04.zip",
    # "https://web.ais.dk/aisdata/aisdk-2024-03-05.zip",
]

In [16]:
# Step 1: Obtain the Spark session used by every Spark cell below
# (getOrCreate returns an already running session instead of starting a second one)
spark = (
    SparkSession.builder
    .appName("AIS Data Processing")
    .getOrCreate()
)

# Step 2: Function to download, extract, and save CSV files
def download_and_unzip_to_temp_csv(url, dest_dir=None, session=None, timeout=(10, 300)):
    """Download a ZIP archive and extract its first file to disk.

    The archive is streamed into an anonymous temporary file and the member is
    copied out in chunks, so neither the compressed nor the decompressed data
    has to fit into memory at once.

    Args:
        url: Address of the ZIP archive.
        dest_dir: Directory that receives the extracted file. Defaults to the
            system temporary directory (``tempfile.gettempdir()``).
        session: Optional object with a requests-style
            ``get(url, stream=..., timeout=...)`` method (for example a
            ``requests.Session``). Defaults to the ``requests`` module.
        timeout: Timeout forwarded to ``get``; by default a
            ``(connect, read)`` tuple in seconds.

    Returns:
        Path of the extracted file.

    Raises:
        ValueError: If the archive contains no file entries.
        Exception: Whatever ``raise_for_status()`` raises for a failed
            HTTP response.
    """
    chunk_size = 1024 * 1024  # 1 MiB per read/write step
    http = requests if session is None else session
    target_dir = tempfile.gettempdir() if dest_dir is None else dest_dir

    with tempfile.TemporaryFile() as archive_file:
        # Stream the archive to disk instead of buffering response.content.
        response = http.get(url, stream=True, timeout=timeout)
        try:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    archive_file.write(chunk)
        finally:
            response.close()
        archive_file.seek(0)

        with zipfile.ZipFile(archive_file, 'r') as z:
            # Directory entries cannot be extracted as a file, so skip them.
            members = [info for info in z.infolist() if not info.is_dir()]
            if not members:
                raise ValueError(f"ZIP archive at {url} contains no files")
            member = members[0]

            # Use only the base name: a member path such as "../x.csv" or
            # "folder/x.csv" must not steer the write outside target_dir or
            # into a sub-folder that does not exist.
            temp_file_path = os.path.join(target_dir, os.path.basename(member.filename))

            with z.open(member) as csv_file, open(temp_file_path, "wb") as temp_file:
                for block in iter(lambda: csv_file.read(chunk_size), b""):
                    temp_file.write(block)

    return temp_file_path

# Step 4: Parallel downloading and storing CSV files in temporary paths

# Count the total number of URLs
total_urls = len(csv_urls)

# Start the timer. perf_counter() is a monotonic clock, so the measured
# duration is not distorted if the system clock is adjusted mid-download.
start_time = time.perf_counter()

# Download and extract the archives concurrently; executor.map keeps the
# resulting paths in the same order as csv_urls.
with ThreadPoolExecutor(max_workers=10) as executor:
    csv_file_paths = list(executor.map(download_and_unzip_to_temp_csv, csv_urls))

# Stop the timer
end_time = time.perf_counter()

# Convert elapsed time into whole minutes and remaining whole seconds
elapsed_time = end_time - start_time
minutes, seconds = divmod(int(elapsed_time), 60)
download_time = f"The download time for {total_urls} files with Spark is {minutes} minutes and {seconds} seconds."

# Print the download time
print(download_time)

# Automatically update the README file
README_SECTION_HEADER = "### Download Time Results with Spark"
# Every result line written by this notebook starts with this text; it is used
# to recognise results from earlier runs.
README_RESULT_PREFIX = "The download time for"


def replace_readme_result(lines, header, result_line, stale_prefix):
    """Return README lines with the result under ``header`` replaced.

    Args:
        lines: Existing README content as a list of lines.
        header: Section heading (compared against each stripped line).
        result_line: New result line, including its trailing newline.
        stale_prefix: Lines inside the section that start with this text are
            treated as results of earlier runs and removed.

    Returns:
        A new list of lines. ``result_line`` is placed directly below each
        occurrence of ``header``. If the header does not occur, the section
        is appended at the end.
    """
    updated = []
    section_found = False
    in_section = False
    for line in lines:
        stripped = line.strip()
        if stripped == header:
            # Guarantee a line break so the result never ends up on the
            # heading line when the heading is the file's last line.
            updated.append(line if line.endswith("\n") else line + "\n")
            updated.append(result_line)
            section_found = True
            in_section = True
        elif in_section and stripped.startswith(stale_prefix):
            # Result of a previous run: drop it so results do not pile up.
            continue
        else:
            if stripped.startswith("#"):
                # Another heading ends the results section.
                in_section = False
            updated.append(line)

    # If the section was not found, append it
    if not section_found:
        updated.append(f"\n{header}\n")
        updated.append(result_line)
    return updated


try:
    # Build the new line first; everything after this is file I/O.
    result_line = f"{download_time}\n"

    # Open the file and read its contents
    with open("README.md", "r") as readme:
        lines = readme.readlines()

    updated_lines = replace_readme_result(
        lines, README_SECTION_HEADER, result_line, README_RESULT_PREFIX
    )

    # Overwrite the file
    with open("README.md", "w") as readme:
        readme.writelines(updated_lines)

    print("The download time and URL count have been successfully updated in the README.")
except FileNotFoundError:
    # If the file does not exist, create it
    with open("README.md", "w") as readme:
        readme.write(f"{README_SECTION_HEADER}\n")
        readme.write(result_line)
    print("The README file was created, and the download time has been added.")
except Exception as e:
    # Best effort: a README problem must not abort the notebook run.
    print(f"Error writing to the README file: {e}")

Die Download-Zeit für 2 Dateien mit Spark beträgt 6 Minuten und 46 Sekunden.
Die Download-Zeit und URL-Anzahl wurden erfolgreich aktualisiert.


In [None]:
# Step 5: Read CSV files with Spark and combine them
# Fail early with a clear message instead of an IndexError further down.
if not csv_file_paths:
    raise ValueError("No CSV files are available to load into Spark.")

# Create a list of DataFrames for each CSV file
# (header=True takes column names from the first row, inferSchema=True lets Spark infer column types)
dataframes = [spark.read.csv(path, header=True, inferSchema=True) for path in csv_file_paths]

# Combine all DataFrames into a single large DataFrame.
# unionByName matches columns by name, whereas union matches them by position;
# a file with a different column layout therefore raises an error instead of
# being combined with values in the wrong columns.
combined_df = dataframes[0]
for df in dataframes[1:]:
    combined_df = combined_df.unionByName(df)

In [16]:
# Step 6: Preview the first rows, e.g. to see which MMSI numbers occur
combined_df.show(n=10)

# Report how many rows the unfiltered dataset holds
original_row_count = combined_df.count()
print(f"The original dataset contains {original_row_count} entries.")

+-------------------+--------------+---------+---------+---------+--------------------+----+----+-----+-------+-------+--------+----+---------+----------+-----+------+------------------------------+-------+-----------+----+----------------+----+----+----+----+
|        # Timestamp|Type of mobile|     MMSI| Latitude|Longitude| Navigational status| ROT| SOG|  COG|Heading|    IMO|Callsign|Name|Ship type|Cargo type|Width|Length|Type of position fixing device|Draught|Destination| ETA|Data source type|   A|   B|   C|   D|
+-------------------+--------------+---------+---------+---------+--------------------+----+----+-----+-------+-------+--------+----+---------+----------+-----+------+------------------------------+-------+-----------+----+----------------+----+----+----+----+
|01/03/2024 00:00:00|       Class A|219000873| 56.99091|10.304543|Under way using e...|NULL| 0.0| 30.2|   NULL|Unknown| Unknown|NULL|Undefined|      NULL| NULL|  NULL|                     Undefined|   NULL|    Unknown



Der Originale Datensatz hat 31817670 Einträge.


                                                                                

In [None]:
# Step 5: Read CSV files with Spark and combine them
# NOTE(review): this cell repeats the Step 5 cell that appears earlier in the
# notebook; running it rebuilds combined_df from the CSV files again.
# Fail early with a clear message instead of an IndexError further down.
if not csv_file_paths:
    raise ValueError("No CSV files are available to load into Spark.")

# Create a list of DataFrames for each CSV file
# (header=True takes column names from the first row, inferSchema=True lets Spark infer column types)
dataframes = [spark.read.csv(path, header=True, inferSchema=True) for path in csv_file_paths]

# Combine all DataFrames into one large DataFrame.
# unionByName matches columns by name, whereas union matches them by position;
# a file with a different column layout therefore raises an error instead of
# being combined with values in the wrong columns.
combined_df = dataframes[0]
for df in dataframes[1:]:
    combined_df = combined_df.unionByName(df)

In [17]:
########################################################
# 2. Remove rows whose "Type of mobile" is "Base Station" (they do not display navigation data)
########################################################

mobile_type_column = "Type of mobile"

if mobile_type_column not in combined_df.columns:
    # Nothing to filter on: leave the data untouched and tell the user
    print("Warning: 'Type of mobile' column not found, skipping this step.")
else:
    is_not_base_station = col(mobile_type_column) != "Base Station"
    combined_df = combined_df.filter(is_not_base_station)

# Report the row count after this step
adjusted_row_count = combined_df.count()
print(f"The adjusted dataset contains {adjusted_row_count} entries.")



Der angepasste Datensatz hat 29508390 Einträge.


                                                                                

In [18]:
########################################################
# 3. Keep only relevant columns to reduce data size
########################################################

relevant_columns = ["MMSI", "Latitude", "Longitude", "# Timestamp"]
combined_df = combined_df.select(relevant_columns)

########################################################
# 4. Convert Timestamp to datetime format
########################################################

# Parse the day/month/year text and store the result under the same column name
parsed_timestamp = to_timestamp(col("# Timestamp"), "dd/MM/yyyy HH:mm:ss")
combined_df = combined_df.withColumn("# Timestamp", parsed_timestamp)

########################################################
# 5. Filter MMSI numbers with enough data points to display meaningful routes
########################################################

# Number of rows recorded for each MMSI
points_per_mmsi = count("*").alias("count")
mmsi_counts = combined_df.groupBy("MMSI").agg(points_per_mmsi)

# Minimum number of rows an MMSI needs to be kept
threshold = 50

# Bring the qualifying MMSI values to the driver as a plain Python list
frequent_mmsi_rows = mmsi_counts.filter(col("count") >= threshold).select("MMSI").collect()
valid_mmsi = [row[0] for row in frequent_mmsi_rows]

# Rows belonging to an MMSI that reached the threshold
filtered_by_count_df = combined_df.filter(col("MMSI").isin(valid_mmsi))

                                                                                

In [19]:
########################################################
# 6. Filter by specific MMSI and time range + plot the route
########################################################

mmsi_number = 219016832  # Replace with your desired MMSI

# Define start and end timestamps (in the format "dd/MM/yyyy HH:mm:ss")
start_str = "01/03/2024 00:00:00"  # Start time
end_str = "01/03/2024 06:59:59"    # End time

# Turn the start and end strings into Spark timestamp expressions
start_dt = to_timestamp(lit(start_str), "dd/MM/yyyy HH:mm:ss")
end_dt = to_timestamp(lit(end_str), "dd/MM/yyyy HH:mm:ss")

# Check if the MMSI has enough data points
if mmsi_number not in valid_mmsi:
    print(f"MMSI {mmsi_number} does not have enough data points to display a meaningful route.")
else:
    # Filter by MMSI and time range (both bounds inclusive), oldest point first
    route_df = filtered_by_count_df.filter(
        (col("MMSI") == mmsi_number) &
        (col("# Timestamp") >= start_dt) &
        (col("# Timestamp") <= end_dt)
    ).orderBy("# Timestamp")

    # Materialize the route once. Calling count() and then toPandas() would
    # run the whole Spark pipeline twice; the emptiness check is done on the
    # pandas result instead.
    pandas_df = route_df.toPandas()

    # A point without latitude or longitude cannot be placed on the map.
    plot_df = pandas_df.dropna(subset=["Latitude", "Longitude"])

    # Check if filtered data is available
    if plot_df.empty:
        print(f"No data for MMSI {mmsi_number} between {start_str} and {end_str}")
    else:
        # Centre the map on the mean position of the route
        mean_lat = plot_df["Latitude"].mean()
        mean_lon = plot_df["Longitude"].mean()

        route_map = folium.Map(location=[mean_lat, mean_lon], zoom_start=8)

        # Create a list of [latitude, longitude] pairs for the PolyLine
        coords = plot_df[["Latitude", "Longitude"]].values.tolist()

        # Add the PolyLine to the map
        folium.PolyLine(coords, color="blue", weight=2.5, opacity=1).add_to(route_map)

        # Save the map
        route_map.save("ship_route.html")
        print("Route successfully saved as 'ship_route.html'.")

                                                                                

Route wurde erfolgreich als 'ship_route.html' gespeichert.
