In [2]:
from pyspark.sql import SparkSession
import requests
from io import BytesIO
import zipfile
from concurrent.futures import ThreadPoolExecutor
import tempfile
import os
from pyspark.sql.functions import col, to_timestamp, count, lit
import folium  # Import for map visualization
import time  # Import for time measurement

In [3]:
# Global configuration variables
# Directory in which extracted CSV files are stored (relative to the notebook's working directory).
local_storage_path = "./data/csv_files"  # Configurable storage path for CSV files
# exist_ok=True makes this cell safe to re-run: an already existing directory is not an error.
os.makedirs(local_storage_path, exist_ok=True)  # Create the directory if it does not exist

In [4]:
# List of CSV (ZIP) URLs: one ZIP archive per day of AIS data.
csv_urls = [
    "https://web.ais.dk/aisdata/aisdk-2024-03-01.zip",
    "https://web.ais.dk/aisdata/aisdk-2024-03-02.zip"]
# Further days, currently disabled; move them into the list above for a larger run:
#   "https://web.ais.dk/aisdata/aisdk-2024-03-03.zip",
#   "https://web.ais.dk/aisdata/aisdk-2024-03-04.zip",
#   "https://web.ais.dk/aisdata/aisdk-2024-03-05.zip"

### How does the system behave under Node/CPU/Memory/Hardware/... errors and failures?

### Simulating memory error

### OLD VERSION

In [1]:
from pyspark.sql import SparkSession
import requests
from io import BytesIO
import zipfile
from concurrent.futures import ThreadPoolExecutor
import tempfile
import os

# Step 1: Create the Spark session
# Default variant, kept for reference:
# spark = SparkSession.builder \
#     .appName("AIS Data Processing") \
#     .getOrCreate()

# Constrained variant: two local threads, a 256m executor-memory setting and
# two shuffle partitions.
spark = (
    SparkSession.builder
    .appName("AIS Data Processing")
    .master("local[2]")
    .config("spark.executor.memory", "256m")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

# Step 2: Function to download, extract, and save CSV files
def download_and_unzip_to_temp_csv(url):
    """Download a ZIP archive and extract its first member into the temp directory.

    Args:
        url: URL of a ZIP archive; its first member is taken to be the CSV file.

    Returns:
        Path of the extracted file inside ``tempfile.gettempdir()``.

    Raises:
        requests.RequestException: on connection problems, timeouts or an HTTP
            error status.
        zipfile.BadZipFile: if the payload is not a valid ZIP archive.
        IndexError: if the archive contains no members.
    """
    import shutil  # local import so the cell does not depend on a new top-level import

    # (connect, read) timeout: a stalled connection must not block a worker thread forever.
    response = requests.get(url, timeout=(10, 120))
    response.raise_for_status()
    with zipfile.ZipFile(BytesIO(response.content), 'r') as z:
        member_name = z.namelist()[0]  # name of the CSV file inside the ZIP archive
        # Use only the base name: a member stored under a directory would otherwise
        # point outside (or into a missing sub-directory of) the temp directory.
        temp_file_path = os.path.join(tempfile.gettempdir(), os.path.basename(member_name))
        with z.open(member_name) as csv_file, open(temp_file_path, "wb") as temp_file:
            # Copy in chunks instead of holding the whole decompressed CSV in memory.
            shutil.copyfileobj(csv_file, temp_file)
    return temp_file_path

# Step 3: List of ZIP URLs
csv_urls = [
    "https://web.ais.dk/aisdata/aisdk-2024-03-01.zip",
    "https://web.ais.dk/aisdata/aisdk-2024-03-02.zip",
    "https://web.ais.dk/aisdata/aisdk-2024-03-03.zip",
    "https://web.ais.dk/aisdata/aisdk-2024-03-04.zip",
    "https://web.ais.dk/aisdata/aisdk-2024-03-05.zip"
]

# Step 4: Download the archives in parallel and store the CSV files at temporary paths
with ThreadPoolExecutor(max_workers=10) as executor:
    csv_file_paths = list(executor.map(download_and_unzip_to_temp_csv, csv_urls))

# Step 5: Read the CSV files with Spark and combine them
# Build one DataFrame per CSV file (header row used for column names, types inferred per file)
dataframes = [spark.read.csv(path, header=True, inferSchema=True) for path in csv_file_paths]

# Combine all DataFrames into one large DataFrame
# NOTE(review): union() matches columns by position, so this presumably relies on
# every file having the same column order - confirm.
combined_df = dataframes[0]
for df in dataframes[1:]:
    combined_df = combined_df.union(df)

# Step 6: Print a few sample rows to show possible MMSI numbers
combined_df.show(10)

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: org.apache.spark.SparkIllegalArgumentException: [INVALID_EXECUTOR_MEMORY] Executor memory 268435456 must be at least 471859200. Please increase executor memory using the --executor-memory option or "spark.executor.memory" in Spark configuration.
	at org.apache.spark.memory.UnifiedMemoryManager$.getMaxMemory(UnifiedMemoryManager.scala:230)
	at org.apache.spark.memory.UnifiedMemoryManager$.apply(UnifiedMemoryManager.scala:201)
	at org.apache.spark.SparkEnv$.create(SparkEnv.scala:320)
	at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:194)
	at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:284)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:478)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(Unknown Source)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(Unknown Source)
	at java.lang.reflect.Constructor.newInstance(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)


### NEW VERSION

In [7]:
# Step 1: Create a Spark session
spark = SparkSession.builder.appName("AIS Data Processing").getOrCreate()

# Request the constrained configuration (two local threads, 256m executor memory,
# two shuffle partitions).
# NOTE(review): a session already exists at this point, so getOrCreate() presumably
# returns it and the master / executor-memory settings below may not take effect -
# confirm against the Spark UI if the memory limit is meant to be active.
spark = (
    SparkSession.builder
    .appName("AIS Data Processing")
    .master("local[2]")
    .config("spark.executor.memory", "256m")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

# Step 2: Function to download, extract, and save CSV files locally if not already present
def download_and_unzip_to_csv(url):
    """Download a ZIP archive and store its first member as a CSV in ``local_storage_path``.

    The CSV name is the last path segment of ``url`` with ``.zip`` replaced by
    ``.csv``. If that file already exists, nothing is downloaded.

    Args:
        url: URL of the ZIP archive.

    Returns:
        Path of the CSV file (pre-existing or newly written).

    Raises:
        requests.RequestException: on connection problems, timeouts or an HTTP
            error status.
        zipfile.BadZipFile: if the payload is not a valid ZIP archive.
        IndexError: if the archive contains no members.
    """
    import shutil  # local import so the cell does not depend on a new top-level import

    # Extract ZIP filename and corresponding CSV filename
    zip_filename = url.split("/")[-1]
    csv_filename = zip_filename.replace(".zip", ".csv")
    csv_filepath = os.path.join(local_storage_path, csv_filename)

    # Check if the CSV file already exists
    if os.path.exists(csv_filepath):
        print(f"File already exists: {csv_filepath}, skipping download.")
        return csv_filepath  # Return the path to the existing file

    # Download and extract the ZIP file
    print(f"Downloading and extracting: {url}")
    # (connect, read) timeout: a stalled connection must not block a worker thread forever.
    response = requests.get(url, timeout=(10, 120))
    response.raise_for_status()

    # Extract to a side file and rename only on success. Otherwise an interrupted
    # extraction would leave a truncated CSV that the existence check above
    # treats as a finished download on the next run.
    partial_path = csv_filepath + ".part"
    try:
        with zipfile.ZipFile(BytesIO(response.content), 'r') as z:
            with z.open(z.namelist()[0]) as csv_file, open(partial_path, "wb") as output_file:
                # Copy in chunks instead of holding the whole decompressed CSV in memory.
                shutil.copyfileobj(csv_file, output_file)
        os.replace(partial_path, csv_filepath)
    finally:
        # After a successful rename the side file is gone; this only cleans up failures.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    return csv_filepath  # Return the path to the saved file

# Step 4: Parallel downloading and storing CSV files locally

# Start the timer
start_time = time.time()

# Count the total number of URLs
total_urls = len(csv_urls)

# Parallel downloading and storing CSV files
# (executor.map yields results in the order of csv_urls and re-raises a worker's exception)
with ThreadPoolExecutor(max_workers=10) as executor:
    csv_file_paths = list(executor.map(download_and_unzip_to_csv, csv_urls))

# Stop the timer
end_time = time.time()

# Convert elapsed time into minutes and seconds
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)
download_time = f"The download time for {total_urls} files with Spark is {minutes} minutes and {seconds} seconds."

# Print the download time
print(download_time)

# Automatically update the README file
readme_header = "### Download Time Results with Spark"
result_prefix = "The download time for "  # start of every result line written by this cell
try:
    # Open the file and read its contents
    with open("README.md", "r") as readme:
        lines = readme.readlines()

    # Rebuild the file: keep everything, but make the section hold only the new result.
    updated_lines = []
    section_found = False
    in_section = False
    for line in lines:
        stripped = line.strip()
        if stripped == readme_header:
            # Keep the heading (ensuring it ends with a newline) and put the new value below it
            updated_lines.append(line if line.endswith("\n") else line + "\n")
            updated_lines.append(f"{download_time}\n")
            section_found = True
            in_section = True
        elif in_section and stripped.startswith(result_prefix):
            # Result of an earlier run: drop it so the section does not grow on every run
            continue
        else:
            if stripped.startswith("#"):
                in_section = False  # another heading ends the results section
            updated_lines.append(line)

    # If the section was not found, append it
    if not section_found:
        updated_lines.append("\n### Download Time Results with Spark\n")
        updated_lines.append(f"{download_time}\n")

    # Overwrite the file
    with open("README.md", "w") as readme:
        readme.writelines(updated_lines)

    print("The download time and URL count have been successfully updated in the README.")
except FileNotFoundError:
    # If the file does not exist, create it
    with open("README.md", "w") as readme:
        readme.write("### Download Time Results with Spark\n")
        readme.write(f"{download_time}\n")
    print("The README file was created, and the download time has been added.")
except Exception as e:
    print(f"Error writing to the README file: {e}")

Downloading and extracting: https://web.ais.dk/aisdata/aisdk-2024-03-01.zip
Downloading and extracting: https://web.ais.dk/aisdata/aisdk-2024-03-02.zip
The download time for 2 files with Spark is 3 minutes and 49 seconds.
The README file was created, and the download time has been added.


### What happens during network interruptions and partitioning?

### Simulating a download failure

In [10]:
import random
# Step 1: Get the Spark session for this experiment (created if none is running)
spark = SparkSession.builder.appName("AIS Data Processing").getOrCreate()

# Step 2: Function to download, extract, and save CSV files locally if not already present
def download_and_unzip_to_csv(url):
    """Download a ZIP archive into ``local_storage_path`` with a simulated random failure.

    The CSV name is the last path segment of ``url`` with ``.zip`` replaced by
    ``.csv``. If that file already exists, nothing is downloaded and no failure
    is simulated.

    Args:
        url: URL of the ZIP archive.

    Returns:
        Path of the CSV file (pre-existing or newly written).

    Raises:
        Exception: the simulated download failure (raised for roughly half of the calls).
        requests.RequestException: on connection problems, timeouts or an HTTP
            error status.
        zipfile.BadZipFile: if the payload is not a valid ZIP archive.
    """
    import shutil  # local import so the cell does not depend on a new top-level import

    # Extract ZIP filename and corresponding CSV filename
    zip_filename = url.split("/")[-1]
    csv_filename = zip_filename.replace(".zip", ".csv")
    csv_filepath = os.path.join(local_storage_path, csv_filename)

    # Check if the CSV file already exists
    if os.path.exists(csv_filepath):
        print(f"File already exists: {csv_filepath}, skipping download.")
        return csv_filepath  # Return the path to the existing file

    if random.random() < 0.5:  # 50% chance of a simulated failure
        raise Exception(f"Simulated download failure for URL: {url}")

    # Download and extract the ZIP file
    print(f"Downloading and extracting: {url}")
    # (connect, read) timeout: a stalled connection must not block a worker thread forever.
    response = requests.get(url, timeout=(10, 120))
    response.raise_for_status()

    # Extract to a side file and rename only on success, so a failed extraction
    # never leaves a truncated CSV that the existence check above would accept.
    partial_path = csv_filepath + ".part"
    try:
        with zipfile.ZipFile(BytesIO(response.content), 'r') as z:
            with z.open(z.namelist()[0]) as csv_file, open(partial_path, "wb") as output_file:
                # Copy in chunks instead of holding the whole decompressed CSV in memory.
                shutil.copyfileobj(csv_file, output_file)
        os.replace(partial_path, csv_filepath)
    finally:
        # After a successful rename the side file is gone; this only cleans up failures.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    return csv_filepath  # Return the path to the saved file

# Step 4: Parallel downloading and storing CSV files locally

# Start the timer
start_time = time.time()

# Count the total number of URLs
total_urls = len(csv_urls)

# Parallel downloading and storing CSV files
# (executor.map re-raises a worker's exception here, which is how the simulated
# failure surfaces in this cell)
with ThreadPoolExecutor(max_workers=10) as executor:
    csv_file_paths = list(executor.map(download_and_unzip_to_csv, csv_urls))

# Stop the timer
end_time = time.time()

# Convert elapsed time into minutes and seconds
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)
download_time = f"The download time for {total_urls} files with Spark is {minutes} minutes and {seconds} seconds."

# Print the download time
print(download_time)

# Automatically update the README file
readme_header = "### Download Time Results with Spark"
result_prefix = "The download time for "  # start of every result line written by this cell
try:
    # Open the file and read its contents
    with open("README.md", "r") as readme:
        lines = readme.readlines()

    # Rebuild the file: keep everything, but make the section hold only the new result.
    updated_lines = []
    section_found = False
    in_section = False
    for line in lines:
        stripped = line.strip()
        if stripped == readme_header:
            # Keep the heading (ensuring it ends with a newline) and put the new value below it
            updated_lines.append(line if line.endswith("\n") else line + "\n")
            updated_lines.append(f"{download_time}\n")
            section_found = True
            in_section = True
        elif in_section and stripped.startswith(result_prefix):
            # Result of an earlier run: drop it so the section does not grow on every run
            continue
        else:
            if stripped.startswith("#"):
                in_section = False  # another heading ends the results section
            updated_lines.append(line)

    # If the section was not found, append it
    if not section_found:
        updated_lines.append("\n### Download Time Results with Spark\n")
        updated_lines.append(f"{download_time}\n")

    # Overwrite the file
    with open("README.md", "w") as readme:
        readme.writelines(updated_lines)

    print("The download time and URL count have been successfully updated in the README.")
except FileNotFoundError:
    # If the file does not exist, create it
    with open("README.md", "w") as readme:
        readme.write("### Download Time Results with Spark\n")
        readme.write(f"{download_time}\n")
    print("The README file was created, and the download time has been added.")
except Exception as e:
    print(f"Error writing to the README file: {e}")

Downloading and extracting: https://web.ais.dk/aisdata/aisdk-2024-03-02.zip


Exception: Simulated download failure for URL: https://web.ais.dk/aisdata/aisdk-2024-03-01.zip

### Simulating Internet delay

In [5]:
import random
# Step 1: Get the Spark session for this experiment (created if none is running)
spark = SparkSession.builder.appName("AIS Data Processing").getOrCreate()

# Step 2: Function to download, extract, and save CSV files locally if not already present
def download_and_unzip_to_csv(url):
    """Download a ZIP archive into ``local_storage_path`` after a simulated network delay.

    The CSV name is the last path segment of ``url`` with ``.zip`` replaced by
    ``.csv``. If that file already exists, nothing is downloaded and no delay
    is simulated.

    Args:
        url: URL of the ZIP archive.

    Returns:
        Path of the CSV file (pre-existing or newly written).

    Raises:
        requests.RequestException: on connection problems, timeouts or an HTTP
            error status.
        zipfile.BadZipFile: if the payload is not a valid ZIP archive.
    """
    import shutil  # local import so the cell does not depend on a new top-level import

    # Extract ZIP filename and corresponding CSV filename
    zip_filename = url.split("/")[-1]
    csv_filename = zip_filename.replace(".zip", ".csv")
    csv_filepath = os.path.join(local_storage_path, csv_filename)

    # Check if the CSV file already exists
    if os.path.exists(csv_filepath):
        print(f"File already exists: {csv_filepath}, skipping download.")
        return csv_filepath  # Return the path to the existing file

    # Simulated network delay: sleep for a random 0-5 seconds before the request
    delay = random.uniform(0, 5)
    print(f"Simulating network delay: {delay:.2f} seconds for {url}")
    time.sleep(delay)

    # Download and extract the ZIP file
    print(f"Downloading and extracting: {url}")
    # (connect, read) timeout: a stalled connection must not block a worker thread forever.
    response = requests.get(url, timeout=(10, 120))
    response.raise_for_status()

    # Extract to a side file and rename only on success, so a failed extraction
    # never leaves a truncated CSV that the existence check above would accept.
    partial_path = csv_filepath + ".part"
    try:
        with zipfile.ZipFile(BytesIO(response.content), 'r') as z:
            with z.open(z.namelist()[0]) as csv_file, open(partial_path, "wb") as output_file:
                # Copy in chunks instead of holding the whole decompressed CSV in memory.
                shutil.copyfileobj(csv_file, output_file)
        os.replace(partial_path, csv_filepath)
    finally:
        # After a successful rename the side file is gone; this only cleans up failures.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    return csv_filepath  # Return the path to the saved file

# Step 4: Parallel downloading and storing CSV files locally

# Start the timer
start_time = time.time()

# Count the total number of URLs
total_urls = len(csv_urls)

# Parallel downloading and storing CSV files
# (executor.map yields results in the order of csv_urls and re-raises a worker's exception)
with ThreadPoolExecutor(max_workers=10) as executor:
    csv_file_paths = list(executor.map(download_and_unzip_to_csv, csv_urls))

# Stop the timer
end_time = time.time()

# Convert elapsed time into minutes and seconds
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)
download_time = f"The download time for {total_urls} files with Spark is {minutes} minutes and {seconds} seconds."

# Print the download time
print(download_time)

# Automatically update the README file
readme_header = "### Download Time Results with Spark"
result_prefix = "The download time for "  # start of every result line written by this cell
try:
    # Open the file and read its contents
    with open("README.md", "r") as readme:
        lines = readme.readlines()

    # Rebuild the file: keep everything, but make the section hold only the new result.
    updated_lines = []
    section_found = False
    in_section = False
    for line in lines:
        stripped = line.strip()
        if stripped == readme_header:
            # Keep the heading (ensuring it ends with a newline) and put the new value below it
            updated_lines.append(line if line.endswith("\n") else line + "\n")
            updated_lines.append(f"{download_time}\n")
            section_found = True
            in_section = True
        elif in_section and stripped.startswith(result_prefix):
            # Result of an earlier run: drop it so the section does not grow on every run
            continue
        else:
            if stripped.startswith("#"):
                in_section = False  # another heading ends the results section
            updated_lines.append(line)

    # If the section was not found, append it
    if not section_found:
        updated_lines.append("\n### Download Time Results with Spark\n")
        updated_lines.append(f"{download_time}\n")

    # Overwrite the file
    with open("README.md", "w") as readme:
        readme.writelines(updated_lines)

    print("The download time and URL count have been successfully updated in the README.")
except FileNotFoundError:
    # If the file does not exist, create it
    with open("README.md", "w") as readme:
        readme.write("### Download Time Results with Spark\n")
        readme.write(f"{download_time}\n")
    print("The README file was created, and the download time has been added.")
except Exception as e:
    print(f"Error writing to the README file: {e}")

Simulating network delay: 0.47 seconds for https://web.ais.dk/aisdata/aisdk-2024-03-01.zipSimulating network delay: 0.75 seconds for https://web.ais.dk/aisdata/aisdk-2024-03-02.zip

Downloading and extracting: https://web.ais.dk/aisdata/aisdk-2024-03-01.zip
Downloading and extracting: https://web.ais.dk/aisdata/aisdk-2024-03-02.zip
The download time for 2 files with Spark is 3 minutes and 19 seconds.
The download time and URL count have been successfully updated in the README.


### How do error-handling mechanisms affect efficiency, scale, latency, throughput, etc.? Are there any worst-case/best-case considerations?

In [7]:
import os
import time
import random
import requests
from io import BytesIO
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from pyspark.sql import SparkSession

# Local storage path
# NOTE(review): this is "./data", not the "./data/csv_files" directory set in the
# configuration cell, so CSVs stored there are not seen by the existence check in
# this experiment - confirm this is intended.
local_storage_path = "./data"
os.makedirs(local_storage_path, exist_ok=True)

# Create Spark Session (getOrCreate() returns a session that is already running, if any)
spark = SparkSession.builder \
    .appName("AIS Data Processing with Fault Tolerance") \
    .getOrCreate()

# Example CSV URLs: three daily ZIP archives used for the fault-tolerance run
csv_urls = [
    "https://web.ais.dk/aisdata/aisdk-2024-03-01.zip",
    "https://web.ais.dk/aisdata/aisdk-2024-03-02.zip",
    "https://web.ais.dk/aisdata/aisdk-2024-03-03.zip"
]

# Simulate download and extraction with failure and delay
def download_and_unzip_to_csv(url):
    """
    Downloads a ZIP file, extracts its first member, and saves it as a CSV.
    Introduces random failures and delays to simulate errors.

    The CSV name is the last path segment of ``url`` with ``.zip`` replaced by
    ``.csv``, stored under ``local_storage_path``. If that file already exists,
    nothing is downloaded and neither delay nor failure is simulated.

    Args:
        url: URL of the ZIP archive.

    Returns:
        Path of the CSV file on success (pre-existing or newly written), or
        ``None`` if the download or extraction failed.

    Raises:
        Exception: the simulated download failure (about 30% of the calls);
            real download/extraction errors are caught and reported as ``None``.
    """
    import shutil  # local import so the cell does not depend on a new top-level import

    zip_filename = url.split("/")[-1]
    csv_filename = zip_filename.replace(".zip", ".csv")
    csv_filepath = os.path.join(local_storage_path, csv_filename)

    # Check if the CSV already exists
    if os.path.exists(csv_filepath):
        print(f"File already exists: {csv_filepath}, skipping download.")
        return csv_filepath  # Return existing file path

    # Simulate network delay
    delay = random.uniform(0, 5)  # Random delay between 0-5 seconds
    print(f"Simulating network delay: {delay:.2f} seconds for {url}")
    time.sleep(delay)

    # Simulate random failure
    if random.random() < 0.3:  # 30% chance of failure
        raise Exception(f"Simulated download failure for URL: {url}")

    # Extract to a side file and rename only on success. Otherwise a failed
    # extraction would leave a truncated CSV that the existence check above
    # counts as a successful download on the next run.
    partial_path = csv_filepath + ".part"
    try:
        # Download and extract the ZIP file
        print(f"Downloading and extracting: {url}")
        # (connect, read) timeout: a stalled connection must not block a worker thread forever.
        response = requests.get(url, timeout=(10, 120))
        response.raise_for_status()
        with zipfile.ZipFile(BytesIO(response.content), 'r') as z:
            with z.open(z.namelist()[0]) as csv_file, open(partial_path, "wb") as output_file:
                # Copy in chunks instead of holding the whole decompressed CSV in memory.
                shutil.copyfileobj(csv_file, output_file)
        os.replace(partial_path, csv_filepath)
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None  # Return None for failed downloads
    finally:
        # After a successful rename the side file is gone; this only cleans up failures.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    return csv_filepath  # Return path to saved file

# Start timer
start_time = time.time()

# Bookkeeping for the fault-tolerant parallel download
total_urls = len(csv_urls)
successful_downloads = 0
failed_downloads = 0
download_times = []

with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which URL each future belongs to so failures can be reported per URL.
    future_to_url = {}
    for url in csv_urls:
        future_to_url[executor.submit(download_and_unzip_to_csv, url)] = url

    # Handle the downloads in completion order; one failure does not stop the others.
    for future in as_completed(future_to_url):
        url = future_to_url[future]
        try:
            result = future.result()
        except Exception as e:
            # The worker raised instead of returning a value.
            print(f"Unhandled error for URL {url}: {e}")
            failed_downloads += 1
        else:
            # A falsy result (None) also counts as a failed download.
            if result:
                successful_downloads += 1
            else:
                failed_downloads += 1

# Stop timer
end_time = time.time()

# Split the elapsed wall-clock time into whole minutes and remaining whole seconds
elapsed_time = end_time - start_time
minutes, seconds = (int(part) for part in divmod(elapsed_time, 60))
total_time = f"Total time: {minutes} minutes and {seconds} seconds."

# Print results
print(f"\nDownload Summary:")
print(f"Total URLs: {total_urls}")
print(f"Successful Downloads: {successful_downloads}")
print(f"Failed Downloads: {failed_downloads}")
print(total_time)

# Append this run's summary to the README file
try:
    with open("README.md", "a") as readme:
        readme.writelines([
            "\n### Download Time and Results\n",
            f"Total URLs: {total_urls}\n",
            f"Successful Downloads: {successful_downloads}\n",
            f"Failed Downloads: {failed_downloads}\n",
            f"{total_time}\n",
        ])
    print("The README file was updated successfully.")
except Exception as e:
    print(f"Error writing to the README file: {e}")


Simulating network delay: 1.38 seconds for https://web.ais.dk/aisdata/aisdk-2024-03-01.zip
Simulating network delay: 3.42 seconds for https://web.ais.dk/aisdata/aisdk-2024-03-02.zip
Simulating network delay: 0.76 seconds for https://web.ais.dk/aisdata/aisdk-2024-03-03.zip
Downloading and extracting: https://web.ais.dk/aisdata/aisdk-2024-03-03.zip
Downloading and extracting: https://web.ais.dk/aisdata/aisdk-2024-03-01.zip
Unhandled error for URL https://web.ais.dk/aisdata/aisdk-2024-03-02.zip: Simulated download failure for URL: https://web.ais.dk/aisdata/aisdk-2024-03-02.zip

Download Summary:
Total URLs: 3
Successful Downloads: 2
Failed Downloads: 1
Total time: 3 minutes and 25 seconds.
The README file was updated successfully.


#### Fault Tolerance on Cluster - Spark Standalone Mode

In [None]:
# Set up Spark against the standalone master at 127.0.0.1:7077, requesting
# two executors with 2 cores and 4g of memory each.
spark = (
    SparkSession.builder
    .master("spark://127.0.0.1:7077")
    .config("spark.executor.instances", 2)
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Set the Spark log level to WARN.
spark.sparkContext.setLogLevel("WARN")