In [12]:
from pyspark.sql import SparkSession
import requests
from io import BytesIO
import pandas as pd
import zipfile
from concurrent.futures import ThreadPoolExecutor
import folium
import time  # Import für Zeitmessung
import os

# Local cache directory for the downloaded and extracted CSV files
local_storage_path = "./data/csv_files"
os.makedirs(local_storage_path, exist_ok=True)  # exist_ok: no error if it is already there

# Start the Spark session (getOrCreate)
spark = (
    SparkSession.builder
    .appName("ShipRouteOptimization")
    .getOrCreate()
)

# Optional: lower the log level to reduce console output
spark.sparkContext.setLogLevel("WARN")

In [13]:
# Step 1: Function to download, extract, and save CSV from ZIP archive if not already downloaded
def download_and_unzip_csv(url, storage_dir=None, timeout=60):
    """Return the CSV behind a ZIP ``url`` as a DataFrame, caching it on disk.

    The CSV file name is derived from the last path segment of ``url`` by
    swapping a trailing ``.zip`` for ``.csv``. If that file already exists in
    the storage directory it is loaded directly; otherwise the archive is
    downloaded, its CSV member is extracted to the storage directory, and the
    extracted file is loaded.

    Args:
        url: Address of the ZIP archive.
        storage_dir: Directory used as the on-disk cache. Defaults to the
            module-level ``local_storage_path``.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            server cannot block a worker thread forever.

    Returns:
        pandas.DataFrame: The parsed CSV contents.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the archive contains no files.
        zipfile.BadZipFile: If the response body is not a valid ZIP archive.
    """
    import shutil  # local import: only needed on the download path

    target_dir = local_storage_path if storage_dir is None else storage_dir

    # Only swap the *suffix*; str.replace would also rewrite a ".zip" that
    # happens to occur in the middle of the file name.
    zip_filename = url.split("/")[-1]
    if zip_filename.endswith(".zip"):
        csv_filename = zip_filename[: -len(".zip")] + ".csv"
    else:
        csv_filename = zip_filename
    csv_filepath = os.path.join(target_dir, csv_filename)

    # Cache hit: reuse the previously extracted file
    if os.path.exists(csv_filepath):
        print(f"File already exists: {csv_filepath}, skipping download.")
        return pd.read_csv(csv_filepath)

    # Cache miss: download and extract the ZIP file
    print(f"Downloading and extracting: {url}")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    os.makedirs(target_dir, exist_ok=True)
    # Extract to a temporary name first and rename afterwards, so that an
    # interrupted run never leaves a truncated file that the cache check above
    # would later mistake for a complete download.
    partial_path = csv_filepath + ".part"
    try:
        with zipfile.ZipFile(BytesIO(response.content), "r") as archive:
            members = [name for name in archive.namelist() if not name.endswith("/")]
            if not members:
                raise IndexError(f"ZIP archive from {url} contains no files")
            # Prefer an actual CSV member; fall back to the first file otherwise
            csv_members = [name for name in members if name.lower().endswith(".csv")]
            member = (csv_members or members)[0]
            with archive.open(member) as source, open(partial_path, "wb") as target:
                # Stream in chunks instead of holding the whole CSV in memory
                shutil.copyfileobj(source, target)
        os.replace(partial_path, csv_filepath)
    finally:
        # Only still present if extraction failed before the rename
        if os.path.exists(partial_path):
            os.remove(partial_path)

    # Load the CSV into a DataFrame
    return pd.read_csv(csv_filepath)

In [14]:
# Step 2: List of CSV (ZIP) URLs, one archive per listed day
csv_urls = [
    f"https://web.ais.dk/aisdata/aisdk-{day}.zip"
    for day in ("2024-03-01", "2024-03-02")
]

In [15]:
# Step 3: Parallel downloading, processing, and storing CSV files
def _refresh_readme_result(readme_path, heading, result_line, is_stale_result):
    """Insert or refresh a one-line result section in a text file.

    Args:
        readme_path: Path of the file to update.
        heading: Exact (stripped) heading line that introduces the section.
        result_line: The line to place directly below the heading.
        is_stale_result: Callable taking a stripped line and returning True if
            it is an older result line that should be dropped.

    Returns:
        "created" if the file did not exist and was written from scratch,
        "updated" otherwise.
    """
    try:
        with open(readme_path, "r", encoding="utf-8") as readme:
            lines = readme.readlines()
    except FileNotFoundError:
        with open(readme_path, "w", encoding="utf-8") as readme:
            readme.write(f"{heading}\n")
            readme.write(f"{result_line}\n")
        return "created"

    updated_lines = []
    section_found = False
    skipping_stale = False
    for line in lines:
        text = line.strip()
        if text == heading:
            # Guarantee a newline so the result is not glued onto the heading
            updated_lines.append(line if line.endswith("\n") else line + "\n")
            updated_lines.append(f"{result_line}\n")
            section_found = True
            skipping_stale = True
            continue
        if skipping_stale and is_stale_result(text):
            # Previous result(s) directly under the heading: this is the
            # actual "replace" - without it old timings pile up on every run
            continue
        skipping_stale = False
        if section_found and text == result_line:
            # Drop later exact duplicates of the line that was just inserted
            continue
        updated_lines.append(line)

    # If the section was not found, append it at the end
    if not section_found:
        updated_lines.append(f"\n{heading}\n")
        updated_lines.append(f"{result_line}\n")

    with open(readme_path, "w", encoding="utf-8") as readme:
        readme.writelines(updated_lines)
    return "updated"


# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments. Note that the timed span covers download (or cache
# lookup) *and* parsing the CSV files into DataFrames.
start_time = time.perf_counter()  # Start time

with ThreadPoolExecutor(max_workers=10) as executor:
    dfs = list(executor.map(download_and_unzip_csv, csv_urls))  # Download or load files in parallel

end_time = time.perf_counter()  # End time

# Convert elapsed time to minutes and seconds
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)
total_files = len(csv_urls)  # Count the number of files
download_time = f"The download time for {total_files} files with Pandas is {minutes} minutes and {seconds} seconds."

# Combine all DataFrames into a single large DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Print the download time and dataset information
print(download_time)
print(f"The combined dataset contains {len(combined_df)} entries.")

# Automatically update the README file (best effort: failures are reported, not raised)
try:
    readme_status = _refresh_readme_result(
        "README.md",
        "### Download Time Results with Pandas",
        download_time,
        lambda text: text.startswith("The download time for ") and " files with Pandas is " in text,
    )
    if readme_status == "created":
        print("The README file was created and the download time was added.")
    else:
        print("The download time for Pandas was successfully updated in the README.")
except Exception as e:
    print(f"Error writing to the README file: {e}")

File already exists: ./data/csv_files/aisdk-2024-03-01.csv, skipping download.File already exists: ./data/csv_files/aisdk-2024-03-02.csv, skipping download.

The download time for 2 files with Pandas is 1 minutes and 34 seconds.
The combined dataset contains 31817670 entries.
The download time for Pandas was successfully updated in the README.


In [16]:
# Step 4: Merge the per-file DataFrames into one large DataFrame
combined_df = pd.concat(objs=dfs, ignore_index=True)

# Print the first ten rows as a sample, e.g. to see which MMSI numbers occur
print(combined_df.iloc[:10])

           # Timestamp Type of mobile       MMSI   Latitude  Longitude  \
0  01/03/2024 00:00:00        Class A  219000873  56.990910  10.304543   
1  01/03/2024 00:00:00   Base Station    2190068  56.447260  10.945872   
2  01/03/2024 00:00:00        Class A  219016683  56.800165   9.024933   
3  01/03/2024 00:00:00        Class A  219000615  56.967093   9.224287   
4  01/03/2024 00:00:00   Base Station    2190071  57.110043   8.648282   
5  01/03/2024 00:00:00        Class A  219017664  56.974950   8.922530   
6  01/03/2024 00:00:00        Class A  219002686  56.795143   8.863960   
7  01/03/2024 00:00:00        Class A  219030053  57.058252   9.900817   
8  01/03/2024 00:00:00        Class A  219670000  55.463782   8.444915   
9  01/03/2024 00:00:00        Class A  211417590  54.524345  12.675237   

          Navigational status  ROT  SOG    COG  Heading  ... Length  \
0      Under way using engine  NaN  0.0   30.2      NaN  ...    NaN   
1               Unknown value  NaN  NaN    

In [7]:
# Report the current row count of combined_df
print(f"The original dataset contains {combined_df.shape[0]} entries.")

The original dataset contains 31817670 entries.


In [8]:
########################################################
# 2. Filter out base stations ("Type of mobile" != "Base Station")
########################################################

# Without the column there is nothing to filter on, so only warn in that case
if "Type of mobile" not in combined_df.columns:
    print("Warning: 'Type of mobile' column not found, skipping this step.")
else:
    # Keep every row whose type is anything other than "Base Station"
    combined_df = combined_df.loc[combined_df["Type of mobile"].ne("Base Station")]

In [9]:
# Report the row count after the base-station filter
print(f"The adjusted dataset contains {combined_df.shape[0]} entries.")

The adjusted dataset contains 29508390 entries.


In [10]:
########################################################
# 3. Keep only relevant columns to reduce data size
########################################################

relevant_columns = ["MMSI", "Latitude", "Longitude", "# Timestamp"]
# Take an explicit copy: the column assignment below would otherwise happen on
# a frame derived from another one, which pandas flags with a
# SettingWithCopyWarning.
combined_df = combined_df.loc[:, relevant_columns].copy()

########################################################
# 4. Convert Timestamp to datetime format
########################################################

# errors="coerce": values that do not match the format become NaT instead of raising
combined_df["# Timestamp"] = pd.to_datetime(
    combined_df["# Timestamp"], format="%d/%m/%Y %H:%M:%S", errors="coerce"
)

########################################################
# 5. Filter MMSI numbers with enough data points
########################################################

# Determine the number of data points per MMSI
mmsi_counts = combined_df.groupby("MMSI").size().reset_index(name="count")

# Define a threshold (e.g., at least 50 points)
threshold = 50
# .loc selects rows and column in one step instead of chained indexing
valid_mmsi = mmsi_counts.loc[mmsi_counts["count"] >= threshold, "MMSI"].unique()

# Filtered DataFrame containing only MMSI numbers with enough data points
filtered_by_count_df = combined_df[combined_df["MMSI"].isin(valid_mmsi)]

In [11]:
########################################################
# 6. Filter by specific MMSI and time range + plot the route
########################################################

mmsi_number = 219016832  # Replace with your desired MMSI

# Define start and end time (in the format "dd/mm/yyyy HH:MM:SS")
start_str = "01/03/2024 00:00:00"  # March 1, 2024, 00:00
end_str = "01/03/2024 06:59:59"    # March 1, 2024, 06:59

# Convert start and end times to datetime objects. No errors="coerce" here: a
# typo in the bounds should fail loudly instead of silently becoming NaT and
# producing a misleading "No data" message below.
start_dt = pd.to_datetime(start_str, format="%d/%m/%Y %H:%M:%S")
end_dt = pd.to_datetime(end_str, format="%d/%m/%Y %H:%M:%S")

# Stays None unless a route could be drawn; displayed at the end of the cell
route_map = None

# Check if the MMSI has enough data points
if mmsi_number not in valid_mmsi:
    print(f"MMSI {mmsi_number} does not have enough data points to display a meaningful route.")
else:
    # Filter by MMSI and time range
    route_df = filtered_by_count_df[
        (filtered_by_count_df["MMSI"] == mmsi_number) &
        (filtered_by_count_df["# Timestamp"] >= start_dt) &
        (filtered_by_count_df["# Timestamp"] <= end_dt)
    ].sort_values("# Timestamp")

    # Keep only plottable positions: comparisons are False for NaN, and values
    # outside the geographic range would distort both the line and the centre
    route_df = route_df[
        (route_df["Latitude"] >= -90) & (route_df["Latitude"] <= 90) &
        (route_df["Longitude"] >= -180) & (route_df["Longitude"] <= 180)
    ]

    # Check if filtered data is available
    if route_df.empty:
        print(f"No data for MMSI {mmsi_number} between {start_dt} and {end_dt}")
    else:
        # Create a map centred on the mean position and plot the route
        mean_lat = route_df["Latitude"].mean()
        mean_lon = route_df["Longitude"].mean()

        route_map = folium.Map(location=[mean_lat, mean_lon], zoom_start=8)

        # List of [lat, lon] pairs for the PolyLine, in timestamp order
        coords = route_df[["Latitude", "Longitude"]].values.tolist()

        # Add the PolyLine
        folium.PolyLine(coords, color="blue", weight=2.5, opacity=1).add_to(route_map)

        # Optional: Mark points (commented out if not needed)
        # for _, row in route_df.iterrows():
        #     folium.CircleMarker(
        #         location=[row['Latitude'], row['Longitude']],
        #         radius=2,
        #         color='red',
        #         fill=True,
        #         fill_color='red',
        #         fill_opacity=0.7,
        #         popup=f"Timestamp: {row['# Timestamp']}"
        #     ).add_to(route_map)

        # Save the map as a standalone HTML file
        route_map.save("ship_route.html")

# Jupyter only renders the value of a *top-level* trailing expression; inside
# the else-branch above this line would have no visible effect.
route_map