In [None]:
# Standard Library
import ast
import json
import random
import re
import time
from datetime import datetime, timedelta

# Third-Party Libraries
import folium
from folium.plugins import HeatMap  # Include if used
import geopandas as gpd
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from tqdm import tqdm

# Geospatial Libraries
from shapely.geometry import Point, LineString, mapping
from shapely.ops import transform
from pyproj import Transformer

# Machine Learning / Clustering
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from scipy.spatial import ConvexHull


In [None]:
# Read the source CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='fin_data_with_area.csv')


In [None]:
# Keep only the rows whose 'PORT' value is 'no_port'.
# .copy() makes the result an independent DataFrame, so the column assignments
# performed on df further down (e.g. the datetime conversion) do not trigger
# pandas' SettingWithCopyWarning on a filtered slice.
df = df[df['PORT'] == 'no_port'].copy()

In [None]:
# Haversine formula to calculate great-circle distance
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four arguments are in decimal degrees. The computation is element-wise,
    so each argument may be a scalar, a NumPy array or a pandas Series.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point.
    lat2, lon2 : latitude and longitude of the second point.

    Returns
    -------
    Distance along a sphere of radius 6371 km. NaN in any input gives NaN.
    """
    R = 6371  # Earth's radius in km
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat / 2)**2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points); without clipping, sqrt(1 - a) would return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

In [None]:
# Calculate speed in km/h for each vessel group
def calculate_speed(group):
    """Add point-to-point distance and speed columns to one vessel's rows.

    Each row is paired with the row directly above it (``shift(1)``), so the
    rows are expected to already be in chronological order.

    Parameters
    ----------
    group : DataFrame with the columns 'DATE TIME (UTC)' (datetime dtype),
        'LATITUDE' and 'LONGITUDE'.

    Returns
    -------
    A copy of ``group`` with the extra columns 'prev_time', 'time_diff_hours',
    'prev_LAT', 'prev_LON', 'distance_km' and 'speed_kmh'. The first row has no
    predecessor, so its derived values are NaN/NaT.
    """
    # Work on a copy: pandas does not support mutating the object that
    # groupby.apply passes in.
    group = group.copy()
    group['prev_time'] = group['DATE TIME (UTC)'].shift(1)
    group['time_diff_hours'] = (group['DATE TIME (UTC)'] - group['prev_time']).dt.total_seconds() / 3600
    group['prev_LAT'] = group['LATITUDE'].shift(1)
    group['prev_LON'] = group['LONGITUDE'].shift(1)
    # haversine works element-wise on Series; a missing previous position
    # propagates as NaN, so no per-row Python call is needed.
    group['distance_km'] = haversine(
        group['prev_LAT'], group['prev_LON'], group['LATITUDE'], group['LONGITUDE']
    )
    # Two records with the same timestamp would divide by zero and yield an
    # infinite speed; treat the speed as unknown (NaN) in that case.
    elapsed_hours = group['time_diff_hours']
    group['speed_kmh'] = group['distance_km'] / elapsed_hours.where(elapsed_hours != 0)
    return group

# Ensure datetime format (values that cannot be parsed become NaT)
df['DATE TIME (UTC)'] = pd.to_datetime(df['DATE TIME (UTC)'], errors='coerce')

# calculate_speed pairs every row with the row directly above it, so each
# vessel's rows must be in chronological order before grouping. This is the
# same ordering the SpeedDiff step applies before its own groupby.
df = df.sort_values(by=['MMSI', 'DATE TIME (UTC)'])

# Apply speed calculation by MMSI group
df = df.groupby('MMSI', group_keys=False).apply(calculate_speed)

In [None]:
# Convert the derived speed from km/h to knots (factor 0.539957)
df['speed_knots'] = df['speed_kmh'].mul(0.539957)

In [None]:
# Correcting improbable speed values: a reported SPEED of 20 or more, or below
# 1, is replaced by the position-derived speed_knots.
# The replacement is applied only where a finite derived speed exists.
# Otherwise (e.g. the first record of each vessel, which has no predecessor)
# SPEED would be overwritten with NaN and the row silently removed by the
# subsequent `SPEED < 25` filter.
implausible_speed = (df['SPEED'] >= 20) | (df['SPEED'] < 1)
has_derived_speed = np.isfinite(df['speed_knots'])
replace_mask = implausible_speed & has_derived_speed
df.loc[replace_mask, 'SPEED'] = df.loc[replace_mask, 'speed_knots']

In [None]:
# Keep only rows with SPEED below 25 knots; rows at 25 or above (and rows with
# a missing SPEED) are discarded as improbable
df = df.loc[df['SPEED'].lt(25)]

NULL SPEED

In [None]:
# Null-speed subset: rows where SPEED is less than 1 knot.
# .copy() yields an independent DataFrame, so the ns_near_* columns added to
# null_speed later are written to this frame and not to a slice of df
# (avoids SettingWithCopyWarning).
null_speed = df[df['SPEED'] < 1].copy()

In [None]:

# Loading the infrastructure data (cables and pipes)
def load_dict_from_file(file_path):
    """Read a text file of ``"key": value`` entries and return them as a dict.

    The file content is wrapped in braces and evaluated as a Python literal,
    so it must hold the inside of a dict literal (without the outer braces).

    Parameters
    ----------
    file_path : path of the UTF-8 text file to read.

    Returns
    -------
    The object produced by ``ast.literal_eval`` on the wrapped content.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    # Remove any whitespace between a double-quoted key and the colon after it
    tidied_text = re.sub(r'"([^"]+)"\s*:', r'"\1":', raw_text)
    dict_literal = "{" + tidied_text + "}"
    return ast.literal_eval(dict_literal)

# Read the three infrastructure files: telecom cables, power cables, gas pipes
telco_cables_dict = load_dict_from_file(file_path='telco_cables.txt')
pow_cables_dict = load_dict_from_file(file_path='power_cables.txt')
gas_pipe_dict = load_dict_from_file(file_path='gas_pipe.txt')


In [None]:
# Flag null-speed positions that fall inside a 0.1-degree buffer around telco
# cables, gas pipes or power cables, and plot the lines, their buffers and the
# flagged positions on one folium map.

# Add new columns initialized to 0 (0 = not inside any buffer, 1 = inside one)
null_speed['ns_near_telco_cable'] = 0
null_speed['ns_near_power_cable'] = 0
null_speed['ns_near_gas_pipe'] = 0

# Buffer radius in coordinate degrees. The geometries are plain lon/lat, so
# this is an angular distance, not a fixed ground distance.
ns_buffer_deg = 0.1

# One entry per infrastructure type, in the order they are drawn and checked.
ns_layers = [
    {'features': telco_cables_dict, 'flag': 'ns_near_telco_cable',
     'line_color': 'red', 'buffer_color': 'blue', 'marker_color': 'purple'},
    {'features': gas_pipe_dict, 'flag': 'ns_near_gas_pipe',
     'line_color': 'black', 'buffer_color': 'green', 'marker_color': 'blue'},
    {'features': pow_cables_dict, 'flag': 'ns_near_power_cable',
     'line_color': 'pink', 'buffer_color': 'yellow', 'marker_color': 'orange'},
]

# Create map centered on first cable point (points are stored as (lon, lat),
# folium expects (lat, lon))
first_cable_points = list(telco_cables_dict.values())[0]
center_point = first_cable_points[0]
m = folium.Map(location=[center_point[1], center_point[0]], zoom_start=6)

# Draw every line with its buffer. The buffers are built once here and kept
# for the containment test below, not rebuilt for every row.
for layer in ns_layers:
    layer['buffers'] = []
    for feature_name, feature_points in layer['features'].items():
        feature_buffer = LineString(feature_points).buffer(ns_buffer_deg)
        layer['buffers'].append(feature_buffer)
        folium.PolyLine(
            locations=[(pt[1], pt[0]) for pt in feature_points],
            color=layer['line_color'], weight=3, opacity=0.8, tooltip=feature_name
        ).add_to(m)
        folium.GeoJson(
            mapping(feature_buffer),
            # The colour is bound as a default argument because folium may call
            # the style function after the loop variable has moved on.
            style_function=lambda feature, fill=layer['buffer_color']: {
                'fillColor': fill, 'color': fill, 'weight': 1, 'fillOpacity': 0.3
            }
        ).add_to(m)

# Check each point, mark if near infrastructure, update columns
located_rows = null_speed.dropna(subset=['LATITUDE', 'LONGITUDE'])
for row_label, row in tqdm(located_rows.iterrows(), total=len(located_rows)):
    lon, lat = row['LONGITUDE'], row['LATITUDE']
    point = Point(lon, lat)
    for layer in ns_layers:
        # any() stops at the first matching buffer, so a point gets at most one
        # marker per infrastructure type.
        if any(zone.contains(point) for zone in layer['buffers']):
            folium.CircleMarker(
                location=(lat, lon), radius=1, color=layer['marker_color'],
                fill=True, fill_color=layer['marker_color'], fill_opacity=0.8
            ).add_to(m)
            null_speed.at[row_label, layer['flag']] = 1
    # Rows that match nothing keep the 0 they were initialised with.

# Save map to HTML file
m.save("all_cables_null_speed_map.html")




SLOW SPEED

In [None]:
# Creating a dataframe for vessels under way: NAVSTAT is one of
# 0, 2, 3, 8, 15 and SPEED is at least 1
status_under_way = df['NAVSTAT'].isin([0, 2, 3, 8, 15])
speed_at_least_one = df['SPEED'] >= 1
moving_vessel = df[status_under_way & speed_at_least_one]

In [None]:
# Replace the inherited index of moving_vessel with a fresh 0..n-1 range
moving_vessel = moving_vessel.reset_index(drop=True)

In [None]:
# Order the rows by vessel and timestamp, then store the difference between
# consecutive SPEED values within each MMSI group as 'SpeedDiff'
moving_vessel = (
    moving_vessel
    .sort_values(by=['MMSI', 'DATE TIME (UTC)'])
    .assign(SpeedDiff=lambda frame: frame.groupby('MMSI')['SPEED'].diff())
)

In [None]:
# Outlier bounds for SPEED: quartiles minus/plus 1.5 times the IQR
speed_values = moving_vessel['SPEED']
Q1, Q3 = speed_values.quantile(0.25), speed_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

In [None]:
# Round both bounds to two decimal places
lower_bound = np.round(lower_bound, 2)
upper_bound = np.round(upper_bound, 2)

In [None]:
# Histogram of SPEED with the IQR bounds drawn as dashed vertical lines
ax = moving_vessel['SPEED'].hist(bins=10, color='blue', edgecolor='black')
ax.set_title('Speed Distribution of Under Way Vessels')
ax.set_xlabel('Speed')
ax.set_ylabel('Frequency')
ax.grid(axis='y', alpha=0.05)

# Lower bound in red, upper bound in green, each labelled with its value
ax.axvline(lower_bound, color='red', linestyle='dashed', linewidth=2, label=f'Lower Bound: {lower_bound}')
ax.axvline(upper_bound, color='green', linestyle='dashed', linewidth=2, label=f'Upper Bound: {upper_bound}')
ax.legend()

plt.show()

In [None]:
# Slow-speed subset: rows with SPEED at or below the lower IQR bound.
# .copy() yields an independent DataFrame, so the ss_near_* columns added to
# df_slow later are written to this frame and not to a slice of moving_vessel
# (avoids SettingWithCopyWarning).
df_slow = moving_vessel[moving_vessel['SPEED'] <= lower_bound].copy()

In [None]:
# Fast-speed subset: rows with SPEED at or above the upper IQR bound
df_fast = moving_vessel.loc[moving_vessel['SPEED'].ge(upper_bound)]

In [None]:
# Fill missing speed differences (the first record of each vessel) with 0.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated in pandas and leaves
# moving_vessel unchanged when Copy-on-Write is enabled.
moving_vessel['SpeedDiff'] = moving_vessel['SpeedDiff'].fillna(0)

In [None]:
# Flag slow-speed positions that fall inside a 0.1-degree buffer around telco
# cables, gas pipes or power cables, and plot the lines, projected buffers and
# flagged positions on one folium map.

# Add new columns initialized to 0 (0 = not inside any buffer, 1 = inside one)
df_slow['ss_near_telco_cable'] = 0
df_slow['ss_near_power_cable'] = 0
df_slow['ss_near_gas_pipe'] = 0

# Set up coordinate transformers (lat/lon <-> metric)
project_to_meters = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True).transform
project_to_degrees = Transformer.from_crs("EPSG:3857", "EPSG:4326", always_xy=True).transform

# Buffer drawn on the map: 10000 units in EPSG:3857.
ss_display_buffer_m = 10000
# Buffer used for the containment test: 0.1 coordinate degrees.
ss_test_buffer_deg = 0.1
# NOTE(review): the drawn buffer and the tested buffer are different shapes, so
# flagged markers can appear outside the polygons shown on the map. EPSG:3857
# also stretches distances away from the equator, so 10000 projected units
# cover less than 10 km on the ground there. Confirm which buffer is intended
# and use it for both drawing and testing.

# One entry per infrastructure type.
ss_layers = {
    'telco': {'features': telco_cables_dict, 'flag': 'ss_near_telco_cable',
              'line_color': 'red', 'buffer_color': 'blue', 'marker_color': 'purple'},
    'power': {'features': pow_cables_dict, 'flag': 'ss_near_power_cable',
              'line_color': 'orange', 'buffer_color': 'yellow', 'marker_color': 'orange'},
    'gas': {'features': gas_pipe_dict, 'flag': 'ss_near_gas_pipe',
            'line_color': 'black', 'buffer_color': 'green', 'marker_color': 'blue'},
}
ss_draw_order = ('telco', 'power', 'gas')   # order in which layers go on the map
ss_check_order = ('telco', 'gas', 'power')  # order in which markers are added

# Create map centered on first cable point (points are stored as (lon, lat),
# folium expects (lat, lon))
first_cable_points = list(telco_cables_dict.values())[0]
center_point = first_cable_points[0]
m = folium.Map(location=[center_point[1], center_point[0]], zoom_start=6)

# Draw every line with its projected buffer, and build the degree buffers for
# the containment test once here so they are not rebuilt for every row.
for layer_key in ss_draw_order:
    layer = ss_layers[layer_key]
    layer['test_buffers'] = []
    for feature_name, feature_points in layer['features'].items():
        feature_line = LineString(feature_points)
        layer['test_buffers'].append(feature_line.buffer(ss_test_buffer_deg))

        # Buffer in projected coordinates, then convert back to lon/lat
        buffer_proj = transform(project_to_meters, feature_line).buffer(ss_display_buffer_m)
        buffer_latlon = transform(project_to_degrees, buffer_proj)
        folium.PolyLine(
            locations=[(pt[1], pt[0]) for pt in feature_points],
            color=layer['line_color'], weight=3, opacity=0.8, tooltip=feature_name
        ).add_to(m)
        folium.GeoJson(
            mapping(buffer_latlon),
            # The colour is bound as a default argument because folium may call
            # the style function after the loop variable has moved on.
            style_function=lambda feature, fill=layer['buffer_color']: {
                'fillColor': fill, 'color': fill, 'weight': 1, 'fillOpacity': 0.3
            }
        ).add_to(m)

# Check each point, mark if near a cable, update columns
located_rows = df_slow.dropna(subset=['LATITUDE', 'LONGITUDE'])
for row_label, row in tqdm(located_rows.iterrows(), total=len(located_rows)):
    lon, lat = row['LONGITUDE'], row['LATITUDE']
    point = Point(lon, lat)
    for layer_key in ss_check_order:
        layer = ss_layers[layer_key]
        # any() stops at the first matching buffer, so a point gets at most one
        # marker per infrastructure type (no stacked duplicate markers).
        if any(zone.contains(point) for zone in layer['test_buffers']):
            folium.CircleMarker(
                location=(lat, lon),
                radius=1,
                color=layer['marker_color'],
                fill=True,
                fill_color=layer['marker_color'],
                fill_opacity=0.8
            ).add_to(m)
            df_slow.at[row_label, layer['flag']] = 1
    # Rows that match nothing keep the 0 they were initialised with.

# Save map to HTML file
m.save("all_cables_slow_speed_map.html")
