In [1]:
# Standard Library
import ast
import glob
import json
import os
import random
import re
import time
from datetime import timedelta

# Third-Party Libraries
import folium
import geopandas as gpd
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from tqdm import tqdm

# Geospatial Libraries
from pyproj import Transformer
from shapely.geometry import Point, LineString, mapping
from shapely.ops import transform

# Machine Learning / Clustering
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from scipy.spatial import ConvexHull


In [None]:
# NOTE(review): `path` is not defined in any cell visible above this one, so
# this line raises NameError on a fresh kernel unless `path` is set elsewhere.
# The name `df` is also rebound by the file-loading loop in the next cell, so
# confirm whether this cell is still needed.
df = pd.read_csv(path)

In [None]:
# Directory containing the AIS CSV files to merge
csv_dir = "AIS_data_split"

# Bounding box (inclusive, decimal degrees) applied to every file while loading
min_lat = 52.6535
max_lat = 67.0806
min_lon = 9.3656
max_lon = 37.4689

# Collect the input files. glob does not guarantee an ordering, so sort the
# paths to make the row order of the merged frame reproducible between runs.
all_files = sorted(glob.glob(os.path.join(csv_dir, "*.csv")))
if not all_files:
    raise FileNotFoundError(f"No CSV files found in directory {csv_dir!r}")

dfs = []
skipped_files = []

for file in all_files:
    df = pd.read_csv(file)

    # Files without both coordinate columns cannot be filtered; remember them
    # so they can be reported instead of vanishing silently.
    if not {'LATITUDE', 'LONGITUDE'}.issubset(df.columns):
        skipped_files.append(file)
        continue

    # Keep only rows inside the bounding box. `between` is inclusive on both
    # ends, and rows with missing coordinates evaluate to False and are dropped.
    in_bounds = (
        df['LATITUDE'].between(min_lat, max_lat)
        & df['LONGITUDE'].between(min_lon, max_lon)
    )
    df_filtered = df[in_bounds]
    dfs.append(df_filtered)

if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) without LATITUDE/LONGITUDE columns: {skipped_files}")

# pd.concat raises an unhelpful error on an empty list; fail with context instead.
if not dfs:
    raise ValueError(
        f"None of the {len(all_files)} CSV file(s) in {csv_dir!r} contain "
        "both LATITUDE and LONGITUDE columns"
    )

# Combine all filtered data into one DataFrame with a fresh 0..n-1 index
merged_24_25 = pd.concat(dfs, ignore_index=True)

In [None]:
# GeoJSON inputs, one file per entry; the next cell reads each of them with
# geopandas and stacks the results into a single frame.
file_paths = [
    'den.json', 'fin.json', 'pol.json', 'est.json',
    'lit.json', 'ger.json', 'lat.json', 'swe.json',
]

In [None]:
# Step 1: Load the GeoJSON files and stack them into a single GeoDataFrame
economic_areas_gdf_list = [gpd.read_file(file) for file in file_paths]
economic_areas_gdf = pd.concat(economic_areas_gdf_list, ignore_index=True)

# Step 2: Build point geometries (x = longitude, y = latitude) in one
# vectorised call instead of constructing a Python Point per row.
geometry = gpd.points_from_xy(merged_24_25['LONGITUDE'], merged_24_25['LATITUDE'])
gdf = gpd.GeoDataFrame(merged_24_25, geometry=geometry)

# Step 3: Tag the points with the polygons' CRS. This only sets the CRS label;
# the coordinates themselves are not reprojected.
gdf = gdf.set_crs(economic_areas_gdf.crs, allow_override=True)

# Step 4: Spatial join to find the polygon each point lies within
gdf_with_area = gpd.sjoin(gdf, economic_areas_gdf, how='left', predicate='within')

# A point that lies inside more than one polygon comes back as several rows
# sharing the same index label. Assigning such a Series back onto `gdf` fails
# on the duplicate labels, so keep only the first match per point.
gdf_with_area = gdf_with_area[~gdf_with_area.index.duplicated(keep='first')]

# Step 5: Copy the matched polygon's 'territory1' value onto each point.
# Points that matched no polygon are labelled 'Russia'.
# NOTE(review): this fallback also applies to any point that is simply outside
# all eight loaded polygon sets for another reason — confirm that is intended.
gdf['area_label'] = gdf_with_area['territory1'].fillna('Russia')



In [None]:
# Renaming the dataframe for the traffic analysis below. This binds a second
# name to the same GeoDataFrame object (no copy is made), so columns added to
# `traffic_df` in later cells are also visible through `gdf`.
traffic_df = gdf

In [None]:
# Restrict the dataset to a latitude floor and a longitude window.
# NOTE(review): no later visible cell reads `filtered_traffic_df`; the steps
# below keep working on the unfiltered `traffic_df` — confirm which is intended.
baltic_mask = (
    traffic_df['LATITUDE'].ge(54)
    & traffic_df['LONGITUDE'].le(30.25)
    & traffic_df['LONGITUDE'].ge(9.28)
)
filtered_traffic_df = traffic_df[baltic_mask]
print("Filtered dataset shape:", filtered_traffic_df.shape)

In [None]:
# checking traffic data
# Load infrastructure data from file
def load_dict_from_file(file_path):
    """Parse a text file holding the body of a dict literal into a dict.

    The file is expected to contain ``"key": value`` entries without the
    surrounding braces; the braces are added here before evaluation.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        The object produced by ``ast.literal_eval`` on the brace-wrapped text.

    Raises:
        OSError: If the file cannot be opened.
        ValueError, SyntaxError: If the wrapped text is not a valid literal.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # Remove any whitespace between a double-quoted key and its colon.
    key_pattern = re.compile(r'"([^"]+)"\s*:')
    normalized_text = key_pattern.sub(r'"\1":', raw_text)

    # literal_eval only accepts Python literals, so no code in the file is run.
    return ast.literal_eval("{" + normalized_text + "}")

# Parse the three infrastructure files. Each dict maps a line name to its list
# of points, which the cells below index as point[0] = longitude and
# point[1] = latitude.
telco_cables_dict, pow_cables_dict, gas_pipe_dict = (
    load_dict_from_file(source_file)
    for source_file in ('telco_cables.txt', 'power_cables.txt', 'gas_pipe.txt')
)

# Create the proximity flag columns, every row starting at 0
for flag_column in ('near_telco_cable', 'near_power_cable', 'near_gas_pipe'):
    traffic_df[flag_column] = 0

# Transformers between EPSG:4326 (lon/lat degrees) and EPSG:3857 (metres).
# always_xy=True fixes the axis order to (lon, lat) / (x, y) in both directions.
project_to_meters = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True).transform
project_to_degrees = Transformer.from_crs("EPSG:3857", "EPSG:4326", always_xy=True).transform


def add_infrastructure_layer(fmap, lines_by_name, line_color, buffer_color, buffer_m=10000):
    """Draw each line in `lines_by_name` on `fmap` together with its buffer polygon.

    Args:
        fmap: folium map the layers are added to.
        lines_by_name: Dict mapping a line name to its list of points, each
            point indexed as point[0] = longitude, point[1] = latitude.
        line_color: Colour of the drawn line.
        buffer_color: Fill and outline colour of the buffer polygon.
        buffer_m: Buffer distance in EPSG:3857 units.
            NOTE(review): Web Mercator stretches distances away from the
            equator, so this is wider than the true ground distance — confirm
            the intended width.

    Returns:
        None. `fmap` is modified in place.
    """
    buffer_style = {
        'fillColor': buffer_color,
        'color': buffer_color,
        'weight': 1,
        'fillOpacity': 0.3
    }
    for line_name, line_points in lines_by_name.items():
        # Buffer in projected space, then bring the polygon back to lon/lat
        line_proj = transform(project_to_meters, LineString(line_points))
        buffer_latlon = transform(project_to_degrees, line_proj.buffer(buffer_m))

        # folium expects (lat, lon), so swap the stored (lon, lat) order
        folium.PolyLine(
            locations=[(pt[1], pt[0]) for pt in line_points],
            color=line_color,
            weight=3,
            opacity=0.8,
            tooltip=line_name
        ).add_to(fmap)
        folium.GeoJson(
            mapping(buffer_latlon),
            # Hand folium a fresh dict per feature so the template is never shared
            style_function=lambda feature: dict(buffer_style)
        ).add_to(fmap)


# Create map centered on the first point of the first telco cable
first_cable_points = list(telco_cables_dict.values())[0]
center_point = first_cable_points[0]
m = folium.Map(location=[center_point[1], center_point[0]], zoom_start=6)

# Telco cables: red lines, blue buffers
add_infrastructure_layer(m, telco_cables_dict, line_color='red', buffer_color='blue')

# Power cables: orange lines, yellow buffers
add_infrastructure_layer(m, pow_cables_dict, line_color='orange', buffer_color='yellow')

# Gas pipes: black lines, green buffers
add_infrastructure_layer(m, gas_pipe_dict, line_color='black', buffer_color='green')

# Flag traffic points that fall inside the buffer of any infrastructure line
# and mark them on the map.
#
# The buffer distance is in the same units as the stored coordinates (degrees),
# because the lines are buffered without being projected first.
PROXIMITY_BUFFER_DEG = 0.1

# (flag column, marker colour, lines) in the order the checks are applied
proximity_layers = [
    ('near_telco_cable', 'purple', telco_cables_dict),
    ('near_gas_pipe', 'blue', gas_pipe_dict),
    ('near_power_cable', 'orange', pow_cables_dict),
]

# Build every buffer once up front. The buffers do not depend on the traffic
# row, so rebuilding them inside the row loop only repeats identical work.
buffered_layers = [
    (
        flag_column,
        marker_color,
        [LineString(line_points).buffer(PROXIMITY_BUFFER_DEG) for line_points in lines_by_name.values()],
    )
    for flag_column, marker_color, lines_by_name in proximity_layers
]

# Rows without coordinates cannot be tested; they keep their initial flag of 0.
located_rows = traffic_df.dropna(subset=['LATITUDE', 'LONGITUDE'])

for row_label, lat, lon in tqdm(
    zip(located_rows.index, located_rows['LATITUDE'], located_rows['LONGITUDE']),
    total=len(located_rows),
):
    point = Point(lon, lat)

    for flag_column, marker_color, line_buffers in buffered_layers:
        for line_buffer in line_buffers:
            if line_buffer.contains(point):
                # One marker is added per matching buffer.
                # NOTE(review): on large inputs this can add a very large
                # number of markers to the map — confirm that is acceptable.
                folium.CircleMarker(
                    location=(lat, lon),
                    radius=1,
                    color=marker_color,
                    fill=True,
                    fill_color=marker_color,
                    fill_opacity=0.8
                ).add_to(m)
                traffic_df.at[row_label, flag_column] = 1

# Rows that matched nothing need no reset: the flag columns were initialised
# to 0 before this loop and are only ever set to 1 here.

In [None]:
# Adding ports to the dataframe.
# Load the port table; later cells read its LATITUDE, LONGITUDE and PORT_NAME
# columns to label traffic rows with a port.
ports = pd.read_csv('World_Port_Index.csv')


In [None]:
# Snap ports and traffic to the same 2-decimal coordinate grid so they can be
# compared for exact equality in the next cell.

# Port coordinates are rounded in place, overwriting the loaded values
for coord_column in ('LATITUDE', 'LONGITUDE'):
    ports[coord_column] = ports[coord_column].round(2)

# Traffic keeps its original coordinates; the rounded copies go to LAT / LONG
for rounded_column, source_column in (('LAT', 'LATITUDE'), ('LONG', 'LONGITUDE')):
    traffic_df[rounded_column] = traffic_df[source_column].round(2)

In [None]:
# Label each traffic row with the port that shares its rounded coordinates.
#
# Build a {(lat, lon): port name} lookup once, instead of scanning the whole
# port table for every traffic row.
port_by_cell = {}
for port_lat, port_lon, port_name in zip(ports['LATITUDE'], ports['LONGITUDE'], ports['PORT_NAME']):
    # A port without coordinates can never equal a traffic position
    if pd.isna(port_lat) or pd.isna(port_lon):
        continue
    # setdefault keeps the first port listed for a cell, matching the
    # first-row choice of a filtered lookup
    port_by_cell.setdefault((port_lat, port_lon), port_name)

# Rows whose rounded coordinates match no port get the 'no_port' placeholder
traffic_df['port'] = [
    port_by_cell.get(cell, 'no_port')
    for cell in zip(traffic_df['LAT'], traffic_df['LONG'])
]

In [None]:
# Save the updated DataFrame to a new CSV file, without the index column.
# NOTE(review): traffic_df still carries the `geometry` column created in the
# spatial-join cell; confirm that its text form in the CSV is wanted.
traffic_df.to_csv('traffic_df.csv', index=False)