In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import folium
import json
import random
import requests
import ast
import re
import time
import datetime
from tqdm import tqdm
from shapely.geometry import Point, LineString, mapping
from sklearn.cluster import DBSCAN, KMeans
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.metrics import silhouette_score
from scipy.stats import skew
from scipy.spatial import ConvexHull
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Preprocessing for clustering #

In [None]:
# Load the feature table that was prepared for clustering.
# The path is relative to the notebook's working directory.
df = pd.read_csv('final_to_cluster.csv')

In [None]:
# Cast MMSI_CHANGES to int so it can be treated as a numeric feature
# (it is part of numerical_cols, defined in a later cell).
df["MMSI_CHANGES"] = df["MMSI_CHANGES"].astype(int)

# Drop the columns that are not used for clustering.
# A single drop() call replaces six chained calls: one copy of the frame instead
# of six, and if any column is missing nothing is dropped before the KeyError.
# NOTE: this cell is not idempotent - re-running it without reloading `df`
# raises KeyError because the columns are already gone.
df = df.drop(
    columns=["BUILT", "FLAGNAME", "Name", "Name_null", "IMO_null", "IMO"]
)

In [None]:
# Numerical features used for scaling and clustering
# (boolean columns are included and handled as numbers).
numerical_cols = [
    # *_slow columns
    'counts_slow_speed',
    'sum_telco_slow',
    'sum_power_slow',
    'sum_gas_slow',
    # *_null columns
    'counts_null_speed',
    'sum_telco_null',
    'sum_power_null',
    'sum_gas_null',
    # *_cc columns
    'counts_cc',
    'sum_telco_cc',
    'sum_power_cc',
    'sum_gas_cc',
    # *_zz and *_dark columns
    'counts_zz',
    'counts_dark',
    'sum_telco_zz',
    'sum_power_zz',
    'sum_gas_zz',
    'sum_telco_dark',
    'sum_power_dark',
    'sum_gas_dark',
    # remaining columns that are not power-transformed
    'MMSI_CHANGES',
    'OLDER15Y',
    'FOC',
]


In [None]:
# Columns that receive the power transform
# (every entry of numerical_cols except MMSI_CHANGES, OLDER15Y and FOC).
transform_cols = [
    # *_slow columns
    'counts_slow_speed',
    'sum_telco_slow',
    'sum_power_slow',
    'sum_gas_slow',
    # *_null columns
    'counts_null_speed',
    'sum_telco_null',
    'sum_power_null',
    'sum_gas_null',
    # *_cc columns
    'counts_cc',
    'sum_telco_cc',
    'sum_power_cc',
    'sum_gas_cc',
    # *_zz and *_dark columns
    'counts_zz',
    'counts_dark',
    'sum_telco_zz',
    'sum_power_zz',
    'sum_gas_zz',
    'sum_telco_dark',
    'sum_power_dark',
    'sum_gas_dark',
]

In [None]:


# Apply a Yeo-Johnson power transform to the columns listed in transform_cols.
# PowerTransformer estimates one lambda per column, so a single fit over the
# whole column block yields the same values as fitting column by column, while
# keeping every fitted lambda available in `pt.lambdas_` (a per-column loop
# re-fits `pt` on each pass and only retains the last column's parameters).
# NOTE: the transformed values overwrite the originals in `df`; re-running this
# cell would transform already-transformed data, so reload `df` first.
pt = PowerTransformer(method='yeo-johnson')
df[transform_cols] = pt.fit_transform(df[transform_cols])

In [None]:
# Check the skewness of the numerical columns (one value per column);
# at this point the columns in transform_cols have already been power-transformed.
skewness = df[numerical_cols].skew()
print(skewness)

In [None]:
# Min-max scale the numerical columns (scaler left at its default settings).
# `scaler` is kept so the fitted ranges stay available after this cell.
scaler = MinMaxScaler()
scaler.fit(df[numerical_cols])
X_scaled = scaler.transform(df[numerical_cols])

# K-Means #

In [None]:
# Choose k for K-Means: for every candidate k, fit a model on the scaled
# features and record its inertia (elbow method) and its silhouette score.
k_values = range(2, 10)  # candidate k values: 2 through 9
inertia, silhouette_scores = [], []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    clusters = kmeans.labels_  # labels assigned during the fit
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, clusters))

# One figure with two panels: elbow curve (left) and silhouette curve (right)
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
elbow_ax, silhouette_ax = axs

elbow_ax.plot(k_values, inertia, marker='o')
elbow_ax.set(
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    title='Elbow Method',
)

silhouette_ax.plot(k_values, silhouette_scores, marker='o', color='orange')
silhouette_ax.set(
    xlabel='Number of Clusters (k)',
    ylabel='Silhouette Score',
    title='Silhouette Scores',
)

# Tighten the spacing between the panels, then render
plt.tight_layout()
plt.show()

In [None]:
# Final K-Means model with two clusters, fitted on the scaled features.
# The integer labels (0 / 1) are stored on `df` in the 'cluster' column.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_scaled)
clusters = kmeans.labels_
df['cluster'] = clusters

In [None]:
# Inspect the K-Means result: per-cluster mean of every numeric column.
# numeric_only=True is passed explicitly because, on pandas >= 2.0, mean()
# raises a TypeError if the frame still holds any non-numeric column (for
# example string label columns when this cell is re-run later in the session)
# instead of silently skipping it.
summary = df.groupby('cluster').mean(numeric_only=True)
print(summary)

# Hierarchical clustering #

In [None]:
# Hierarchical (agglomerative) clustering on X_scaled, the same min-max scaled
# matrix that was used for K-Means.

# Step 1 - build the linkage matrix with Ward's method
Z = linkage(X_scaled, method='ward')

# Step 2 - draw the dendrogram on an explicit 12x6 figure
dendro_fig, dendro_ax = plt.subplots(figsize=(12, 6))
dendrogram(Z, ax=dendro_ax)
dendro_ax.set_title('Hierarchical Clustering Dendrogram')
dendro_ax.set_xlabel('Sample index')
dendro_ax.set_ylabel('Distance')
plt.show()

# Step 3 - cut the tree into at most two flat clusters and store the labels on df
clusters = fcluster(Z, t=2, criterion='maxclust')
df['hierarchical_cluster'] = clusters

In [None]:
# Inspect the hierarchical result: per-cluster mean of every numeric column.
# numeric_only=True is passed explicitly because, on pandas >= 2.0, mean()
# raises a TypeError if the frame holds any non-numeric column (for example the
# 'cluster' column once its ids have been replaced by string labels) instead of
# silently skipping it.
summary_h = df.groupby('hierarchical_cluster').mean(numeric_only=True)
print(summary_h)

In [None]:
# K-Means 'cluster' column: 1 -> "High", 0 -> "Low"
# NOTE(review): cluster ids carry no inherent meaning, so this mapping is only
# valid for the fit it was chosen for - confirm against the per-cluster summary
# that id 1 really is the "High" group before relying on these labels.
df['cluster'] = df['cluster'].replace({1: 'High', 0: 'Low'})

# 'hierarchical_cluster' column: 2 -> "High", 1 -> "Low"
df['hierarchical_cluster'] = df['hierarchical_cluster'].replace({1: 'Low', 2: 'High'})

In [None]:
# Save the clustering results to a new CSV file (row index not written).
# The feature columns in `df` hold the power-transformed values, not the raw ones.
df.to_csv('clustering_results.csv', index=False)

# Validation #

In [None]:
# Load the main dataset.
# Only MMSI, IMO and NAME are used downstream (the next cell keeps exactly these
# three columns), so only they are parsed instead of reading the whole file.
master_df = pd.read_csv('traffic_df.csv', usecols=['MMSI', 'IMO', 'NAME'])

In [None]:
# Restrict the main dataset to the vessel identifier columns, in this order
master_df = master_df.loc[:, ['MMSI', 'IMO', 'NAME']]

In [None]:
# Keep the first row of every distinct (MMSI, IMO) pair.
# An MMSI that appears with more than one IMO still has one row per IMO here.
master_df = master_df.drop_duplicates(subset=['MMSI', 'IMO'], keep='first')

In [None]:
# Attach IMO and NAME from the main dataset to the clustered vessels.
# Inner join on MMSI: only vessels present in both frames are kept.
merged_df = master_df.merge(df, how="inner", on="MMSI")

In [None]:
# IMO numbers of EU-sanctioned vessels, used to validate the clustering results.
# NOTE(review): this is a hard-coded list; its source and retrieval date are not
# recorded in this notebook - TODO add provenance.
IMO_sanctioned = [
    9179842, 8517839, 9915090, 9915105, 9353113, 9289520, 9402471, 9339337, 9318553, 9318541,
    9333436, 9161003, 9358010, 9763837, 9763849, 9830769, 9259185, 9079169, 9331141, 9394935,
    9331153, 9144782, 9277735, 9312884, 9270529, 9312896, 9341067, 9299666, 9256602, 9273052,
    9323376, 9737187, 9413547, 9296391, 9274331, 9274343, 9385233, 9826902, 9511533, 9341081,
    9306794, 9292204, 9299692, 9413561, 9333424, 9412347, 9610781, 9299898, 9422445, 9421972,
    9265756, 9271585, 9842188, 9511521, 9412335, 9316127, 9327372, 9436941, 9849887, 9276030,
    9610793, 9577082, 9577094, 9522324, 9382798, 9354301, 9354313, 9235713, 9321847, 9513139,
    9321976, 9291250, 9866380, 9341079, 9281683, 9953535, 9953523, 9953509, 9953511, 9301419,
    9412359, 9411020, 9901037, 9301421, 9301407, 9301380, 9301392, 9231212, 9350654, 9332822,
    9251274, 9319703, 9404948, 9292503, 9346732, 9288710, 9288746, 9281891, 9276561, 9288708,
    9321706, 9327413, 9332810, 9288693, 9843560, 9306782, 9258002, 9299719, 9256066, 9339301,
    9322956, 9309576, 9329667, 9421960, 9752084, 9309588, 9258868, 9288851, 9194983, 9271951,
    9332781, 9282479, 9281011, 9321689, 9255660, 9285835, 9318539, 9299769, 9258026, 9308065,
    9409467, 9266475, 9209972, 9216298, 9360130, 9418494, 9286657, 9353096, 9413573, 9243148,
    9286073, 9610810, 9282493, 9333400, 9388792, 9419137, 9610808, 9247431, 9288899, 9544281,
    9317949, 9288722, 9397559
]

In [None]:
# Cross-reference the sanctioned IMO numbers with the merged dataframe.
# IMO is coerced to numeric first so the match also works when the CSV column
# was parsed as strings/objects (an integer list never matches string values);
# entries that cannot be parsed become NaN and therefore never match.
imo_numeric = pd.to_numeric(merged_df['IMO'], errors='coerce')

# .copy() detaches the result from merged_df, so later column assignments on
# sanctioned_df do not trigger SettingWithCopyWarning.
sanctioned_df = merged_df[imo_numeric.isin(IMO_sanctioned)].copy()

In [None]:
# Show the identifiers and both cluster labels of the sanctioned vessels found in the data
sanctioned_df.loc[:, ['MMSI', 'IMO', 'NAME', 'cluster', 'hierarchical_cluster']]