In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
import numpy as np

# Load data
data = pd.read_csv('ship_data.csv')  # Replace with the actual path to the CSV file

# Convert BaseDateTime to datetime object
data['BaseDateTime'] = pd.to_datetime(data['BaseDateTime'])

# Extract time-based features
data['hour'] = data['BaseDateTime'].dt.hour
data['day_of_week'] = data['BaseDateTime'].dt.dayofweek
data['month'] = data['BaseDateTime'].dt.month

# Encode Status
label_encoder_status = LabelEncoder()
data['status_encoded'] = label_encoder_status.fit_transform(data['Status'])


features = ['SOG', 'COG', 'Heading', 'status_encoded', 'hour', 'day_of_week', 'month']
X = data[features]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train Isolation Forest
iso_forest = IsolationForest(contamination=0.01, random_state=42)
iso_forest.fit(X_scaled)

# Predict anomalies
data['anomaly'] = iso_forest.predict(X_scaled)  # -1 for anomaly, 1 for normal
anomalies = data[data['anomaly'] == -1]

# Output the anomalies
print(anomalies)

              MMSI        BaseDateTime       LAT        LON   SOG    COG  \
81       367306860 2024-01-01 00:00:03  30.39124  -81.62488  14.7   19.5   
151      316052438 2024-01-01 00:00:00  49.28735 -123.40837  36.9   76.5   
206      367431620 2024-01-01 00:00:03  41.10692  -73.05079   9.5   51.6   
223      366969980 2024-01-01 00:00:04  37.80301 -122.38852   8.5  165.0   
244      367652650 2024-01-01 00:00:04  33.39768 -118.31364  17.3   16.4   
...            ...                 ...       ...        ...   ...    ...   
7296269  564519000 2024-01-01 23:37:57  13.77840  144.58255  13.4    0.6   
7296270  564519000 2024-01-01 23:46:07  13.80869  144.58286  13.3    0.6   
7296271  564519000 2024-01-01 23:47:16  13.81304  144.58292  13.4    0.7   
7296272  564519000 2024-01-01 23:48:27  13.81743  144.58299  13.5    0.9   
7296273  564519000 2024-01-01 23:53:06  13.83440  144.58546  12.9   21.5   

         Heading      VesselName         IMO CallSign  ...  Length  Width  \
81        

In [2]:
anomalies.to_csv('anomalies.csv', index=False)  # Save the anomalies to a CSV file

In [10]:
type_1 = anomalies[anomalies['MMSI']==367145870]

In [4]:
type_1

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,...,Length,Width,Draft,Cargo,TransceiverClass,hour,day_of_week,month,status_encoded,anomaly
1063,367145870,2024-01-01 00:00:00,41.62224,-70.28484,33.8,58.7,60.0,IYANOUGH,IMO9375719,WDD4527,...,47.0,12.0,1.6,49.0,A,0,0,1,8,-1
7679,367145870,2024-01-01 00:01:02,41.62768,-70.27387,32.1,48.7,37.0,IYANOUGH,IMO9375719,WDD4527,...,47.0,12.0,1.6,49.0,A,0,0,1,8,-1
14817,367145870,2024-01-01 00:02:04,41.63533,-70.27015,17.1,9.2,10.0,IYANOUGH,IMO9375719,WDD4527,...,47.0,12.0,1.6,49.0,A,0,0,1,8,-1
17597,367145870,2024-01-01 00:03:06,41.63770,-70.26955,7.8,11.6,12.0,IYANOUGH,IMO9375719,WDD4527,...,47.0,12.0,1.6,49.0,A,0,0,1,8,-1
23088,367145870,2024-01-01 00:04:08,41.63995,-70.26903,8.0,8.4,9.0,IYANOUGH,IMO9375719,WDD4527,...,47.0,12.0,1.6,49.0,A,0,0,1,8,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7268693,367145870,2024-01-01 23:25:42,41.32407,-70.10913,33.7,340.3,345.0,IYANOUGH,IMO9375719,WDD4527,...,47.0,12.0,1.6,49.0,A,23,0,1,8,-1
7276157,367145870,2024-01-01 23:37:04,41.42372,-70.16091,34.2,339.8,342.0,IYANOUGH,IMO9375719,WDD4527,...,47.0,12.0,1.6,49.0,A,23,0,1,8,-1
7281996,367145870,2024-01-01 23:43:16,41.47645,-70.19625,34.5,331.1,333.0,IYANOUGH,IMO9375719,WDD4527,...,47.0,12.0,1.6,49.0,A,23,0,1,8,-1
7283910,367145870,2024-01-01 23:49:28,41.52752,-70.23509,34.0,328.1,331.0,IYANOUGH,IMO9375719,WDD4527,...,47.0,12.0,1.6,49.0,A,23,0,1,8,-1


In [13]:
type_1.to_csv('type_3.csv', index=False)  # Save the anomalies to a CSV file