In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the CSV straight into df. read_csv already returns a DataFrame, so the
# former pd.DataFrame(...) wrap was redundant, and the extra `dt` name kept a
# second reference to the full raw frame alive after df was reduced (it was
# also shadowed later by `import datetime as dt`).
df = pd.read_csv("AIS_MMSI_sampled.csv")

In [3]:
# Preview the first five rows (head() defaults to n=5).
display(df.head())

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass
0,368061000,2022-05-22T00:00:01,29.58393,-89.09923,0.0,56.7,53.0,C-CLIPPER,IMO9132167,WCX6887,90.0,0.0,67.0,17.0,4.9,90.0,A
1,316005713,2022-05-22T00:00:05,49.2976,-123.02928,0.1,166.7,511.0,PACIFIC FORCE,,,52.0,0.0,14.0,7.0,2.6,52.0,A
2,367357110,2022-05-22T00:00:01,39.08448,-81.76003,0.0,105.5,511.0,PERE MARQUETTE,,WDE4728,31.0,12.0,42.0,10.0,,57.0,A
3,367781550,2022-05-22T00:00:02,41.40401,-88.21831,5.4,22.4,17.0,GARLAND GASPARD,,WDJ4830,31.0,12.0,,,,31.0,A
4,368731000,2022-05-22T00:00:00,36.88328,-76.35181,0.0,0.0,511.0,CG FRANK DREW,IMO9177258,NKDL,90.0,5.0,53.0,10.0,2.4,90.0,A


In [4]:
# MMSI is kept as the vessel identifier, so VesselName and IMO are dropped, and
# CallSign is not required for the algorithm. Length, Width, Draft and Cargo are
# removed in the same step; none of them is referenced by later cells.
columns_to_drop = ["VesselName", "IMO", "CallSign", "Length", "Width", "Draft", "Cargo"]
df = df.drop(columns=columns_to_drop)

In [5]:
# Count missing values per column.
print(df.isna().sum())

MMSI                      0
BaseDateTime              0
LAT                       0
LON                       0
SOG                       0
COG                       0
Heading                   0
VesselType            38904
Status              3005730
TransceiverClass          0
dtype: int64


In [6]:
# Keep only the rows that have a recorded Status.
status = df[df['Status'].notna()]

In [7]:
# Print the frame that holds only the rows whose Status is present.
print(status)

               MMSI         BaseDateTime       LAT        LON   SOG    COG  \
0         368061000  2022-05-22T00:00:01  29.58393  -89.09923   0.0   56.7   
1         316005713  2022-05-22T00:00:05  49.29760 -123.02928   0.1  166.7   
2         367357110  2022-05-22T00:00:01  39.08448  -81.76003   0.0  105.5   
3         367781550  2022-05-22T00:00:02  41.40401  -88.21831   5.4   22.4   
4         368731000  2022-05-22T00:00:00  36.88328  -76.35181   0.0    0.0   
...             ...                  ...       ...        ...   ...    ...   
11467257  303935004  2022-05-13T22:48:50  13.39938  144.84264   5.4   74.8   
11467258  303935004  2022-05-13T22:51:11  13.40015  144.84582   7.6   79.3   
11467259  303935004  2022-05-13T22:42:31  13.40065  144.84145   2.4  327.0   
11467260  303935004  2022-05-13T22:52:30  13.40059  144.84720   3.5   40.3   
11467261  219349000  2022-05-13T14:16:19  12.29541  141.76559  14.4  150.9   

          Heading  VesselType  Status TransceiverClass  
0     

In [8]:
# Count missing values per column among the rows that have a Status.
print(status.isna().sum())

MMSI                   0
BaseDateTime           0
LAT                    0
LON                    0
SOG                    0
COG                    0
Heading                0
VesselType          4745
Status                 0
TransceiverClass       0
dtype: int64


In [9]:
# Features and labels for the Status classifier, taken from rows with a known Status.
# Columns are selected by name instead of by position (previously iloc[:, 2:7]
# and iloc[:, 8]) so the selection cannot silently shift if the column order changes.
status_feature_cols = ["LAT", "LON", "SOG", "COG", "Heading"]
X_features = status[status_feature_cols].to_numpy()
Y_features = status["Status"].to_numpy()

# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_features, Y_features, test_size=0.3, random_state=101
)

In [10]:
# 1-nearest-neighbour classifier for Status, fitted once on the training split.
# (The model was previously fitted twice in a row, repeating the work for no effect.)
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, Y_train)

KNeighborsClassifier(n_neighbors=1)

In [11]:
# Predict Status for the held-out test features with the fitted classifier.
pred = clf.predict(X_test)

In [12]:
# Fraction of held-out rows whose predicted Status equals the recorded Status.
# accuracy_score is imported in the notebook's top import cell rather than here.
accuracy = accuracy_score(Y_test, pred)
print("\nAccuracy Of KNN For The Given Dataset : ", accuracy)


Accuracy Of KNN For The Given Dataset :  0.8848632635534931


In [13]:
# Feature matrix handed to the classifier for Status prediction. Columns are
# selected by name rather than by position (previously iloc[:, 2:7]).
# NOTE(review): this covers every row of df, not only rows with a missing Status.
missing_status_values = df[["LAT", "LON", "SOG", "COG", "Heading"]].to_numpy()

In [14]:
# Predict a Status for each row of missing_status_values.
# NOTE(review): missing_status_values is built from all rows of df, so rows that
# already have a Status are predicted as well.
status_predicted =  clf.predict(missing_status_values)

In [15]:
# Wrap the predictions in a Series that carries df's own row index. fillna aligns
# fill values by index label, so an explicit index keeps each prediction on its
# row even if df does not have a default 0..n-1 index (and raises early if the
# number of predictions does not match the number of rows).
impute_status = pd.Series(status_predicted, index=df.index)

In [16]:
# Display the Status column as it stands before missing values are filled.
df['Status']

0            0.0
1            0.0
2           12.0
3           12.0
4            5.0
            ... 
11467257    15.0
11467258    15.0
11467259    15.0
11467260    15.0
11467261     0.0
Name: Status, Length: 11467262, dtype: float64

In [17]:
# Display the predicted Status series used as fill values.
impute_status

0            0.0
1            0.0
2           12.0
3           12.0
4            5.0
            ... 
11467257    15.0
11467258    15.0
11467259    15.0
11467260    15.0
11467261     0.0
Length: 11467262, dtype: float64

In [18]:
# Fill missing Status values with the KNN predictions (matched on the row index).
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place call is deprecated in recent pandas
# and leaves df unchanged when Copy-on-Write is enabled.
df['Status'] = df['Status'].fillna(impute_status)

In [19]:
# Spot-check a single cell: row position 40, column position 8
# (Status, per the column order printed by the earlier null-count cell).
df.iloc[40,8]

5.0

In [20]:
# Rows that lack a VesselType, and the number of distinct MMSI values among them.
vessel_type_missing = df['VesselType'].isna()
missing_vessel_types = df.loc[vessel_type_missing]
print(missing_vessel_types["MMSI"].nunique())

109


In [21]:
# Drop every row that still has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [22]:
# Re-count missing values per column after the rows with gaps were dropped.
print(df.isna().sum())

MMSI                0
BaseDateTime        0
LAT                 0
LON                 0
SOG                 0
COG                 0
Heading             0
VesselType          0
Status              0
TransceiverClass    0
dtype: int64


In [23]:
# Number of distinct TransceiverClass values.
n_transceiver_classes = df["TransceiverClass"].nunique()
print(n_transceiver_classes)

2


In [24]:
# Encode the transceiver class as an integer: "A" -> 1, "B" -> 0.
transceiver_codes = {"A": 1, "B": 0}
df['TransceiverClass'] = df['TransceiverClass'].map(transceiver_codes)

In [25]:
# Split the timestamp string (e.g. "2022-05-22T00:00:01") at the first "T" into
# a Date part and a Time part.
# `n` is passed by keyword: pandas 2.0+ accepts only `pat` positionally, so the
# former positional form str.split('T', 1, ...) raises a TypeError there.
df[['Date', 'Time']] = df['BaseDateTime'].str.split('T', n=1, expand=True)

In [26]:
import datetime as dt

In [27]:
# Replace the date string with its day-of-month number.
df['Date'] = pd.to_datetime(df['Date']).dt.day

In [28]:
def _clock_to_minutes(clock):
    """Convert an "HH:MM:SS" string to minutes since midnight (seconds become a fraction)."""
    parts = clock.split(':')
    return int(parts[0]) * 60 + int(parts[1]) + int(parts[2]) / 60


# Time of day expressed in minutes since midnight.
df['Time (mins)'] = df['Time'].map(_clock_to_minutes)

In [29]:
# The raw timestamp and the intermediate Time string are no longer needed
# once Date and Time (mins) have been derived.
df = df.drop(['BaseDateTime', 'Time'], axis=1)

In [30]:
# Preview the first five rows of the cleaned frame (head() defaults to n=5).
df.head()

Unnamed: 0,MMSI,LAT,LON,SOG,COG,Heading,VesselType,Status,TransceiverClass,Date,Time (mins)
0,368061000,29.58393,-89.09923,0.0,56.7,53.0,90.0,0.0,1,22,0.016667
1,316005713,49.2976,-123.02928,0.1,166.7,511.0,52.0,0.0,1,22,0.083333
2,367357110,39.08448,-81.76003,0.0,105.5,511.0,31.0,12.0,1,22,0.016667
3,367781550,41.40401,-88.21831,5.4,22.4,17.0,31.0,12.0,1,22,0.033333
4,368731000,36.88328,-76.35181,0.0,0.0,511.0,90.0,5.0,1,22,0.0


In [32]:
# Write the cleaned frame to disk without the row index.
df.to_csv(path_or_buf='Cleaned_AIS_data.csv', index=False)