In [4]:
import pandas as pd
import numpy as np
import json
import datetime
import zipfile
import os
from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt

# Region of interest: latitude / longitude bounds used by the spatial filter
min_lat, max_lat = 4, 24
min_lon, max_lon = 110, 130
# The boundaries may need adjusting: the box currently barely contains Taiwan
# but takes in a lot of the Philippine Sea.
# NOTE(review): normalizelat/normalizelon are centred on 12 / 120 with
# half-widths 8 / 10, which does not match the 4..24 latitude span here - confirm.

# Remove data outside of our box
def spatial_filter(x, y_0, y_1, x_0, x_1):
    """Return the rows of ``x`` that fall inside a latitude/longitude box.

    Args:
        x: DataFrame with ``'Latitude'`` and ``'Longitude'`` columns.
        y_0, y_1: Lower and upper latitude bounds (both inclusive).
        x_0, x_1: Lower and upper longitude bounds (both inclusive).

    Returns:
        The subset of ``x`` whose coordinates lie within the box.
    """
    inside_lat = x['Latitude'].between(y_0, y_1)
    inside_lon = x['Longitude'].between(x_0, x_1)
    return x[inside_lat & inside_lon]

# Exercise windows as (start, end) UTC timestamps. They are built once at
# definition time instead of being re-parsed on every call, because
# label_exercise is applied to every row of every file.
# The position in this tuple is the label that label_exercise returns.
# NOTE(review): several windows end on the 30th of a 31-day month and the end
# is exclusive, so the last day(s) of those months are not labelled - confirm
# this is intended.
_EXERCISE_WINDOWS = tuple(
    (pd.Timestamp(start, tz="UTC"), pd.Timestamp(end, tz="UTC"))
    for start, end in (
        ("2019-08-01", "2019-08-30"),
        ("2020-10-01", "2020-10-31"),
        ("2021-11-01", "2021-11-30"),
        ("2022-08-01", "2022-08-30"),
        ("2022-04-01", "2022-04-30"),
    )
)


# For labeling rows that occur during an exercise
def label_exercise(date):
    """Return the index of the exercise window containing ``date``.

    Args:
        date: A timestamp string shaped like ``' 2019 Aug 15 12:00:00 UTC'``
            (leading space included), parsed as UTC.

    Returns:
        The zero-based position of the first window for which
        ``start <= date < end``, or ``-1`` when no window matches.
    """
    date = pd.to_datetime(date, format=' %Y %b %d %H:%M:%S UTC', utc=True)
    for label, (start, end) in enumerate(_EXERCISE_WINDOWS):
        if start <= date < end:
            return label
    return -1

def convert_to_datetime(x):
    """Parse a timestamp string such as ``' 2019 Aug 15 12:00:00 UTC'`` as UTC.

    The expected layout has a leading space, a three-letter month name and a
    literal ``UTC`` suffix.
    """
    fix_format = ' %Y %b %d %H:%M:%S UTC'
    parsed = pd.to_datetime(x, utc=True, format=fix_format)
    return parsed

################################ CLEANING CODE ################################

def normalizelat(x, center=12, half_range=8):
    """Rescale a latitude to ``(x - center) / half_range``.

    With the defaults, 12 maps to 0 and 4 / 20 map to -1 / +1.

    Args:
        x: Latitude value (scalar or array-like supporting arithmetic).
        center: Latitude that should map to 0. Defaults to 12.
        half_range: Distance from ``center`` that should map to 1. Defaults to 8.

    Returns:
        The rescaled value.
    """
    # NOTE(review): the defaults describe a 4..20 span, while the region of
    # interest elsewhere in this file runs to latitude 24 - confirm.
    return (x - center) / half_range

def normalizelon(x, center=120, half_range=10):
    """Rescale a longitude to ``(x - center) / half_range``.

    With the defaults, 120 maps to 0 and 110 / 130 map to -1 / +1.

    Args:
        x: Longitude value (scalar or array-like supporting arithmetic).
        center: Longitude that should map to 0. Defaults to 120.
        half_range: Distance from ``center`` that should map to 1. Defaults to 10.

    Returns:
        The rescaled value.
    """
    return (x - center) / half_range

def interpolations(df, maxsog):
    """Annotate a track with heading/speed estimates and return a gap-filled frame.

    Columns are addressed by position, so ``df`` must be laid out as
    ``[<id>, <time>, <latitude>, <longitude>, ...]`` (the loader in this file
    passes MMSI, TimeOfFix, Latitude, Longitude, Exercise). Each row is compared
    with the row below it and the time difference is taken as
    ``time[i] - time[i + 1]``, so gaps are only detected when rows are ordered
    newest-first.

    In-place changes to ``df`` (visible to the caller):
        * ``InterpHeading`` (inserted at position 4): angle in degrees, 0..90,
          from ``arctan2(|dlon|, |dlat|)`` between a row and the next one.
          It is 90 for a pure longitude move and 0 when the position is unchanged.
        * ``InterpSOG`` (position 5): straight-line coordinate distance to the
          next row divided by the elapsed hours.
        * ``ImpossibleLocation`` (position 6): 1 on every row if any computed
          speed is ``>= maxsog``, else 0. Rows after the first such pair keep
          the ``-1.0`` placeholders for heading and speed.
        * The last row copies heading and speed from the row above it.

    Args:
        df: Track to annotate; modified in place.
        maxsog: Speed threshold (same units as ``InterpSOG``) at or above which
            a movement is flagged as impossible.

    Returns:
        A DataFrame in which every gap longer than 1 hour and at most 24 hours
        has been halved repeatedly until each step is at most 1 hour. Inserted
        rows copy the older neighbour's values, with the time and the
        latitude/longitude linearly interpolated. ``df`` itself is returned
        when nothing had to be inserted.
    """
    df.insert(4, 'InterpHeading', -1.0)
    df.insert(5, 'InterpSOG', -1.0)
    row_count = df.shape[0]
    if row_count <= 1:
        # Nothing to compare against; still add the flag so every processed
        # frame ends up with the same set of columns.
        df.insert(6, 'ImpossibleLocation', 0)
        return df

    times = pd.to_datetime(df.iloc[:, 1], utc=True)
    lat = df.iloc[:, 2].to_numpy(dtype=float)
    lon = df.iloc[:, 3].to_numpy(dtype=float)

    # gaps.iloc[i] == time[i] - time[i + 1]; the final entry is NaT.
    gaps = times.diff(-1)
    hours = (gaps / pd.Timedelta(hours=1)).to_numpy()[:-1]
    d_lat = np.abs(lat[:-1] - lat[1:])
    d_lon = np.abs(lon[:-1] - lon[1:])

    # arctan2 is well defined when d_lat == 0 (90 degrees, or 0 for no movement).
    heading = np.degrees(np.arctan2(d_lon, d_lat))
    with np.errstate(divide='ignore', invalid='ignore'):
        # Identical timestamps give inf (moved) or nan (did not move).
        speed = np.hypot(d_lat, d_lon) / hours
        too_fast = np.flatnonzero(speed >= maxsog)

    impossible = too_fast.size > 0
    # Rows up to and including the first impossible pair get real values.
    filled = int(too_fast[0]) + 1 if impossible else row_count - 1
    df.iloc[:filled, 4] = heading[:filled]
    df.iloc[:filled, 5] = speed[:filled]
    df.insert(6, 'ImpossibleLocation', 1 if impossible else 0)  # 0 if false, 1 if true
    df.iloc[-1, 5] = df.iloc[-2, 5]
    df.iloc[-1, 4] = df.iloc[-2, 4]

    # Fill gaps in (1 h, 24 h] with evenly spaced interpolated rows.
    one_hour = pd.Timedelta(hours=1)
    fillable = ((gaps > one_hour) & (gaps <= pd.Timedelta(hours=24))).to_numpy()
    if not fillable.any():
        return df

    time_col, lat_col, lon_col = df.columns[1], df.columns[2], df.columns[3]
    pieces = []
    start = 0
    for pos in map(int, np.flatnonzero(fillable)):
        gap = gaps.iloc[pos]
        # Repeated halving until a step is <= 1 h yields a power-of-two split.
        segments = 2
        while gap / segments > one_hour:
            segments *= 2
        # Fraction of the way from the older row (pos + 1) to the newer row
        # (pos), listed newest-first to match the row order.
        fractions = np.arange(segments - 1, 0, -1) / segments
        older_time = df.iloc[pos + 1, 1]
        filler = df.iloc[[pos + 1] * (segments - 1)].copy()
        filler[time_col] = [older_time + gap * float(f) for f in fractions]
        filler[lat_col] = lat[pos + 1] + (lat[pos] - lat[pos + 1]) * fractions
        filler[lon_col] = lon[pos + 1] + (lon[pos] - lon[pos + 1]) * fractions
        pieces.extend((df.iloc[start:pos + 1], filler))
        start = pos + 1
    pieces.append(df.iloc[start:])
    # One concat at the end instead of one per inserted row.
    return pd.concat(pieces)


# Load and process data
def load_and_process_data(zip_file_path, extract_to):
    """Extract a zip archive and turn every ``MMSI*`` file into a cleaned frame.

    For each file under ``extract_to`` whose name starts with ``'MMSI'`` the
    function reads it as CSV (skipping the first line), keeps the MMSI,
    TimeOfFix, Latitude and Longitude columns, drops rows with an MMSI below
    100000000 and rows with missing or non-numeric coordinates, applies the
    region-of-interest filter, normalises the coordinates, adds the
    ``Exercise`` label, parses ``TimeOfFix`` and runs ``interpolations``.

    Args:
        zip_file_path: Path of the archive to extract.
        extract_to: Directory the archive is extracted into and then scanned.

    Returns:
        A list with one processed DataFrame per matching file, in sorted
        directory / file-name order.
    """
    # Extract files from zip
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    datalist = []  # List of dataframes that we will use to store all of our separate files

    # Iterate over all files in the extracted folder
    for root, directories, filenames in os.walk(extract_to):
        directories.sort()  # makes the traversal order reproducible across platforms
        for file in sorted(filenames):
            if not file.startswith('MMSI'):
                continue
            file_path = os.path.join(root, file)
            df = pd.read_csv(file_path, skiprows=1)
            df = df[['MMSI', 'TimeOfFix', 'Latitude', 'Longitude']]
            df = df[df['MMSI'] >= 100000000]
            # Coerce coordinates to numbers BEFORE the spatial filter: entries
            # such as ' null' make the whole column text, which would break
            # the numeric comparisons and the normalisation below.
            df = df.assign(
                Latitude=pd.to_numeric(df['Latitude'], errors='coerce'),
                Longitude=pd.to_numeric(df['Longitude'], errors='coerce'),
            ).dropna()
            # .copy() so the column assignments below act on an independent frame.
            df = spatial_filter(df, min_lat, max_lat, min_lon, max_lon).copy()
            df['Latitude'] = df['Latitude'].apply(normalizelat)
            df['Longitude'] = df['Longitude'].apply(normalizelon)
            df["Exercise"] = df["TimeOfFix"].apply(label_exercise)
            df["TimeOfFix"] = df["TimeOfFix"].apply(convert_to_datetime)
            processed = interpolations(df, 1)
            # interpolations() annotates df in place; if it also hands back a
            # frame (e.g. one with interpolated rows), keep that one instead.
            datalist.append(df if processed is None else processed)

    print("Data Read and Processed into List")
    return datalist

# Combine the first entry from each dataframe in datalist into one dataframe
def findfirst(inputlist):
    """Stack the first row of every DataFrame in ``inputlist`` into one frame."""
    leading_rows = [frame.head(1) for frame in inputlist]
    return pd.concat(leading_rows)

# Combine all processed data into one massive dataframe
def alldata(inputlist):
    """Concatenate every DataFrame in ``inputlist`` row-wise, keeping each frame's index."""
    combined = pd.concat(inputlist)
    return combined

# Isolate data that doesn't occur during an exercise
def cleardata(inputlist):
    """Concatenate the rows whose ``"Exercise"`` label is -1 (no exercise) from every frame."""
    outside_exercise = [frame[frame["Exercise"] == -1] for frame in inputlist]
    return pd.concat(outside_exercise)

# Isolate exercise data
def exercisedata(inputlist):
    """Concatenate the rows whose ``"Exercise"`` label is not -1 from every frame."""
    during_exercise = [frame[frame["Exercise"] != -1] for frame in inputlist]
    return pd.concat(during_exercise)

# Save dataframes to CSV
def save_to_csv(dataframe, filename):
    """Write ``dataframe`` to ``filename`` as CSV without the index, then report it."""
    dataframe.to_csv(path_or_buf=filename, index=False)
    message = f"Saved {filename}"
    print(message)


In [5]:
########################## LOAD DATA ###########################################
###################### TAKES A LONG TIME #######################################

# Input locations. The defaults are the original author's local paths; when
# replicating, download the files from GitHub and point the SEAVISION_ZIP /
# SEAVISION_EXTRACT_DIR environment variables at your own copies.
zip_file_path = os.environ.get(
    "SEAVISION_ZIP",
    "C:/Users/nitis/Downloads/SeaVision-20240611T210922Z-001.zip",
)
extract_to = os.environ.get(
    "SEAVISION_EXTRACT_DIR",
    "C:/Users/nitis/Downloads/extracted_files",
)

# Fail early with an actionable message instead of deep inside zipfile.
if not os.path.isfile(zip_file_path):
    raise FileNotFoundError(
        f"SeaVision archive not found at {zip_file_path!r}; "
        "set the SEAVISION_ZIP environment variable to the downloaded zip file."
    )

# Process the data
originaldata = load_and_process_data(zip_file_path, extract_to)
first_ships = findfirst(originaldata)
total_ships = alldata(originaldata)
exercise_ships = exercisedata(originaldata)
cleared_ships = cleardata(originaldata)

# Save cleaned dataframes to CSV
save_to_csv(first_ships, "first_ships.csv")
save_to_csv(total_ships, "total_ships.csv")
save_to_csv(exercise_ships, "exercise_ships.csv")
save_to_csv(cleared_ships, "cleared_ships.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/nitis/Downloads/SeaVision-20240611T210922Z-001.zip'