In [1]:
import pandas as pd
import numpy as np
import json
import datetime
from google.colab import files
import zipfile
import os
from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt
from geopy.distance import geodesic

# geodesic (imported above) is needed if we want to normalize to the globe

ModuleNotFoundError: No module named 'google.colab'

In [None]:
#Special thanks to Scott
#Region of interest: (min, max) bounds of the latitude/longitude box
min_lat, max_lat = 4, 26
min_lon, max_lon = 110, 130
#May need to adjust boundaries. Currently barely contains Taiwan, but has a lot of the Philippine Sea

#Remove data outside of our box
def spatial_filter(x, y_0, y_1, x_0, x_1):
  """Return the rows of `x` whose position lies inside a latitude/longitude box.

  Parameters:
    x:   DataFrame with 'Latitude' and 'Longitude' columns.
    y_0: minimum latitude (inclusive).
    y_1: maximum latitude (inclusive).
    x_0: minimum longitude (inclusive).
    x_1: maximum longitude (inclusive).

  Returns:
    The boolean-mask selection of `x` containing only rows inside the box.
    Rows whose latitude or longitude is NaN fail every comparison and are
    therefore excluded.
  """
  # Use the bounds that were passed in. Previously these arguments were
  # ignored and the module-level min_lat/max_lat/min_lon/max_lon were always
  # used, so calling with any other box silently had no effect.
  lat_ok = (x['Latitude'] >= y_0) & (x['Latitude'] <= y_1)
  lon_ok = (x['Longitude'] >= x_0) & (x['Longitude'] <= x_1)
  return x[lat_ok & lon_ok]

# for labeling rows that occur during an exercise
# Exercise windows as (start, end) UTC timestamps. Built once here instead of
# on every call, because label_exercise is applied to the data row by row.
# The position of a window in this tuple is the label that gets returned, so
# do not reorder the entries.
_EXERCISE_DATE_RANGES = tuple(
    (pd.to_datetime(start, format='%Y-%m-%d %H:%M:%S', utc = True),
     pd.to_datetime(end, format='%Y-%m-%d %H:%M:%S', utc = True))
    for start, end in (
        ("2019-08-01 00:00:00", "2019-08-30 00:00:00"),
        ("2020-10-01 00:00:00", "2020-10-31 00:00:00"),
        ("2021-11-01 00:00:00", "2021-11-30 00:00:00"),
        ("2022-08-01 00:00:00", "2022-08-30 00:00:00"),
        ("2022-04-01 00:00:00", "2022-04-30 00:00:00"),
    )
)

def label_exercise(date):
  """Return the index of the exercise window that contains `date`, or -1.

  Parameters:
    date: timestamp text in the ' %Y %b %d %H:%M:%S UTC' layout (note the
          leading space), e.g. ' 2019 Aug 15 12:00:00 UTC'.

  Returns:
    The 0-based position of the first window in _EXERCISE_DATE_RANGES for
    which start <= date < end, or -1 when the date is in no window.
  """
  date = pd.to_datetime(date, format=' %Y %b %d %H:%M:%S UTC', utc = True)
  # NOTE(review): the end bound is exclusive, so the listed end day itself
  # (e.g. 2019-08-30) is not labelled as part of the exercise -- confirm
  # this is intended.
  for label, (start, end) in enumerate(_EXERCISE_DATE_RANGES):
    if start <= date < end:
      return label
  return -1

def convert_to_datetime(x):
  """Parse ' YYYY Mon DD HH:MM:SS UTC' text (leading space included) into a UTC-aware datetime."""
  fix_format = ' %Y %b %d %H:%M:%S UTC'
  parsed = pd.to_datetime(x, utc = True, format=fix_format)
  return parsed

In [None]:
################################ CLEANING CODE ################################

#loads data into a list of dataframes
def load_and_process_data():
  """Upload zipped seavision CSV files and return one cleaned DataFrame per file.

  Steps:
    1. Prompt for an upload through google.colab's files.upload() and extract
       every uploaded .zip into /content/uploaded_folder.
    2. Read every file found under that folder with pd.read_csv. This
       includes anything left there by earlier runs.
    3. Drop rows missing any required column, keep only rows inside the
       region-of-interest box, drop rows whose SOG or Heading is ' null',
       index by MMSI, add an 'Exercise' label column and convert
       'TimeOfFix' to UTC datetimes.

  Returns:
    A list of DataFrames, one per file read.
  """
  folder_path = '/content/uploaded_folder'
  required_columns = ['MMSI', 'TimeOfFix', 'Latitude', 'Longitude', 'SOG', 'Heading']

  #upload a .zip file of seavision data
  uploaded = files.upload()
  #extract files from zip
  for file_name in uploaded.keys():
    if file_name.endswith('.zip'):
      with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall(folder_path)

  datalist = [] #list of dataframes that we will use to store all of our separate files
  #iterate over all files in the uploaded folder
  for root, directories, filenames in os.walk(folder_path):
    for file in filenames:
      file_path = os.path.join(root, file)
      df = pd.read_csv(file_path)
      df = df.dropna(subset = required_columns)
      # A ' null' entry makes read_csv load the column as text, and comparing
      # text with the numeric bounds raises TypeError. Coerce the coordinates
      # to numbers first; unparseable entries become NaN and are removed by
      # the spatial filter. Columns that are already numeric are unchanged.
      df = df.assign(
          Latitude = pd.to_numeric(df['Latitude'], errors = 'coerce'),
          Longitude = pd.to_numeric(df['Longitude'], errors = 'coerce'),
      )
      df = spatial_filter(df, min_lat, max_lat, min_lon, max_lon)
      # SOG and Heading keep whatever dtype read_csv gave them; only the
      # ' null' placeholder rows are removed. Copy so the assignments below
      # write to an independent frame rather than a slice of the raw data.
      df = df[(df['SOG'] != ' null') & (df['Heading'] != ' null')].copy()
      df.index = df.pop('MMSI')
      df["Exercise"] = df["TimeOfFix"].apply(label_exercise)
      # Convert the whole column in one call instead of row by row.
      df["TimeOfFix"] = convert_to_datetime(df["TimeOfFix"])
      #add dataframe to list
      datalist.append(df)
  print("Data Read and Processed into List")

  return datalist

#takes the first entry from each dataframe in the list and combines them into one new dataframe
def findfirst(inputlist):
  """Concatenate the first row (head(1)) of every DataFrame in `inputlist`."""
  first_rows = [frame.head(1) for frame in inputlist]
  return pd.concat(first_rows)

# combines all processed data into one massive dataframe
def alldata(inputlist):
  """Stack every DataFrame in `inputlist` into a single DataFrame."""
  combined = pd.concat(inputlist)
  return combined

#isolates data that doesn't occur during an exercise
def cleardata(inputlist):
  """Concatenate the rows of each DataFrame whose 'Exercise' label is -1."""
  non_exercise = [frame[frame["Exercise"] == -1] for frame in inputlist]
  return pd.concat(non_exercise)

#isolates exercise data
def exercisedata(inputlist):
  """Concatenate the rows of each DataFrame whose 'Exercise' label is not -1."""
  during_exercise = [frame[frame["Exercise"] != -1] for frame in inputlist]
  return pd.concat(during_exercise)

In [None]:
########################## LOAD DATA ###########################################
###################### TAKES A LONG TIME #######################################

# List of cleaned per-file dataframes; everything below is derived from it.
originaldata = load_and_process_data()

# Combined views, each built independently from originaldata.
total_ships = alldata(originaldata)
first_ships = findfirst(originaldata)

# Split by whether the row falls inside an exercise window.
cleared_ships = cleardata(originaldata)
exercise_ships = exercisedata(originaldata)