In [5]:
import pandas as pd
import numpy as np
import json
import datetime
import zipfile
import os
from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt
from geopy.distance import geodesic

# Region of interest
min_lat = 4
max_lat = 24
min_lon = 110
max_lon = 130
# May need to adjust boundaries. Currently barely contains Taiwan, but has a lot of the Philippine Sea

# Remove data outside of our box
def spatial_filter(x, y_0, y_1, x_0, x_1):
    return x[(x['Latitude'] >= y_0) & (x['Latitude'] <= y_1) & (x['Longitude'] >= x_0) & (x['Longitude'] <= x_1)]

# For labeling rows that occur during an exercise
def label_exercise(date):
    date = pd.to_datetime(date, format=' %Y %b %d %H:%M:%S UTC', utc=True)
    date_ranges = [
        [pd.to_datetime("2019-08-01 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True), pd.to_datetime("2019-08-30 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True)],
        [pd.to_datetime("2020-10-01 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True), pd.to_datetime("2020-10-31 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True)],
        [pd.to_datetime("2021-11-01 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True), pd.to_datetime("2021-11-30 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True)],
        [pd.to_datetime("2022-08-01 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True), pd.to_datetime("2022-08-30 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True)],
        [pd.to_datetime("2022-04-01 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True), pd.to_datetime("2022-04-30 00:00:00", format='%Y-%m-%d %H:%M:%S', utc=True)]
    ]
    for i in range(len(date_ranges)):
        if date >= date_ranges[i][0] and date < date_ranges[i][1]:
            return i
    return -1

def convert_to_datetime(x):
    return pd.to_datetime(x, format=' %Y %b %d %H:%M:%S UTC', utc=True)

################################ CLEANING CODE ################################

# Load and process data
def load_and_process_data(zip_file_path, extract_to):
    # Extract files from zip
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    
    datalist = []  # List of dataframes that we will use to store all of our separate files

    # Iterate over all files in the extracted folder
    for root, directories, filenames in os.walk(extract_to):
        for file in filenames:
            file_path = os.path.join(root, file)
            df = pd.read_csv(file_path)
            df = df[['MMSI', 'TimeOfFix', 'Latitude', 'Longitude']]
            df.dropna(inplace = True)
            df = spatial_filter(df, min_lat, max_lat, min_lon, max_lon)
            df = df[df['MMSI'] >= 100000000]
            df = df[df['Latitude'] != ' null']
            df = df[df['Longitude'] != ' null']
            df.index = df.pop('MMSI')
            df["Exercise"] = df["TimeOfFix"].apply(label_exercise)
            df["TimeOfFix"] = df["TimeOfFix"].apply(convert_to_datetime)
            datalist.append(df)

    print("Data Read and Processed into List")
    return datalist

# Combine the first entry from each dataframe in datalist into one dataframe
def findfirst(inputlist):
    templist = []
    for x in inputlist:
        templist.append(x.head(1))
    return pd.concat(templist)

# Combine all processed data into one massive dataframe
def alldata(inputlist):
    return pd.concat(inputlist)

# Isolate data that doesn't occur during an exercise
def cleardata(inputlist):
    templist = []
    for x in inputlist:
        x = x[x["Exercise"] == -1]
        templist.append(x)
    return pd.concat(templist)

# Isolate exercise data
def exercisedata(inputlist):
    templist = []
    for x in inputlist:
        x = x[x["Exercise"] != -1]
        templist.append(x)
    return pd.concat(templist)

# Save dataframes to CSV
def save_to_csv(dataframe, filename):
    dataframe.to_csv(filename, index=False)
    print(f"Saved {filename}")

########################## LOAD DATA ###########################################
###################### TAKES A LONG TIME #######################################

# Example usage - replace if you are replicating (download files on github and copy in local file paths)
zip_file_path = "C:/Users/nitis/Downloads/SeaVision-20240611T210922Z-001.zip"
extract_to = "C:/Users/nitis/Downloads/extracted_files"   


# Process the data
originaldata = load_and_process_data(zip_file_path, extract_to)
first_ships = findfirst(originaldata)
total_ships = alldata(originaldata)
exercise_ships = exercisedata(originaldata)
cleared_ships = cleardata(originaldata)

# Save cleaned dataframes to CSV
save_to_csv(first_ships, "first_ships.csv")
save_to_csv(total_ships, "total_ships.csv")
save_to_csv(exercise_ships, "exercise_ships.csv")
save_to_csv(cleared_ships, "cleared_ships.csv")

Data Read and Processed into List
Saved first_ships.csv


  return pd.concat(templist)
  return pd.concat(inputlist)
  return pd.concat(templist)
  return pd.concat(templist)


Saved total_ships.csv
Saved exercise_ships.csv
Saved cleared_ships.csv
