In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input, Masking
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Read the cleaned vessel-call CSV and discard the vessel-name and timestamp
# columns in the same step.
df = pd.read_csv('../data/cleaned/cleaned_vessel_calls.csv').drop(
    columns=['vessel.vesselName', 'Timestamp']
)

In [None]:
# Convert both date columns to pandas datetimes.
for date_column in ("arrivalDate", "sailingDate"):
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
# Order the calls by vessel (IMO) and, within each vessel, by arrival date.
df = df.sort_values(by=["IMO", "arrivalDate"])

In [None]:
# Apply the (IMO, arrivalDate) ordering so the row-wise shifts below walk each
# vessel's calls in arrival order.
df = df.sort_values(by=["IMO", "arrivalDate"])

# For every call, look up the unlocode of the following row of the same IMO.
# NOTE(review): the following row is used regardless of its Event_Type; rows
# with Event_Type 'ARRIVAL' are only filtered out in a later cell -- confirm
# this is the intended pairing of "current" and "next" port.
next_port_by_vessel = df.groupby('IMO')['place.unlocode'].shift(-1)
df['next_unlocode'] = next_port_by_vessel

# A row is the last one of its IMO when the row after it carries a different
# IMO (the final row of the frame compares against NaN and is flagged too).
following_imo = df['IMO'].shift(-1)
df['is_last_imo'] = df['IMO'].ne(following_imo)

# Keep only the rows that are not the last of their IMO.
df_cleaned = df.loc[~df['is_last_imo']]

In [None]:
# Remove the helper flag column and display the resulting frame.
df_cleaned = df_cleaned.drop('is_last_imo', axis=1)
df_cleaned

In [None]:
# Keep every row whose Event_Type is anything other than 'ARRIVAL'.
not_arrival = df_cleaned['Event_Type'].ne('ARRIVAL')
df_cleaned = df_cleaned.loc[not_arrival]
df_cleaned

In [None]:
# Remove the Event_Type column from the working frame.
df_cleaned = df_cleaned.drop('Event_Type', axis=1)

In [None]:
# Summarise place.placeType: the number of rows per value and the distinct values.
place_type_column = df_cleaned['place.placeType']
place_type_counts = place_type_column.value_counts()
place_types = place_type_column.unique()
place_type_counts, place_types

In [None]:
# Discard the two date columns and display the frame.
df_cleaned = df_cleaned.drop(['arrivalDate', 'sailingDate'], axis=1)
df_cleaned

In [None]:
# Strip the 'place.' / 'vessel.' prefixes from the columns listed here, then
# continue under the name `df` (the same object as df_cleaned, not a copy).
short_names = {
    'place.unlocode': 'unlocode',
    'vessel.vesselType': 'vesselType',
    'place.placeType': 'placeType',
    'place.countryName': 'countryName',
}
df_cleaned = df_cleaned.rename(columns=short_names)
df = df_cleaned
df

In [None]:
# Integer-encode the categorical feature columns, one LabelEncoder per column.
# Each fitted encoder is kept in label_encoders, keyed by its column name.
categorical_columns = ['unlocode', 'placeType', 'vesselType']
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, le in label_encoders.items():
    df[column] = le.fit_transform(df[column])

# The target gets its own encoder, fitted on next_unlocode only; it is reused
# later to turn predicted class ids back into port codes.
le_target = LabelEncoder()
encoded_target = le_target.fit_transform(df['next_unlocode'])
df['next_unlocode_encoded'] = encoded_target

In [None]:
# Split the frame into model inputs and target.
# Inputs are every column except the two target columns and the
# place-name / IMO / country-name columns listed below.
non_feature_columns = [
    'next_unlocode',
    'next_unlocode_encoded',
    'place.placeName',
    'IMO',
    'countryName',
]
X = df.drop(columns=non_feature_columns)
y = df['next_unlocode_encoded']

In [None]:
# Turn the feature frame and target series into NumPy arrays:
# float32 for the features, int32 for the encoded class ids.
X = X.to_numpy().astype(np.float32)
y = y.to_numpy().astype(np.int32)

In [None]:
# One-hot encode the integer class ids, then show the shapes of both arrays.
y = to_categorical(y)
(X.shape, y.shape)

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# The model cell reads X_train.shape[1] and X_train.shape[2] as its input shape,
# so the 2-D feature arrays need a third axis. Insert a length-one axis at
# position 1: every sample becomes a sequence with a single timestep that holds
# all of its features.
X_train = X_train[:, np.newaxis]
X_test = X_test[:, np.newaxis]

In [None]:
# Define the model: one LSTM layer over (timesteps, features) inputs, followed
# by a softmax over every one-hot target class.
#
# No Masking layer is used. The sequences here are never padded (each sample is
# a single timestep), and 0 is a legitimate label-encoded feature value, so
# masking on 0 would skip the only timestep of any sample whose features all
# happen to encode to 0 and the LSTM would never see that sample's data.
model = Sequential([
    Input(shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(64, return_sequences=False),
    Dense(y.shape[1], activation='softmax'),
])

# Compile the model; categorical cross-entropy matches the one-hot targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model, reporting loss/accuracy on the held-out split after each epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test, y_test),
)

In [None]:
# Score the trained model on the held-out data and report accuracy as a percentage.
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

# Predicted class probabilities -> most likely class id per sample. The one-hot
# test labels are collapsed back to class ids the same way.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Map class ids back to the original port codes with the target encoder.
predicted_ports, true_ports = (
    le_target.inverse_transform(class_ids)
    for class_ids in (y_pred_classes, y_test_classes)
)

# Show the first ten predictions alongside the first ten true values.
print("Predicted ports:", predicted_ports[:10])
print("True ports:", true_ports[:10])