# Preprocessing the training dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
train_dataset = pd.read_csv('/Users/eivindmidtbo/Desktop/dev/Maskinlaering_i_praksis/A-laget-ML-bajas/data/ais_train.csv', sep='|')

#Drop columns we don't want to use
train_dataset = train_dataset.drop(['etaRaw','portId', 'cog', 'sog','rot','navstat'], axis=1)

#Transform time to datetime
train_dataset['time'] = pd.to_datetime(train_dataset['time'])

#Sort training data by vesselId and time
train_dataset = train_dataset.sort_values(by=['vesselId','time'])

#Print 
train_dataset.head()

Unnamed: 0,time,heading,latitude,longitude,vesselId
131115,2024-01-12 14:07:47,316,7.50361,77.5834,61e9f38eb937134a3c4bfd8b
131279,2024-01-12 14:31:00,313,7.57302,77.49505,61e9f38eb937134a3c4bfd8b
131514,2024-01-12 14:57:23,312,7.65043,77.39404,61e9f38eb937134a3c4bfd8b
131696,2024-01-12 15:18:48,313,7.71275,77.31394,61e9f38eb937134a3c4bfd8b
131885,2024-01-12 15:39:47,313,7.77191,77.23585,61e9f38eb937134a3c4bfd8b


In [3]:
def create_training_data(df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to store data rows
    # Get unique vessel IDs from the DataFrame
    unique_vessels = df['vesselId'].unique()

    # Iterate over each unique vessel
    for vessel in tqdm(unique_vessels):
        # Filter data for the current vessel and sort by time
        vessel_data = df[df['vesselId'] == vessel].sort_values(by='time').reset_index(drop=True)
        total_rows = len(vessel_data)
        
        # Skip vessels with insufficient data
        if total_rows <= N_KEEP_PAST:
            continue

        # Process each record starting from N_KEEP_PAST
        for index in range(N_KEEP_PAST, total_rows):
            current_record = vessel_data.iloc[index]
            previous_records = vessel_data.iloc[index - N_KEEP_PAST:index]

            # Initialize a dictionary to hold the feature values and target
            feature_row = {
                'vesselId': vessel,
                'target_lat': current_record['latitude'],
                'target_lon': current_record['longitude'],
                'target_time': current_record['time']
            }

            # Loop through past records to collect features
            for j, past_record in previous_records.iterrows():
                time_difference = (current_record['time'] - past_record['time']).total_seconds() / 60.0
                feature_row[f'minutes_from_target_{j - (index - N_KEEP_PAST)}'] = time_difference
                feature_row[f'heading_{j - (index - N_KEEP_PAST)}'] = past_record['heading']
                feature_row[f'lat_{j - (index - N_KEEP_PAST)}'] = past_record['latitude']
                feature_row[f'lon_{j - (index - N_KEEP_PAST)}'] = past_record['longitude']


            # Append the feature row to the data list
            data_rows.append(feature_row)

    # Convert the list of feature rows to a DataFrame
    return pd.DataFrame(data_rows)


In [4]:
processed_train = create_training_data(train_dataset, 5)

100%|██████████| 688/688 [23:18<00:00,  2.03s/it]


In [5]:
processed_train.to_csv('/Users/eivindmidtbo/Desktop/dev/Maskinlaering_i_praksis/A-laget-ML-bajas/solutions/eivind_models/data/processed_train.csv', index=False)

In [6]:
def sort_training_data(train_df: pd.DataFrame) -> pd.DataFrame:
    # Ensure that the 'time' column is of datetime type for correct sorting
    if not pd.api.types.is_datetime64_any_dtype(train_df['time']):
        train_df['time'] = pd.to_datetime(train_df['time'])
    
    # Sort the DataFrame by 'vesselId' and 'time' in ascending order
    sorted_df = train_df.sort_values(by=['vesselId', 'time'], ascending=True).reset_index(drop=True)
    
    return sorted_df

In [7]:
sorted_train = sort_training_data(train_dataset)

# Preprocessing the test dataset

In [8]:
test_dataset = pd.read_csv('/Users/eivindmidtbo/Desktop/dev/Maskinlaering_i_praksis/A-laget-ML-bajas/data/ais_test.csv')

test_dataset['time'] = pd.to_datetime(test_dataset['time'])

test_dataset.head()

Unnamed: 0,ID,vesselId,time,scaling_factor
0,0,61e9f3aeb937134a3c4bfe3d,2024-05-08 00:03:16,0.3
1,1,61e9f473b937134a3c4c02df,2024-05-08 00:06:17,0.3
2,2,61e9f469b937134a3c4c029b,2024-05-08 00:10:02,0.3
3,3,61e9f45bb937134a3c4c0221,2024-05-08 00:10:34,0.3
4,4,61e9f38eb937134a3c4bfd8d,2024-05-08 00:12:27,0.3


In [9]:
import pandas as pd
from tqdm import tqdm

def create_test_data(train_df: pd.DataFrame, test_df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to collect all the data rows
    unique_vessels = test_df['vesselId'].unique()  # Get unique vessel IDs

    # Iterate over each unique vessel
    for vessel in tqdm(unique_vessels):
        # Filter out the train data for this vessel that is less than the max time in test data
        vessel_train_data = train_df[train_df['vesselId'] == vessel]
        # Sort the train data by time, if not already sorted
        vessel_train_data = vessel_train_data.sort_values(by='time').reset_index(drop=True)

        # Get test data for the current vessel
        test_vessel_data = test_df[test_df['vesselId'] == vessel]

        for _, test_row in test_vessel_data.iterrows():
            target_time = test_row['time']
            ID = test_row['ID']
            # Get the last N_KEEP_PAST records from the train data before the target time
            past_data = vessel_train_data[vessel_train_data['time'] < target_time].tail(N_KEEP_PAST)

            # Check if we have enough past data
            if len(past_data) < N_KEEP_PAST:
                continue  # Not enough past data; skip this test row

            # Prepare a dictionary to hold the features
            feature_row = {
                'vesselId': vessel,
                'target_time': target_time  # Only include target time
            }

            # Loop through past records to collect features
            for j in range(N_KEEP_PAST):
                past_record = past_data.iloc[j]
                time_diff = (target_time - past_record['time']).total_seconds() / 60.0  # Difference in minutes

                # Add features for minutes from target and heading
                feature_row[f'minutes_from_target_{j}'] = time_diff
                feature_row[f'heading_{j}'] = past_record['heading']
                feature_row[f'lat_{j}'] = past_record['latitude']
                feature_row[f'lon_{j}'] = past_record['longitude']
                

            feature_row['vesselId'] = vessel
            feature_row['ID'] = ID
            # Append the row to the list
            data_rows.append(feature_row)

    # Convert the list of feature rows to a DataFrame
    return pd.DataFrame(data_rows)



In [10]:
processed_test = create_test_data(sorted_train, test_dataset, 5)

100%|██████████| 215/215 [01:20<00:00,  2.68it/s]


In [11]:
processed_test.to_csv('/Users/eivindmidtbo/Desktop/dev/Maskinlaering_i_praksis/A-laget-ML-bajas/solutions/eivind_models/data/processed_test.csv', index=False)

# Code for making predictions

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize  

# Load datasets into pandas DataFrames
test_df = pd.read_csv('/Users/eivindmidtbo/Desktop/dev/Maskinlaering_i_praksis/A-laget-ML-bajas/solutions/eivind_models/data/processed_test.csv')
train_df = pd.read_csv('/Users/eivindmidtbo/Desktop/dev/Maskinlaering_i_praksis/A-laget-ML-bajas/solutions/eivind_models/data/processed_train.csv')

# Randomly sample 10% of the training data for quick testing
train_df_sample = train_df.sample(frac=0.1, random_state=42)

# Separate features and targets from training data
X = train_df_sample.drop(columns=['target_lat', 'target_lon', 'vesselId', 'target_time'])
y = train_df_sample[['target_lat', 'target_lon']]
X_test = test_df[X.columns]

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# Initialize LinearSVR model wrapped in MultiOutputRegressor
linear_svr = LinearSVR(C=1.0, epsilon=0.1, max_iter=10000, random_state=42)  # Increase max_iter
multi_output_model = MultiOutputRegressor(linear_svr, n_jobs=-1)  # Use all cores for fitting

# Set up K-Fold cross-validation
n_folds = 3  # Reduce folds for quicker validation
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize arrays for predictions
oof_preds = np.zeros((X.shape[0], y.shape[1]))
test_preds = np.zeros((X_test.shape[0], y.shape[1], n_folds))

# Perform training and predictions using K-Fold
for fold_index, (train_indices, valid_indices) in enumerate(kf.split(X, y)):
    X_train, y_train = X[train_indices], y.iloc[train_indices]
    X_val, y_val = X[valid_indices], y.iloc[valid_indices]

    print(f"Training fold {fold_index + 1}/{n_folds} using LinearSVR...")
    multi_output_model.fit(X_train, y_train)

    # Make predictions
    val_preds = multi_output_model.predict(X_val)
    test_fold_preds = multi_output_model.predict(X_test)

    # Save the predictions
    oof_preds[valid_indices] = val_preds
    test_preds[:, :, fold_index] = test_fold_preds

# Define the function to compute mean squared error for weight optimization
def compute_mse(weights):
    normalized_weights = np.array(weights) / np.sum(weights)  # Normalize weights
    combined_preds = normalized_weights[0] * oof_preds  # Since there's only one model
    mse = mean_squared_error(y, combined_preds)
    return mse

# Initial weight for optimization
initial_weight = [1.0]  # Single model
bounds = [(0, 1)]  # Weights should be between 0 and 1

# Optimize the weights to minimize the MSE
optimal_mse = float('inf')
optimal_weights = None

print("Starting weight optimization...")
try:
    optimization_result = minimize(compute_mse, initial_weight, method='Nelder-Mead', bounds=bounds)
    optimal_mse = optimization_result.fun
    optimal_weights = optimization_result.x / np.sum(optimization_result.x)  # Normalize weights
except Exception as e:
    print(f"Error during optimization: {e}")

# Average the test predictions across folds
mean_test_predictions = np.mean(test_preds, axis=2)

# Finalize predictions using optimized weights
final_predictions = optimal_weights[0] * mean_test_predictions  # Since there's only one model

# Create a DataFrame for output
results_df = pd.DataFrame({
    'ID': test_df['ID'],
    'longitude_predicted': final_predictions[:, 1],
    'latitude_predicted': final_predictions[:, 0]
})

# Save the predictions to a CSV file
results_df.to_csv('/Users/eivindmidtbo/Desktop/dev/Maskinlaering_i_praksis/A-laget-ML-bajas/solutions/eivind_models/data/predictions_model6_linear_svr.csv', index=False)

# Print optimized results
print(f"Optimized Mean Squared Error: {optimal_mse}")
print(f"Final Prediction Weight: {optimal_weights[0]:.4f}")


Training fold 1/3 using LinearSVR...
Training fold 2/3 using LinearSVR...
Training fold 3/3 using LinearSVR...




Starting weight optimization...
Optimized Mean Squared Error: 15.554004819559704
Final Prediction Weight: 1.0000
