# 1. Preprocessing the datasets

In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [109]:
ais = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/data/raw/ais_train.csv', sep='|')


#Drop columns we don't want to use
ais = ais.drop(['etaRaw','cog', 'rot', 'sog', 'portId', 'heading', 'navstat'], axis=1)



#Sort training data by vesselId and time
ais = ais.sort_values(by=['vesselId','time'])

#Print 
ais.head()

Unnamed: 0,time,latitude,longitude,vesselId
131115,2024-01-12 14:07:47,7.50361,77.5834,61e9f38eb937134a3c4bfd8b
131279,2024-01-12 14:31:00,7.57302,77.49505,61e9f38eb937134a3c4bfd8b
131514,2024-01-12 14:57:23,7.65043,77.39404,61e9f38eb937134a3c4bfd8b
131696,2024-01-12 15:18:48,7.71275,77.31394,61e9f38eb937134a3c4bfd8b
131885,2024-01-12 15:39:47,7.77191,77.23585,61e9f38eb937134a3c4bfd8b


##### Transform time

In [110]:
ais['time'] = pd.to_datetime(ais['time'])

##### How long have the vessel been measured?

In [114]:
ais['time_measured'] = ais.groupby('vesselId')['time'].diff().dt.total_seconds()

# 2. Creating training data

In [122]:
def create_training_data(df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to store data rows
    # Get unique vessel IDs from the DataFrame
    unique_vessels = df['vesselId'].unique()

    # Iterate over each unique vessel
    for vessel in tqdm(unique_vessels):
        # Filter data for the current vessel and sort by time
        vessel_data = df[df['vesselId'] == vessel].sort_values(by='time').reset_index(drop=True)
        total_rows = len(vessel_data)
        
        # Skip vessels with insufficient data
        if total_rows <= N_KEEP_PAST:
            continue

        # Process each record starting from N_KEEP_PAST
        for index in range(N_KEEP_PAST, total_rows):
            current_record = vessel_data.iloc[index]
            previous_records = vessel_data.iloc[index - N_KEEP_PAST:index]

            # Initialize a dictionary to hold the feature values and target
            feature_row = {
                'vesselId': vessel,
                'target_lat': current_record['latitude'],
                'target_lon': current_record['longitude'],
                'target_time': current_record['time'],
                'time_measured': current_record['time_measured']
             
            }

            # Loop through past records to collect features
            for j, past_record in previous_records.iterrows():
                feature_row[f'lat_{j - (index - N_KEEP_PAST)}'] = past_record['latitude']
                feature_row[f'lon_{j - (index - N_KEEP_PAST)}'] = past_record['longitude']

            # Append the feature row to the data list
            data_rows.append(feature_row)

    # Convert the list of feature rows to a DataFrame
    return pd.DataFrame(data_rows)


In [123]:
processed_train = create_training_data(ais, 2)

100%|██████████| 688/688 [06:30<00:00,  1.76it/s]


#### Store training data as CSV

In [124]:
processed_train.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_train_mina.csv', index=False)

# 3. Preprocessing the test data

#### 3.1 Load test data

In [125]:
test_dataset = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/data/raw/ais_test.csv')

test_dataset['time'] = pd.to_datetime(test_dataset['time'])

test_dataset.head()

Unnamed: 0,ID,vesselId,time,scaling_factor
0,0,61e9f3aeb937134a3c4bfe3d,2024-05-08 00:03:16,0.3
1,1,61e9f473b937134a3c4c02df,2024-05-08 00:06:17,0.3
2,2,61e9f469b937134a3c4c029b,2024-05-08 00:10:02,0.3
3,3,61e9f45bb937134a3c4c0221,2024-05-08 00:10:34,0.3
4,4,61e9f38eb937134a3c4bfd8d,2024-05-08 00:12:27,0.3


In [126]:
import pandas as pd
from tqdm import tqdm

def create_test_data(train_df: pd.DataFrame, test_df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to collect all the data rows
    unique_vessels = test_df['vesselId'].unique()  # Get unique vessel IDs

    # Iterate over each unique vessel
    for vessel in tqdm(unique_vessels):
        # Filter out the train data for this vessel that is less than the max time in test data
        vessel_train_data = train_df[train_df['vesselId'] == vessel].sort_values(by='time').reset_index(drop=True)
       
        # Get test data for the current vessel
        test_vessel_data = test_df[test_df['vesselId'] == vessel]
        
        for _, test_row in test_vessel_data.iterrows():
        
            target_time = test_row['time']
            #current_record = vessel_train_data.iloc[_]
            
            
            ID = test_row['ID']
            # Get the last N_KEEP_PAST records from the train data before the target time
            past_data = vessel_train_data[vessel_train_data['time'] < target_time].tail(N_KEEP_PAST)

            # Check if we have enough past data
            if len(past_data) < N_KEEP_PAST:
                continue  # Not enough past data; skip this test row

            # Prepare a dictionary to hold the features
            feature_row = {
                'vesselId': vessel, 
                'target_time': target_time, # Only include target time
            }

            # Loop through past records to collect features
            for j in range(N_KEEP_PAST):
                past_record = past_data.iloc[j]
                feature_row[f'time_measured']= past_record['time_measured']
                feature_row[f'lat_{j}'] = past_record['latitude']
                feature_row[f'lon_{j}'] = past_record['longitude']
                

            feature_row['vesselId'] = vessel
            feature_row['ID'] = ID
            
            # Append the row to the list
            data_rows.append(feature_row)

    #Convert the list of feature rows to a DataFrame
    return pd.DataFrame(data_rows)



In [127]:
processed_test = create_test_data(ais, test_dataset, 2)

100%|██████████| 215/215 [00:44<00:00,  4.80it/s]


In [128]:
print(processed_test.dtypes)

vesselId                 object
target_time      datetime64[ns]
time_measured           float64
lat_0                   float64
lon_0                   float64
lat_1                   float64
lon_1                   float64
ID                        int64
dtype: object


In [129]:
processed_test.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_test.csv', index=False)

# Code for making predictions

In [61]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import KFold
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import ElasticNet
# import xgboost as xgb
# import lightgbm as lgb
# from scipy.optimize import minimize
# from sklearn.preprocessing import LabelEncoder

# # Load the data
# test_final = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_test.csv')
# final_train = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_train_mina.csv')

# # Define features and targets
# features = final_train.drop(columns=['target_lat', 'target_lon', 'time_measured'])
# targets = final_train[['target_lat', 'target_lon']]

# feature_columns = features.columns
# print(feature_columns)
# X_test = test_final[feature_columns]

# # Define the models
# models = {
#     'RandomForest': MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, max_depth=4, n_jobs=8)),
#     'XGBoost': MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, seed=42)),
#     'ElasticNet': MultiOutputRegressor(ElasticNet(random_state=42)),
#     'LightGBM': MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1))
# }

# # Prepare arrays to hold OOF predictions and test predictions
# n_splits = 5
# kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# oof_preds = {model_name: np.zeros((features.shape[0], targets.shape[1])) for model_name in models}
# test_preds = {model_name: np.zeros((X_test.shape[0], targets.shape[1], n_splits)) for model_name in models}

# # Perform cross-validation and collect predictions
# for fold, (train_idx, valid_idx) in enumerate(kf.split(features, targets)):
#     X_train, y_train = features.iloc[train_idx], targets.iloc[train_idx]
#     X_valid, y_valid = features.iloc[valid_idx], targets.iloc[valid_idx]
    
#     for model_name, model in models.items():
#         print(f"Training and predicting with model: {model_name}")
#         clf = model
#         clf.fit(X_train, y_train)
#         y_pred_valid = clf.predict(X_valid)
#         y_pred_test = clf.predict(X_test)
      
        
#         # Save OOF predictions
#         oof_preds[model_name][valid_idx] = y_pred_valid
#         # Save test predictions
#         test_preds[model_name][:,:,fold] = y_pred_test

# # Define the loss function for optimization
# def mse_loss(weights):
#     weights = np.array(weights)
#     # Normalize weights to sum to 1
#     weights = weights / np.sum(weights)
#     # Combine OOF predictions using the weights
#     final_oof = np.zeros_like(targets.values)
#     for i, model_name in enumerate(models):
#         final_oof += weights[i] * oof_preds[model_name]
#     # Compute mean squared error
#     mse = mean_squared_error(targets.values, final_oof)
#     return mse

# # Optimization methods to try
# #methods = [
# #    'Nelder-Mead', 'Powell', 'trust-constr', 'CG', 'BFGS', 'Newton-CG',
# #    'L-BFGS-B', 'TNC', 'COBYLA', 'SLSQP', 'dogleg', 'trust-ncg',
# #    'trust-exact', 'trust-krylov'
# #]
# methods = [
#     'Nelder-Mead', 'Powell',  'CG', 'BFGS',
#     'L-BFGS-B', 'TNC', 'SLSQP', 
# ] # 'trust-constr' is just really slow and not performing well at the this moment

# # Initial weights
# initial_weights = np.ones(len(models)) / len(models)

# # Constraints and bounds
# constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
# bounds = [(0, 1)] * len(models)

# # Optimize weights using different methods
# best_mse = np.inf
# best_weights = None
# best_method = None

# for method in methods:
#     print(f"Optimizing weights using method: {method}")
#     try:
#         if method in ['trust-constr', 'COBYLA', 'SLSQP', 'trust-ncg', 'trust-krylov', 'trust-exact']:
#             res = minimize(mse_loss, initial_weights, method=method, bounds=bounds, constraints=constraints)
#         elif method in ['L-BFGS-B', 'TNC']:
#             res = minimize(mse_loss, initial_weights, method=method, bounds=bounds)
#         else:
#             # For unconstrained methods, weights will be normalized in mse_loss
#             res = minimize(mse_loss, initial_weights, method=method)
#         if res.fun < best_mse:
#             best_mse = res.fun
#             best_weights = res.x / np.sum(res.x)  # Normalize weights
#             best_method = method
#         print(f"Method: {method}, MSE: {res.fun}")
#     except Exception as e:
#         print(f"Method: {method}, failed with error: {e}")

# # Average test predictions over folds for each model
# for model_name in models:
#     test_preds[model_name] = np.mean(test_preds[model_name], axis=2)  # Average over folds

# # Combine the test predictions using the best weights
# final_test_pred = np.zeros((X_test.shape[0], targets.shape[1]))
# for i, model_name in enumerate(models):
#     final_test_pred += best_weights[i] * test_preds[model_name]

# # Create a DataFrame with IDs and predictions
# prediction_df = pd.DataFrame({
#     'ID': test_final['ID'],
#     'longitude_predicted': final_test_pred[:, 1],
#     'latitude_predicted': final_test_pred[:, 0]
# })

# # Save the predictions to a CSV file
# prediction_df.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/predictions11.csv', index=False)

# # Print the best method and weights
# print(f"Best optimization method: {best_method}")
# print("Best weights:")
# for i, model_name in enumerate(models):
#     print(f"{model_name}: {best_weights[i]:.4f}")


Index(['heading_0', 'lat_0', 'lon_0', 'heading_1', 'lat_1', 'lon_1',
       'heading_2', 'lat_2', 'lon_2', 'heading_3', 'lat_3', 'lon_3',
       'heading_4', 'lat_4', 'lon_4'],
      dtype='object')
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM
Training and predicting with model: RandomForest
Training and

In [130]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
import xgboost as xgb
import lightgbm as lgb
from scipy.optimize import minimize
from sklearn.preprocessing import LabelEncoder


# Load the data
test_final = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_test.csv')
final_train = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_train_mina.csv')

# Sort the training data by 'time_measured' to ensure chronological order
final_train = final_train.sort_values(by='time_measured').reset_index(drop=True)
print(final_train.dtypes)

# Define features and targets
features = final_train.drop(columns=['target_lat', 'target_lon', 'vesselId', 'time_measured', 'target_time'])
targets = final_train[['target_lat', 'target_lon']]

feature_columns = features.columns
print(feature_columns)
X_test = test_final[feature_columns]

# Define the models
models = {
    'RandomForest': MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, max_depth=4, n_jobs=8)),
    'XGBoost': MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, seed=42)),
    'ElasticNet': MultiOutputRegressor(ElasticNet(random_state=42)),
    'LightGBM': MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1))
}

# Prepare arrays to hold OOF predictions and test predictions
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

oof_preds = {model_name: np.zeros((features.shape[0], targets.shape[1])) for model_name in models}
test_preds = {model_name: np.zeros((X_test.shape[0], targets.shape[1], n_splits)) for model_name in models}

# Perform time series cross-validation and collect predictions
for fold, (train_idx, valid_idx) in enumerate(tscv.split(features, targets)):
    X_train, y_train = features.iloc[train_idx], targets.iloc[train_idx]
    X_valid, y_valid = features.iloc[valid_idx], targets.iloc[valid_idx]
    
    for model_name, model in models.items():
        print(f"Training and predicting with model: {model_name}")
        clf = model
        clf.fit(X_train, y_train)
        y_pred_valid = clf.predict(X_valid)
        y_pred_test = clf.predict(X_test)
      
        # Save OOF predictions
        oof_preds[model_name][valid_idx] = y_pred_valid
        # Save test predictions
        test_preds[model_name][:,:,fold] = y_pred_test

# Define the loss function for optimization
def mse_loss(weights):
    weights = np.array(weights)
    # Normalize weights to sum to 1
    weights = weights / np.sum(weights)
    # Combine OOF predictions using the weights
    final_oof = np.zeros_like(targets.values)
    for i, model_name in enumerate(models):
        final_oof += weights[i] * oof_preds[model_name]
    # Compute mean squared error
    mse = mean_squared_error(targets.values, final_oof)
    return mse

# Optimization methods to try
methods = [
    'Nelder-Mead', 'Powell',  'CG', 'BFGS',
    'L-BFGS-B', 'TNC', 'SLSQP', 
]

# Initial weights
initial_weights = np.ones(len(models)) / len(models)

# Constraints and bounds
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
bounds = [(0, 1)] * len(models)

# Optimize weights using different methods
best_mse = np.inf
best_weights = None
best_method = None

for method in methods:
    print(f"Optimizing weights using method: {method}")
    try:
        if method in ['SLSQP', 'TNC', 'L-BFGS-B']:
            res = minimize(mse_loss, initial_weights, method=method, bounds=bounds, constraints=constraints)
        else:
            # For unconstrained methods, weights will be normalized in mse_loss
            res = minimize(mse_loss, initial_weights, method=method)
        if res.fun < best_mse:
            best_mse = res.fun
            best_weights = res.x / np.sum(res.x)  # Normalize weights
            best_method = method
        print(f"Method: {method}, MSE: {res.fun}")
    except Exception as e:
        print(f"Method: {method}, failed with error: {e}")

# Average test predictions over folds for each model
for model_name in models:
    test_preds[model_name] = np.mean(test_preds[model_name], axis=2)  # Average over folds

# Combine the test predictions using the best weights
final_test_pred = np.zeros((X_test.shape[0], targets.shape[1]))
for i, model_name in enumerate(models):
    final_test_pred += best_weights[i] * test_preds[model_name]

# Create a DataFrame with IDs and predictions
prediction_df = pd.DataFrame({
    'ID': test_final['ID'],
    'longitude_predicted': final_test_pred[:, 1],
    'latitude_predicted': final_test_pred[:, 0]
})

# Save the predictions to a CSV file
prediction_df.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/predictions12.csv', index=False)

# Print the best method and weights
print(f"Best optimization method: {best_method}")
print("Best weights:")
for i, model_name in enumerate(models):
    print(f"{model_name}: {best_weights[i]:.4f}")


vesselId          object
target_lat       float64
target_lon       float64
target_time       object
time_measured    float64
lat_0            float64
lon_0            float64
lat_1            float64
lon_1            float64
dtype: object
Index(['lat_0', 'lon_0', 'lat_1', 'lon_1'], dtype='object')
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Traini

  warn('Method %s cannot handle constraints.' % method,


Method: L-BFGS-B, MSE: 597.1793760909342
Optimizing weights using method: TNC


  warn('Method %s cannot handle constraints.' % method,


Method: TNC, MSE: 597.1794024941654
Optimizing weights using method: SLSQP
Method: SLSQP, MSE: 597.1793761708374
Best optimization method: CG
Best weights:
RandomForest: 0.0127
XGBoost: 0.2367
ElasticNet: 0.4497
LightGBM: 0.3010


In [269]:
print(features)

         vessel_size  yearBuilt  minutes_from_target_0  heading_0     lat_0  \
0             6368.0       2000             107.016667        316   7.50361   
1             6368.0       2000             103.983333        313   7.57302   
2             6368.0       2000              98.016667        312   7.65043   
3             6368.0       2000              96.600000        313   7.71275   
4             6368.0       2000              94.816667        313   7.77191   
...              ...        ...                    ...        ...       ...   
1518624       4966.0       2017              97.083333        289  59.46124   
1518625       4966.0       2017              97.783333        291  59.48803   
1518626       4966.0       2017              98.016667        294  59.51857   
1518627       4966.0       2017             103.333333        295  59.54180   
1518628       4966.0       2017             103.733333        298  59.57721   

            lon_0  minutes_from_target_1  heading_1

In [131]:
output_file = '/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/predictions13.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/predictions12.csv')

# Ensure the 'ID' column exists in the DataFrame
if 'ID' in df.columns:
    # Remove decimals in the 'ID' column by converting to integers
    # This will truncate the decimal part
    df['ID'] = df['ID'].astype(int)
    
    # Alternatively, if you want to round the 'ID' values
    # df['ID'] = df['ID'].round().astype(int)
else:
    print("The 'ID' column is not found in the CSV file.")
    # Optionally, you can exit the script or handle the error as needed
    # exit()

# Sort the DataFrame by the 'ID' column
df = df.sort_values(by='ID')

# Reset the index if desired
df = df.reset_index(drop=True)

# Save the modified DataFrame to a new CSV file
df.to_csv(output_file, index=False)