# 1. Preprocessing the datasets

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [68]:
ais = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/data/raw/ais_train.csv', sep='|')
vessel = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/data/raw/vessels.csv', sep='|')
ports = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/data/raw/ports.csv', sep='|')


#Drop columns we don't want to use
ais = ais.drop(['etaRaw','cog', 'rot'], axis=1)
vessel = vessel.drop(["DWT", "GT", "NT", "vesselType", "depth", "draft", "enginePower", "freshWater", "fuel", "homePort", "maxHeight", "maxSpeed", "maxWidth", "rampCapacity", "CEU", "shippingLineId"], axis=1)
ports = ports.drop(['name','portLocation','UN_LOCODE', 'countryName','ISO'], axis=1)


#Sort training data by vesselId and time
ais = ais.sort_values(by=['vesselId','time'])
vessel = vessel.sort_values(by='vesselId')
ports = ports.sort_values(by='portId')

#Print 
ais.head()

Unnamed: 0,time,sog,heading,navstat,latitude,longitude,vesselId,portId
131115,2024-01-12 14:07:47,17.1,316,0,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,61d376b393c6feb83e5eb50c
131279,2024-01-12 14:31:00,17.3,313,0,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546
131514,2024-01-12 14:57:23,16.9,312,0,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546
131696,2024-01-12 15:18:48,16.9,313,0,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546
131885,2024-01-12 15:39:47,16.3,313,0,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546


##### Transform time

In [69]:
ais['time'] = pd.to_datetime(ais['time'])

##### Time since last measure

In [70]:
ais['time_since__last_measure'] = ais.groupby('vesselId')['time'].diff().dt.total_seconds()
ais['time_since__last_measure'].fillna(0, inplace=True)

##### Are the vessel standing still?

In [71]:

# Mapping of NAVSTAT codes to statuses
navstat_stationary_codes = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]  # Example codes for stationary statuses

# Create a boolean column based on NAVSTAT
ais['is_not_in_use'] = ais['navstat'].isin(navstat_stationary_codes)


##### Vessel Size

In [72]:
vessel['vessel_size'] = vessel['breadth'] * vessel['length']
vessel = vessel.drop(['breadth', 'length'],axis=1)
vessel['vessel_size'].fillna(vessel['vessel_size'].mean(), inplace=True)

In [73]:
vessel.head()

Unnamed: 0,vesselId,yearBuilt,vessel_size
0,61e9f38eb937134a3c4bfd8b,2000,6368.0
1,61e9f38eb937134a3c4bfd8d,2006,5642.0
2,61e9f38eb937134a3c4bfd8f,2010,5642.0
3,61e9f38eb937134a3c4bfd91,2011,4676.0
4,61e9f390b937134a3c4bfd93,2018,7439.256


##### Change column name of port location

In [74]:
ports.rename(columns={'latitude': 'portLatitude', 'longitude': 'portLongitude'}, inplace=True)

## 1.1 Merge datasets to training data

In [75]:
train = pd.merge(ais, vessel, on='vesselId')
train.head()

Unnamed: 0,time,sog,heading,navstat,latitude,longitude,vesselId,portId,time_since__last_measure,is_not_in_use,yearBuilt,vessel_size
0,2024-01-12 14:07:47,17.1,316,0,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,61d376b393c6feb83e5eb50c,0.0,False,2000,6368.0
1,2024-01-12 14:31:00,17.3,313,0,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,1393.0,False,2000,6368.0
2,2024-01-12 14:57:23,16.9,312,0,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,1583.0,False,2000,6368.0
3,2024-01-12 15:18:48,16.9,313,0,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,1285.0,False,2000,6368.0
4,2024-01-12 15:39:47,16.3,313,0,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,1259.0,False,2000,6368.0


In [76]:
train = pd.merge(train, ports, on='portId')
train.head()

Unnamed: 0,time,sog,heading,navstat,latitude,longitude,vesselId,portId,time_since__last_measure,is_not_in_use,yearBuilt,vessel_size,portLongitude,portLatitude
0,2024-01-12 14:07:47,17.1,316,0,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,61d376b393c6feb83e5eb50c,0.0,False,2000,6368.0,80.341111,13.263333
1,2024-04-04 12:56:59,0.0,30,5,1.28263,103.75133,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1260.0,True,1999,5795.678,80.341111,13.263333
2,2024-04-04 13:17:59,0.0,30,5,1.28267,103.75127,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1260.0,True,1999,5795.678,80.341111,13.263333
3,2024-04-04 13:35:59,0.0,30,5,1.28263,103.75128,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1080.0,True,1999,5795.678,80.341111,13.263333
4,2024-04-04 13:56:59,0.1,30,5,1.28265,103.75132,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1260.0,True,1999,5795.678,80.341111,13.263333


In [77]:
import math

def calculate_bearing(lat1, lon1, lat2, lon2):
    """
    Calculates the bearing between two points.
    """
    lat1 = math.radians(lat1)
    lat2 = math.radians(lat2)
    delta_lon = math.radians(lon2 - lon1)

    x = math.sin(delta_lon) * math.cos(lat2)
    y = math.cos(lat1) * math.sin(lat2) - (math.sin(lat1)
        * math.cos(lat2) * math.cos(delta_lon))
    initial_bearing = math.atan2(x, y)

    # Convert from radians to degrees and normalize
    initial_bearing = math.degrees(initial_bearing)
    compass_bearing = (initial_bearing + 360) % 360
    return compass_bearing

# Calculate bearing to next port
train['bearing_to_next_port'] = train.apply(
    lambda row: calculate_bearing(row['latitude'], row['longitude'], row['portLatitude'], row['portLongitude']), axis=1)

In [78]:
train.head()

Unnamed: 0,time,sog,heading,navstat,latitude,longitude,vesselId,portId,time_since__last_measure,is_not_in_use,yearBuilt,vessel_size,portLongitude,portLatitude,bearing_to_next_port
0,2024-01-12 14:07:47,17.1,316,0,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,61d376b393c6feb83e5eb50c,0.0,False,2000,6368.0,80.341111,13.263333,24.982842
1,2024-04-04 12:56:59,0.0,30,5,1.28263,103.75133,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1260.0,True,1999,5795.678,80.341111,13.263333,298.432136
2,2024-04-04 13:17:59,0.0,30,5,1.28267,103.75127,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1260.0,True,1999,5795.678,80.341111,13.263333,298.432121
3,2024-04-04 13:35:59,0.0,30,5,1.28263,103.75128,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1080.0,True,1999,5795.678,80.341111,13.263333,298.432183
4,2024-04-04 13:56:59,0.1,30,5,1.28265,103.75132,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1260.0,True,1999,5795.678,80.341111,13.263333,298.432109


#### Encoding cathegorical features

In [79]:
from sklearn.preprocessing import LabelEncoder

train['is_not_in_use'] = train['is_not_in_use'].astype(str)

is_not_in_use_encoder = LabelEncoder()


train['is_not_in_use_encoded'] = is_not_in_use_encoder.fit_transform(train['is_not_in_use'])


train = train.drop(['is_not_in_use'], axis=1)

train.head()

Unnamed: 0,time,sog,heading,navstat,latitude,longitude,vesselId,portId,time_since__last_measure,yearBuilt,vessel_size,portLongitude,portLatitude,bearing_to_next_port,is_not_in_use_encoded
0,2024-01-12 14:07:47,17.1,316,0,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,61d376b393c6feb83e5eb50c,0.0,2000,6368.0,80.341111,13.263333,24.982842,0
1,2024-04-04 12:56:59,0.0,30,5,1.28263,103.75133,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1260.0,1999,5795.678,80.341111,13.263333,298.432136,1
2,2024-04-04 13:17:59,0.0,30,5,1.28267,103.75127,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1260.0,1999,5795.678,80.341111,13.263333,298.432121,1
3,2024-04-04 13:35:59,0.0,30,5,1.28263,103.75128,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1080.0,1999,5795.678,80.341111,13.263333,298.432183,1
4,2024-04-04 13:56:59,0.1,30,5,1.28265,103.75132,61e9f39cb937134a3c4bfdbf,61d376b393c6feb83e5eb50c,1260.0,1999,5795.678,80.341111,13.263333,298.432109,1


# 2. Creating training data

In [81]:
def create_training_data(df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to store data rows
    # Get unique vessel IDs from the DataFrame
    unique_vessels = df['vesselId'].unique()

    # Iterate over each unique vessel
    for vessel in tqdm(unique_vessels):
        # Filter data for the current vessel and sort by time
        vessel_data = df[df['vesselId'] == vessel].sort_values(by='time').reset_index(drop=True)
        total_rows = len(vessel_data)
        
        # Skip vessels with insufficient data
        if total_rows <= N_KEEP_PAST:
            continue

        # Process each record starting from N_KEEP_PAST
        for index in range(N_KEEP_PAST, total_rows):
            current_record = vessel_data.iloc[index]
            previous_records = vessel_data.iloc[index - N_KEEP_PAST:index]

            # Initialize a dictionary to hold the feature values and target
            feature_row = {
                'vesselId': vessel,
                'target_lat': current_record['latitude'],
                'target_lon': current_record['longitude'],
                'target_time': current_record['time'],
                'vessel_size': current_record['vessel_size'],
                'time_since__last_measure': current_record['time_since__last_measure'],
                'yearBuilt': current_record['yearBuilt'],
                'is_not_in_use': current_record['is_not_in_use_encoded']
                #'CEU': current_record['CEU']
            }

            # Loop through past records to collect features
            for j, past_record in previous_records.iterrows():
                time_difference = (current_record['time'] - past_record['time']).total_seconds() / 60.0
                feature_row[f'minutes_from_target_{j - (index - N_KEEP_PAST)}'] = time_difference
                feature_row[f'heading_{j - (index - N_KEEP_PAST)}'] = past_record['heading']
                feature_row[f'lat_{j - (index - N_KEEP_PAST)}'] = past_record['latitude']
                feature_row[f'lon_{j - (index - N_KEEP_PAST)}'] = past_record['longitude']
                feature_row[f'bearing_to_next_port_{j - (index - N_KEEP_PAST)}'] = past_record['bearing_to_next_port']


            # Append the feature row to the data list
            data_rows.append(feature_row)

    # Convert the list of feature rows to a DataFrame
    return pd.DataFrame(data_rows)


In [82]:
processed_train = create_training_data(train, 5)

100%|██████████| 688/688 [12:18<00:00,  1.07s/it]


#### Store training data as CSV

In [84]:
processed_train.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_train_mina.csv', index=False)

# 3. Preprocessing the test data

#### 3.1 Load test data

In [85]:
test_dataset = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/data/raw/ais_test.csv')

test_dataset['time'] = pd.to_datetime(test_dataset['time'])

test_dataset.head()

Unnamed: 0,ID,vesselId,time,scaling_factor
0,0,61e9f3aeb937134a3c4bfe3d,2024-05-08 00:03:16,0.3
1,1,61e9f473b937134a3c4c02df,2024-05-08 00:06:17,0.3
2,2,61e9f469b937134a3c4c029b,2024-05-08 00:10:02,0.3
3,3,61e9f45bb937134a3c4c0221,2024-05-08 00:10:34,0.3
4,4,61e9f38eb937134a3c4bfd8d,2024-05-08 00:12:27,0.3


#### Encode features

In [175]:
# from sklearn.preprocessing import LabelEncoder

# test_dataset['vesselId'] = test_dataset['vesselId'].astype(str)

# vesselId_encoder = LabelEncoder()

# test_dataset['vesselId_encoded'] = vesselId_encoder.fit_transform(test_dataset['vesselId'])

# test_dataset = test_dataset.drop(['vesselId'], axis=1)

# test_dataset.head()

Unnamed: 0,ID,time,scaling_factor,vesselId_encoded
0,0,2024-05-08 00:03:16,0.3,20
1,1,2024-05-08 00:06:17,0.3,185
2,2,2024-05-08 00:10:02,0.3,171
3,3,2024-05-08 00:10:34,0.3,138
4,4,2024-05-08 00:12:27,0.3,0


In [86]:

import pandas as pd
from tqdm import tqdm

def create_test_data(train_df: pd.DataFrame, test_df: pd.DataFrame, N_KEEP_PAST: int) -> pd.DataFrame:
    data_rows = []  # List to collect all the data rows
    unique_vessels = test_df['vesselId'].unique()  # Get unique vessel IDs

    # Iterate over each unique vessel
    for vessel in tqdm(unique_vessels):
        # Filter out the train data for this vessel that is less than the max time in test data
        vessel_train_data = train_df[train_df['vesselId'] == vessel].sort_values(by='time').reset_index(drop=True)
       
        # Get test data for the current vessel
        test_vessel_data = test_df[test_df['vesselId'] == vessel]
        
        for _, test_row in test_vessel_data.iterrows():
        
            target_time = test_row['time']
            #current_record = vessel_train_data.iloc[_]
            
            
            ID = test_row['ID']
            # Get the last N_KEEP_PAST records from the train data before the target time
            past_data = vessel_train_data[vessel_train_data['time'] < target_time].tail(N_KEEP_PAST)

            # Check if we have enough past data
            if len(past_data) < N_KEEP_PAST:
                continue  # Not enough past data; skip this test row

            # Prepare a dictionary to hold the features
            feature_row = {
                'vesselId': vessel, 
                'target_time': target_time, # Only include target time
            }

            # Loop through past records to collect features
            for j in range(N_KEEP_PAST):
                past_record = past_data.iloc[j]
                feature_row['yearBuilt'] = past_record['yearBuilt']
                feature_row['is_not_in_use'] = past_record['is_not_in_use_encoded']
                feature_row['vessel_size'] = past_record['vessel_size']
                feature_row['time_since__last_measure'] = past_record['time_since__last_measure']
                time_diff = (target_time - past_record['time']).total_seconds() / 60.0  # Difference in minutes

                # Add features for minutes from target and heading
                feature_row[f'minutes_from_target_{j}'] = time_diff
                feature_row[f'heading_{j}'] = past_record['heading']
                feature_row[f'lat_{j}'] = past_record['latitude']
                feature_row[f'lon_{j}'] = past_record['longitude']
                feature_row[f'bearing_to_next_port_{j}'] = past_record['bearing_to_next_port']
                

            feature_row['vesselId'] = vessel
            feature_row['ID'] = ID
            
            # Append the row to the list
            data_rows.append(feature_row)

    #Convert the list of feature rows to a DataFrame
    return pd.DataFrame(data_rows)



In [87]:
processed_test = create_test_data(train, test_dataset, 5)

100%|██████████| 215/215 [01:07<00:00,  3.21it/s]


In [88]:
print(processed_test.dtypes)

vesselId                            object
target_time                 datetime64[ns]
yearBuilt                            int64
is_not_in_use                        int64
vessel_size                        float64
time_since__last_measure           float64
minutes_from_target_0              float64
heading_0                            int64
lat_0                              float64
lon_0                              float64
bearing_to_next_port_0             float64
minutes_from_target_1              float64
heading_1                            int64
lat_1                              float64
lon_1                              float64
bearing_to_next_port_1             float64
minutes_from_target_2              float64
heading_2                            int64
lat_2                              float64
lon_2                              float64
bearing_to_next_port_2             float64
minutes_from_target_3              float64
heading_3                            int64
lat_3      

In [89]:
processed_test.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_test.csv', index=False)

# Code for making predictions

In [48]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import KFold
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import ElasticNet
# import xgboost as xgb
# import lightgbm as lgb
# from scipy.optimize import minimize
# from sklearn.preprocessing import LabelEncoder

# # Load the data
# test_final = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_test.csv')
# final_train = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_train_mina.csv')

# # Define features and targets
# features = final_train.drop(columns=['target_lat', 'target_lon', 'vesselId', 'target_time'])
# targets = final_train[['target_lat', 'target_lon']]

# feature_columns = features.columns
# print(feature_columns)
# X_test = test_final[feature_columns]

# # Define the models
# models = {
#     'RandomForest': MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, max_depth=4, n_jobs=8)),
#     'XGBoost': MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, seed=42)),
#     'ElasticNet': MultiOutputRegressor(ElasticNet(random_state=42)),
#     'LightGBM': MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1))
# }

# # Prepare arrays to hold OOF predictions and test predictions
# n_splits = 5
# kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# oof_preds = {model_name: np.zeros((features.shape[0], targets.shape[1])) for model_name in models}
# test_preds = {model_name: np.zeros((X_test.shape[0], targets.shape[1], n_splits)) for model_name in models}

# # Perform cross-validation and collect predictions
# for fold, (train_idx, valid_idx) in enumerate(kf.split(features, targets)):
#     X_train, y_train = features.iloc[train_idx], targets.iloc[train_idx]
#     X_valid, y_valid = features.iloc[valid_idx], targets.iloc[valid_idx]
    
#     for model_name, model in models.items():
#         print(f"Training and predicting with model: {model_name}")
#         clf = model
#         clf.fit(X_train, y_train)
#         y_pred_valid = clf.predict(X_valid)
#         y_pred_test = clf.predict(X_test)
      
        
#         # Save OOF predictions
#         oof_preds[model_name][valid_idx] = y_pred_valid
#         # Save test predictions
#         test_preds[model_name][:,:,fold] = y_pred_test

# # Define the loss function for optimization
# def mse_loss(weights):
#     weights = np.array(weights)
#     # Normalize weights to sum to 1
#     weights = weights / np.sum(weights)
#     # Combine OOF predictions using the weights
#     final_oof = np.zeros_like(targets.values)
#     for i, model_name in enumerate(models):
#         final_oof += weights[i] * oof_preds[model_name]
#     # Compute mean squared error
#     mse = mean_squared_error(targets.values, final_oof)
#     return mse

# # Optimization methods to try
# #methods = [
# #    'Nelder-Mead', 'Powell', 'trust-constr', 'CG', 'BFGS', 'Newton-CG',
# #    'L-BFGS-B', 'TNC', 'COBYLA', 'SLSQP', 'dogleg', 'trust-ncg',
# #    'trust-exact', 'trust-krylov'
# #]
# methods = [
#     'Nelder-Mead', 'Powell',  'CG', 'BFGS',
#     'L-BFGS-B', 'TNC', 'SLSQP', 
# ] # 'trust-constr' is just really slow and not performing well at the this moment

# # Initial weights
# initial_weights = np.ones(len(models)) / len(models)

# # Constraints and bounds
# constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
# bounds = [(0, 1)] * len(models)

# # Optimize weights using different methods
# best_mse = np.inf
# best_weights = None
# best_method = None

# for method in methods:
#     print(f"Optimizing weights using method: {method}")
#     try:
#         if method in ['trust-constr', 'COBYLA', 'SLSQP', 'trust-ncg', 'trust-krylov', 'trust-exact']:
#             res = minimize(mse_loss, initial_weights, method=method, bounds=bounds, constraints=constraints)
#         elif method in ['L-BFGS-B', 'TNC']:
#             res = minimize(mse_loss, initial_weights, method=method, bounds=bounds)
#         else:
#             # For unconstrained methods, weights will be normalized in mse_loss
#             res = minimize(mse_loss, initial_weights, method=method)
#         if res.fun < best_mse:
#             best_mse = res.fun
#             best_weights = res.x / np.sum(res.x)  # Normalize weights
#             best_method = method
#         print(f"Method: {method}, MSE: {res.fun}")
#     except Exception as e:
#         print(f"Method: {method}, failed with error: {e}")

# # Average test predictions over folds for each model
# for model_name in models:
#     test_preds[model_name] = np.mean(test_preds[model_name], axis=2)  # Average over folds

# # Combine the test predictions using the best weights
# final_test_pred = np.zeros((X_test.shape[0], targets.shape[1]))
# for i, model_name in enumerate(models):
#     final_test_pred += best_weights[i] * test_preds[model_name]

# # Create a DataFrame with IDs and predictions
# prediction_df = pd.DataFrame({
#     'ID': test_final['ID'],
#     'longitude_predicted': final_test_pred[:, 1],
#     'latitude_predicted': final_test_pred[:, 0]
# })

# # Save the predictions to a CSV file
# prediction_df.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/predictions9.csv', index=False)

# # Print the best method and weights
# print(f"Best optimization method: {best_method}")
# print("Best weights:")
# for i, model_name in enumerate(models):
#     print(f"{model_name}: {best_weights[i]:.4f}")


Index(['vessel_size', 'yearBuilt', 'is_not_in_use', 'minutes_from_target_0',
       'heading_0', 'lat_0', 'lon_0', 'bearing_to_next_port_0',
       'minutes_from_target_1', 'heading_1', 'lat_1', 'lon_1',
       'bearing_to_next_port_1', 'minutes_from_target_2', 'heading_2', 'lat_2',
       'lon_2', 'bearing_to_next_port_2', 'minutes_from_target_3', 'heading_3',
       'lat_3', 'lon_3', 'bearing_to_next_port_3', 'minutes_from_target_4',
       'heading_4', 'lat_4', 'lon_4', 'bearing_to_next_port_4'],
      dtype='object')
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training and predicting with model: ElasticNet
Training and predicting with model: LightGBM
Training and predicting with model: RandomForest
Training and predicting with model: XGBoost
Training a

In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
import xgboost as xgb
import lightgbm as lgb
from scipy.optimize import minimize
from sklearn.preprocessing import LabelEncoder


# Load the data
test_final = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_test.csv')
final_train = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/processed_train_mina.csv')

# Sort the training data by 'time_measured' to ensure chronological order
print(final_train.dtypes)
final_train = final_train.sort_values(by='time_since_last_measure').reset_index(drop=True)


# Define features and targets
features = final_train.drop(columns=['target_lat', 'target_lon', 'vesselId', 'time_since__last_measure', 'target_time'])
targets = final_train[['target_lat', 'target_lon']]

feature_columns = features.columns
print(feature_columns)
X_test = test_final[feature_columns]

# Define the models
models = {
    'RandomForest': MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, max_depth=4, n_jobs=8)),
    'XGBoost': MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, seed=42)),
    'ElasticNet': MultiOutputRegressor(ElasticNet(random_state=42)),
    'LightGBM': MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1))
}

# Prepare arrays to hold OOF predictions and test predictions
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

oof_preds = {model_name: np.zeros((features.shape[0], targets.shape[1])) for model_name in models}
test_preds = {model_name: np.zeros((X_test.shape[0], targets.shape[1], n_splits)) for model_name in models}

# Perform time series cross-validation and collect predictions
for fold, (train_idx, valid_idx) in enumerate(tscv.split(features, targets)):
    X_train, y_train = features.iloc[train_idx], targets.iloc[train_idx]
    X_valid, y_valid = features.iloc[valid_idx], targets.iloc[valid_idx]
    
    for model_name, model in models.items():
        print(f"Training and predicting with model: {model_name}")
        clf = model
        clf.fit(X_train, y_train)
        y_pred_valid = clf.predict(X_valid)
        y_pred_test = clf.predict(X_test)
      
        # Save OOF predictions
        oof_preds[model_name][valid_idx] = y_pred_valid
        # Save test predictions
        test_preds[model_name][:,:,fold] = y_pred_test

# Define the loss function for optimization
def mse_loss(weights):
    weights = np.array(weights)
    # Normalize weights to sum to 1
    weights = weights / np.sum(weights)
    # Combine OOF predictions using the weights
    final_oof = np.zeros_like(targets.values)
    for i, model_name in enumerate(models):
        final_oof += weights[i] * oof_preds[model_name]
    # Compute mean squared error
    mse = mean_squared_error(targets.values, final_oof)
    return mse

# Optimization methods to try
methods = [
    'Nelder-Mead', 'Powell',  'CG', 'BFGS',
    'L-BFGS-B', 'TNC', 'SLSQP', 
]

# Initial weights
initial_weights = np.ones(len(models)) / len(models)

# Constraints and bounds
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
bounds = [(0, 1)] * len(models)

# Optimize weights using different methods
best_mse = np.inf
best_weights = None
best_method = None

for method in methods:
    print(f"Optimizing weights using method: {method}")
    try:
        if method in ['SLSQP', 'TNC', 'L-BFGS-B']:
            res = minimize(mse_loss, initial_weights, method=method, bounds=bounds, constraints=constraints)
        else:
            # For unconstrained methods, weights will be normalized in mse_loss
            res = minimize(mse_loss, initial_weights, method=method)
        if res.fun < best_mse:
            best_mse = res.fun
            best_weights = res.x / np.sum(res.x)  # Normalize weights
            best_method = method
        print(f"Method: {method}, MSE: {res.fun}")
    except Exception as e:
        print(f"Method: {method}, failed with error: {e}")

# Average test predictions over folds for each model
for model_name in models:
    test_preds[model_name] = np.mean(test_preds[model_name], axis=2)  # Average over folds

# Combine the test predictions using the best weights
final_test_pred = np.zeros((X_test.shape[0], targets.shape[1]))
for i, model_name in enumerate(models):
    final_test_pred += best_weights[i] * test_preds[model_name]

# Create a DataFrame with IDs and predictions
prediction_df = pd.DataFrame({
    'ID': test_final['ID'],
    'longitude_predicted': final_test_pred[:, 1],
    'latitude_predicted': final_test_pred[:, 0]
})

# Save the predictions to a CSV file
prediction_df.to_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/predictions13.csv', index=False)

# Print the best method and weights
print(f"Best optimization method: {best_method}")
print("Best weights:")
for i, model_name in enumerate(models):
    print(f"{model_name}: {best_weights[i]:.4f}")


vesselId                     object
target_lat                  float64
target_lon                  float64
target_time                  object
vessel_size                 float64
time_since__last_measure    float64
yearBuilt                     int64
is_not_in_use                 int64
minutes_from_target_0       float64
heading_0                     int64
lat_0                       float64
lon_0                       float64
bearing_to_next_port_0      float64
minutes_from_target_1       float64
heading_1                     int64
lat_1                       float64
lon_1                       float64
bearing_to_next_port_1      float64
minutes_from_target_2       float64
heading_2                     int64
lat_2                       float64
lon_2                       float64
bearing_to_next_port_2      float64
minutes_from_target_3       float64
heading_3                     int64
lat_3                       float64
lon_3                       float64
bearing_to_next_port_3      

KeyError: 'time_since_last_measure'

In [269]:
print(features)

         vessel_size  yearBuilt  minutes_from_target_0  heading_0     lat_0  \
0             6368.0       2000             107.016667        316   7.50361   
1             6368.0       2000             103.983333        313   7.57302   
2             6368.0       2000              98.016667        312   7.65043   
3             6368.0       2000              96.600000        313   7.71275   
4             6368.0       2000              94.816667        313   7.77191   
...              ...        ...                    ...        ...       ...   
1518624       4966.0       2017              97.083333        289  59.46124   
1518625       4966.0       2017              97.783333        291  59.48803   
1518626       4966.0       2017              98.016667        294  59.51857   
1518627       4966.0       2017             103.333333        295  59.54180   
1518628       4966.0       2017             103.733333        298  59.57721   

            lon_0  minutes_from_target_1  heading_1

In [50]:
output_file = '/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/predictions10.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('/Users/minasjovik/Projects/machine-learning-project/A-laget-ML-bajas/solutions/Mina/predictions9.csv')

# Ensure the 'ID' column exists in the DataFrame
if 'ID' in df.columns:
    # Remove decimals in the 'ID' column by converting to integers
    # This will truncate the decimal part
    df['ID'] = df['ID'].astype(int)
    
    # Alternatively, if you want to round the 'ID' values
    # df['ID'] = df['ID'].round().astype(int)
else:
    print("The 'ID' column is not found in the CSV file.")
    # Optionally, you can exit the script or handle the error as needed
    # exit()

# Sort the DataFrame by the 'ID' column
df = df.sort_values(by='ID')

# Reset the index if desired
df = df.reset_index(drop=True)

# Save the modified DataFrame to a new CSV file
df.to_csv(output_file, index=False)