## All in One

In [2]:
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
from pathlib import Path
import pandas as pd

# Input selection: swap in the commented name to process the other day.
#filename = "aisdk-2023-11-08_1_knn"
filename = "aisdk-2023-11-09_1_knn"

# The file separates records with '$'; the values are regrouped into blocks of
# 100 rows x 6 values, one block per track.
data = (
    pd.read_csv(f"../workdir/AIS-KNN-Files/{filename}.csv", lineterminator='$')
    .values
    .reshape((-1, 100, 6))
)

# Turn the 3-D array into a long frame indexed by (track, timeindex), with the
# six values of each row spread over the columns.
multiindex = pd.MultiIndex.from_product(
    [range(extent) for extent in data.shape],
    names=["track", "timeindex", "column"],
)
data = pd.Series(data.flatten(), index=multiindex).unstack(level="column")
data.columns = ["mmsi", "time", "x", "y", "deg", "dist"]
data = data.drop(columns=["deg", "dist"])
data["time"] = pd.to_datetime(data["time"], unit="s")

# Split by vessel (mmsi) rather than by row: 70% of the distinct mmsi values go
# to the training set, the remaining ones to the test set.
train_ships = pd.Series(data.mmsi.unique()).sample(frac=0.7, random_state=42)
in_train = data.mmsi.isin(train_ships)
train = data[in_train].drop(columns="mmsi")
test = data[~in_train].drop(columns="mmsi")
del in_train

# Rebind the large intermediates so their memory can be reclaimed.
data, train_ships = [], []

def generate_row(df):
    """Flatten one window of rows into a single-row feature frame.

    The input is left untouched (a copy is modified). Within the copy:

    * ``time`` becomes the number of seconds between each row and the latest
      timestamp in the window (``max - time``), so it must be datetime-like.
    * ``x`` and ``y`` are expressed relative to the first row of the window.
    * Every row except the last is labelled ``input_0``, ``input_1``, ... in
      order; the last row is labelled ``output``.

    The labelled rows are then melted and pivoted into one row whose columns
    are named ``<label>_<original column>``, sorted alphabetically. The
    ``output_time`` column and every column whose name ends in ``index`` are
    removed.

    Returns a one-row ``DataFrame`` with a default integer index.
    """
    frame = df.copy()

    # Seconds from each row to the most recent timestamp in the window.
    latest = frame["time"].max()
    frame["time"] = (latest - frame["time"]).dt.total_seconds()

    # Coordinates relative to the first row of the window.
    for axis in ("x", "y"):
        frame[axis] = frame[axis] - frame[axis].iloc[0]

    # Positional labels: numbered inputs followed by the single output row.
    n_inputs = frame.shape[0] - 1
    frame["src"] = [f"input_{i}" for i in range(n_inputs)] + ["output"]

    # Long format (one line per label/column pair), then pivot to one wide row.
    long = frame.melt(id_vars="src")
    long["colname"] = long["src"] + "_" + long["variable"]
    wide = (
        long.drop(columns=["src", "variable"])
        .set_index("colname")
        .transpose()
        .reset_index(drop=True)
    )
    wide.columns.name = None

    wide = wide.sort_index(axis=1).drop(columns="output_time")
    keep = ~wide.columns.str.endswith("index")
    return wide.loc[:, keep]

def generate_model_frame(df, known):
    """Build a model frame with one row per selected rolling window of *df*.

    The index of *df* is moved into columns and the result is grouped by its
    ``track`` column. A rolling window of ``known + 1`` rows is iterated over
    the groups. A window is kept when it is complete (exactly ``known + 1``
    rows) and its position in the overall iteration is a multiple of
    ``skipped_windows + 1``. Each kept window is converted with
    ``generate_row`` in parallel on all available cores, and the resulting
    rows are concatenated with a fresh integer index.
    """
    window_size = known + 1
    skipped_windows = 4
    stride = skipped_windows + 1

    rolling = df.reset_index(drop=False).groupby('track').rolling(window_size)

    # Materialised as a list so the progress bar knows the total.
    selected = []
    for position, window in enumerate(rolling):
        if position % stride != 0:
            continue
        if len(window) != window_size:
            continue
        selected.append(window)

    jobs = (delayed(generate_row)(window) for window in tqdm(selected))
    rows = Parallel(n_jobs=-1)(jobs)
    return pd.concat(rows, ignore_index=True)

def _count_csv_rows(csv_path):
    """Return the number of data rows (header excluded) in the CSV at *csv_path*."""
    return len(pd.read_csv(csv_path, usecols=[0]))


# Every exported frame is down-sampled to the same number of rows. The caps
# are taken from the first frame generated in this run, or recovered from an
# output file written by an earlier run, so a resumed run (where the
# known == 12 files already exist) no longer fails on an undefined limit.
trainlimit = None
testlimit = None

for known in range(12, 2, -1):
    path = Path(f"../workdir/AIS-ModelFrames/{filename}_train_{known}_0.csv")
    test_path = Path(f"../workdir/AIS-ModelFrames/{filename}_test_{known}_0.csv")

    if path.exists():
        # Already exported (only the train file is checked, as before). Reuse
        # the row counts of the existing files as the caps if none are set.
        if trainlimit is None:
            trainlimit = _count_csv_rows(path)
            print(f"Trainlimit: {trainlimit}")
        if testlimit is None and test_path.exists():
            testlimit = _count_csv_rows(test_path)
            print(f"Testlimit: {testlimit}")
        continue

    train_model = generate_model_frame(train, known)
    test_model = generate_model_frame(test, known)

    if trainlimit is None:
        trainlimit = len(train_model)
        print(f"Trainlimit: {trainlimit}")
    if testlimit is None:
        testlimit = len(test_model)
        print(f"Testlimit: {testlimit}")

    # NOTE(review): sampling is unseeded, so the retained rows differ
    # between runs.
    print(train_model.shape)
    train_model = train_model.sample(n=trainlimit)
    print(train_model.shape)

    print(test_model.shape)
    test_model = test_model.sample(n=testlimit)
    print(test_model.shape)

    train_model.to_csv(path, index=False)
    test_model.to_csv(test_path, index=False)
    print(f"Done with {known}")

100%|██████████| 2838847/2838847 [31:04<00:00, 1522.76it/s] 
100%|██████████| 1264800/1264800 [13:49<00:00, 1524.34it/s]


Trainlimit: 2838847
Testlimit: 1264800
(2838847, 38)
(2838847, 38)
(1264800, 38)
(1264800, 38)
Done with 12


100%|██████████| 2838847/2838847 [31:01<00:00, 1524.69it/s] 
100%|██████████| 1264800/1264800 [13:41<00:00, 1540.29it/s]


(2838847, 35)
(2838847, 35)
(1264800, 35)
(1264800, 35)
Done with 11


100%|██████████| 3005838/3005838 [32:48<00:00, 1526.59it/s] 
100%|██████████| 1339200/1339200 [14:28<00:00, 1542.09it/s]


(3005838, 32)
(2838847, 32)
(1339200, 32)
(1264800, 32)
Done with 10


100%|██████████| 3005838/3005838 [32:51<00:00, 1524.40it/s] 
100%|██████████| 1339200/1339200 [14:26<00:00, 1546.04it/s]


(3005838, 29)
(2838847, 29)
(1339200, 29)
(1264800, 29)
Done with 9


100%|██████████| 3005838/3005838 [32:44<00:00, 1529.81it/s] 
100%|██████████| 1339200/1339200 [14:14<00:00, 1567.43it/s]


(3005838, 26)
(2838847, 26)
(1339200, 26)
(1264800, 26)
Done with 8


100%|██████████| 3005838/3005838 [32:27<00:00, 1543.11it/s] 
100%|██████████| 1339200/1339200 [14:11<00:00, 1573.32it/s]


(3005838, 23)
(2838847, 23)
(1339200, 23)
(1264800, 23)
Done with 7


100%|██████████| 3005838/3005838 [31:46<00:00, 1576.85it/s] 
100%|██████████| 1339200/1339200 [13:59<00:00, 1595.08it/s]


(3005838, 20)
(2838847, 20)
(1339200, 20)
(1264800, 20)
Done with 6


100%|██████████| 3172829/3172829 [33:47<00:00, 1565.18it/s] 
100%|██████████| 1413600/1413600 [14:46<00:00, 1595.31it/s]


(3172829, 17)
(2838847, 17)
(1413600, 17)
(1264800, 17)
Done with 5


100%|██████████| 3172829/3172829 [33:27<00:00, 1580.40it/s] 
100%|██████████| 1413600/1413600 [14:42<00:00, 1601.86it/s]


(3172829, 14)
(2838847, 14)
(1413600, 14)
(1264800, 14)
Done with 4


100%|██████████| 3172829/3172829 [33:23<00:00, 1583.34it/s] 
100%|██████████| 1413600/1413600 [14:45<00:00, 1596.53it/s]


(3172829, 11)
(2838847, 11)
(1413600, 11)
(1264800, 11)
Done with 3
