## All in One

In [1]:
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
from pathlib import Path
import pandas as pd

# Base name (without extension) of the AIS export processed by this notebook.
filename = "aisdk-2023-11-08_1_kkn"
source_csv = f"../workdir/AIS-KNN-Files/{filename}.csv"

# Records in the file are terminated by '$'. The parsed values are regrouped
# into tracks of 100 points with 6 fields per point.
data = pd.read_csv(source_csv, lineterminator='$').values
data = data.reshape((-1, 100, 6))

# Label every cell of the 3-D array, then pivot the innermost axis into
# columns so each row is one point identified by (track, timeindex).
multiindex = pd.MultiIndex.from_product(
    [range(size) for size in data.shape],
    names=["track", "timeindex", "column"],
)
data = pd.Series(data.flatten(), index=multiindex).unstack(level="column")
data.columns = ["mmsi", "time", "x", "y", "deg", "dist"]
data = data.drop(columns=["deg", "dist"])
data["time"] = pd.to_datetime(data["time"], unit="s")

# Split by ship rather than by row: 70% of the distinct mmsi values go to the
# training set, so all points of one ship end up on the same side.
train_ships = pd.Series(data["mmsi"].unique()).sample(frac=0.7, random_state=42)
in_train = data["mmsi"].isin(train_ships)
train = data.loc[in_train].drop(columns="mmsi")
test = data.loc[~in_train].drop(columns="mmsi")

# Rebind the large intermediates so their memory can be reclaimed.
data = []
train_ships = []

def generate_row(df):
    """Flatten one window of consecutive points into a single-row frame.

    The last row of *df* is labelled "output"; every earlier row is labelled
    "input_<position>" with positions counted from 0. Before flattening,
    ``time`` is replaced by the number of seconds separating each point from
    the latest timestamp in the window, and ``x`` / ``y`` are expressed
    relative to the first point of the window.

    Args:
        df: Frame with at least the columns ``time`` (datetime), ``x`` and
            ``y``. It is not modified.

    Returns:
        A one-row DataFrame whose columns are named "<label>_<field>", sorted
        by name, without "output_time" and without any column whose name ends
        in "index".
    """
    window = df.copy()  # work on a copy so the caller's frame stays untouched

    latest = window["time"].max()
    window["time"] = (latest - window["time"]).dt.total_seconds()
    for axis in ("x", "y"):
        window[axis] = window[axis] - window[axis].iloc[0]

    # One label per row: the known points first, the target last.
    labels = [f"input_{position}" for position in range(len(window) - 1)]
    labels.append("output")
    window["src"] = labels

    # Long format (label, field, value), then a single wide row keyed by
    # "<label>_<field>".
    long = window.melt(id_vars="src")
    long["colname"] = long["src"] + "_" + long["variable"]
    wide = long.set_index("colname")[["value"]].T.reset_index(drop=True)
    wide.columns.name = None

    # The target's time offset is dropped, as are the helper "...index" columns.
    wide = wide.sort_index(axis=1).drop(columns=["output_time"])
    keep = [not name.endswith("index") for name in wide.columns]
    return wide.loc[:, keep]

def generate_model_frame(df, known):
    """Build a model frame with one row per selected rolling window of *df*.

    Each window spans ``known + 1`` consecutive rows of one track: ``known``
    input points followed by the point to predict. Windows are enumerated
    over the whole grouped sequence, and only every fifth position is
    considered. Windows at those positions that are shorter than the full
    size (the start of each track) are discarded.

    Args:
        df: Frame indexed by a level named "track"; the index is moved into
            columns before grouping.
        known: Number of input points per window.

    Returns:
        The concatenation of ``generate_row`` applied to every kept window,
        with a fresh integer index.
    """
    points_per_window = known + 1
    stride = 4 + 1  # keep one window, then skip the following four

    rolling = df.reset_index(drop=False).groupby('track').rolling(points_per_window)

    selected = []
    for position, window in enumerate(rolling):
        # The position counts every window, including the incomplete ones.
        if position % stride != 0:
            continue
        if len(window) != points_per_window:
            continue
        selected.append(window)

    # The windows are materialised first so tqdm knows the total to display.
    jobs = (delayed(generate_row)(window) for window in tqdm(selected))
    rows = Parallel(n_jobs=-1)(jobs)
    return pd.concat(rows, ignore_index=True)

def _count_csv_rows(csv_path):
    """Return the number of data rows in a CSV file that starts with one header line."""
    with open(csv_path, encoding="utf-8") as handle:
        line_count = sum(1 for _ in handle)
    return max(line_count - 1, 0)


# Build one train/test model frame per number of known points, largest first.
#
# Every frame is down-sampled to the row count of the frames for the first
# (largest) `known`, so all saved files have the same number of rows.
# Frames whose two CSV files already exist are skipped, which lets an
# interrupted run be resumed.
output_dir = Path("../workdir/AIS-ModelFrames")
known_values = range(12, 2, -1)
reference_known = known_values[0]

# Row limits shared by all frames; resolved lazily so that a fully completed
# run does not have to touch the reference files at all.
trainlimit = None
testlimit = None

for known in known_values:
    path = output_dir / f"{filename}_train_{known}_0.csv"
    test_path = output_dir / f"{filename}_test_{known}_0.csv"

    # Both files must be present: a crash between the two writes would
    # otherwise leave the test frame missing forever.
    if path.exists() and test_path.exists():
        continue

    # Fail early (before the long generation step) if the output location
    # cannot be created.
    output_dir.mkdir(parents=True, exist_ok=True)

    if trainlimit is None and known != reference_known:
        # Resumed run: the reference frames were written by an earlier run,
        # so recover the limits from the saved files instead of failing on
        # an undefined name.
        trainlimit = _count_csv_rows(output_dir / f"{filename}_train_{reference_known}_0.csv")
        testlimit = _count_csv_rows(output_dir / f"{filename}_test_{reference_known}_0.csv")
        print(f"Trainlimit: {trainlimit}")
        print(f"Testlimit: {testlimit}")

    train_model = generate_model_frame(train, known)
    test_model = generate_model_frame(test, known)

    if trainlimit is None:
        # First frame of a fresh run: its size defines the limits.
        trainlimit = len(train_model)
        print(f"Trainlimit: {trainlimit}")

        testlimit = len(test_model)
        print(f"Testlimit: {testlimit}")

    # Seeded like the ship split above so repeated runs give the same files.
    print(train_model.shape)
    train_model = train_model.sample(n=trainlimit, random_state=42)
    print(train_model.shape)

    print(test_model.shape)
    test_model = test_model.sample(n=testlimit, random_state=42)
    print(test_model.shape)

    train_model.to_csv(path, index=False)
    test_model.to_csv(test_path, index=False)
    print(f"Done with {known}")

100%|██████████| 2681903/2681903 [21:09<00:00, 2112.03it/s] 
100%|██████████| 1112344/1112344 [12:13<00:00, 1516.99it/s]


Trainlimit: 2681903
Testlimit: 1112344
(2681903, 38)
(2681903, 38)
(1112344, 38)
(1112344, 38)
Done with 12


100%|██████████| 2681903/2681903 [27:03<00:00, 1652.08it/s] 
100%|██████████| 1112344/1112344 [11:55<00:00, 1553.94it/s]


(2681903, 35)
(2681903, 35)
(1112344, 35)
(1112344, 35)
Done with 11


100%|██████████| 2839662/2839662 [28:56<00:00, 1635.19it/s] 
100%|██████████| 1177776/1177776 [12:36<00:00, 1556.22it/s]


(2839662, 32)
(2681903, 32)
(1177776, 32)
(1112344, 32)
Done with 10


100%|██████████| 2839662/2839662 [28:43<00:00, 1647.86it/s] 
100%|██████████| 1177776/1177776 [12:36<00:00, 1556.70it/s]


(2839662, 29)
(2681903, 29)
(1177776, 29)
(1112344, 29)
Done with 9


100%|██████████| 2839662/2839662 [30:52<00:00, 1532.63it/s] 
100%|██████████| 1177776/1177776 [12:35<00:00, 1558.60it/s]


(2839662, 26)
(2681903, 26)
(1177776, 26)
(1112344, 26)
Done with 8


100%|██████████| 2839662/2839662 [30:45<00:00, 1538.60it/s] 
100%|██████████| 1177776/1177776 [12:37<00:00, 1553.98it/s]


(2839662, 23)
(2681903, 23)
(1177776, 23)
(1112344, 23)
Done with 7


100%|██████████| 2839662/2839662 [30:50<00:00, 1534.54it/s] 
100%|██████████| 1177776/1177776 [12:33<00:00, 1562.24it/s]


(2839662, 20)
(2681903, 20)
(1177776, 20)
(1112344, 20)
Done with 6


100%|██████████| 2997421/2997421 [30:32<00:00, 1635.28it/s] 
100%|██████████| 1243208/1243208 [13:16<00:00, 1560.02it/s]


(2997421, 17)
(2681903, 17)
(1243208, 17)
(1112344, 17)
Done with 5


100%|██████████| 2997421/2997421 [32:38<00:00, 1530.29it/s] 
100%|██████████| 1243208/1243208 [13:16<00:00, 1560.73it/s]


(2997421, 14)
(2681903, 14)
(1243208, 14)
(1112344, 14)
Done with 4


100%|██████████| 2997421/2997421 [32:18<00:00, 1546.51it/s] 
100%|██████████| 1243208/1243208 [13:15<00:00, 1563.41it/s]


(2997421, 11)
(2681903, 11)
(1243208, 11)
(1112344, 11)
Done with 3
