# LSTM Ray Training

## Data Preprocessing
1. Target MRO selection -- `["mro"]` or a list of sub-MRO columns, e.g. `["sub_mro1", "sub_mro2", ...]`
2. Add previous MRO
3. Deal with purchase time
4. Weekly aggregation
5. Standardization
    - Continuous features
    - Categorical features


--- 

### Target MRO Selection

In [None]:
import pandas as pd

# Load the daily MRO table; the first CSV column is used as the index.
file_name = "./Data/mro_daily_clean.csv"
data = pd.read_csv(file_name, index_col=0, engine="pyarrow")

# control parameter: target_mro
# a list defined by user: either ["mro"] or a non-empty subset of `mro_detail`
target_mro: list = ["mro"]


# Sub-MRO indicator columns that may be combined into a custom target.
mro_detail = [
    "battery_dummy",
    "brake_dummy",
    "tire_dummy",
    "lof_dummy",
    "wiper_dummy",
    "filter_dummy",
    "others",
]
if target_mro == ["mro"]:
    data["target_mro"] = data["mro"]
elif (
    isinstance(target_mro, list)
    # An empty list would pass the `all(...)` check below and silently produce
    # an all-NaN target, so it is treated as an invalid definition.
    and target_mro
    and all(col in mro_detail for col in target_mro)
):
    # The target is the row-wise maximum over the selected sub-MRO columns.
    data["target_mro"] = data[target_mro].max(axis=1)
else:
    print("Target MRO is defined with error")
    print("Use the mro as default mro")
    target_mro = ["mro"]
    data["target_mro"] = data["mro"]

---

### Add Previous MRO

In [None]:
# control parameter: add_mro_prev
# When enabled, each row also carries the "mro" value of the preceding row of
# the same id as an additional feature column.
add_mro_prev: bool = True


# Column name(s) of the lag feature; empty when the feature is switched off.
mro_prev = ["mro_prev"] if add_mro_prev else []

if add_mro_prev:
    # Order rows by id, year and week so that the shift looks one row back
    # within each id.
    data.sort_values(by=["id", "yr_nbr", "week_nbr"], inplace=True)
    data["mro_prev"] = data.groupby("id")["mro"].shift()

---

### Dealing with Purchase Time

In [None]:
# control parameter: add_purchase_time
# When enabled, a categorical "<purchase year>_<half of year>" feature is built.
add_purchase_time: bool = True


# Column name(s) of the purchase-time feature; empty when it is disabled.
purchase_time = []

if add_purchase_time:
    data["purchase_month"] = data["purchase_mth_nbr"].astype(int)

    # Divide the months into 2 bins: 1-6 is the first half, 7-12 the second.
    data["purchase_half_year"] = pd.cut(
        data["purchase_month"],
        bins=[0, 6, 12],
        labels=["first_half", "second_half"],
    )

    # Combine the purchase year and the half-year label into one string.
    purchase_year_text = data["purchase_yr_nbr"].astype(int).astype(str)
    data["purchase_time"] = (
        purchase_year_text + "_" + data["purchase_half_year"].astype(str)
    )

    purchase_time.append("purchase_time")

---

### Weekly Aggregation

In [None]:
# Numeric feature columns.
continuous_variable = [
    "hard_braking",
    "hard_acceleration",
    "speeding_sum",
    "day_mileage",
    "engn_size",
    "est_hh_incm_prmr_cd",
    "purchaser_age_at_tm_of_purch",
    "tavg",
    "random_avg_traffic",
]

# Categorical feature columns; the purchase-time feature is appended only when
# it was created in the previous step.
category_variable = [
    "gmqualty_model",
    "umf_xref_finc_gbl_trim",
    "input_indiv_gndr_prmr_cd",
    *purchase_time,
]

# Key columns identifying the id and the calendar position of each row.
driver_navigation = ["id", "yr_nbr", "mth_nbr", "week_nbr"]

# Keep only the columns needed downstream, in a fixed order.
data = data[
    [
        *driver_navigation,
        *continuous_variable,
        *category_variable,
        *mro_prev,
        "target_mro",
    ]
]

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from utils import create_train_test_group

# control parameter: aggregation function
# Statistics computed per (id, year, week) for the time-varying columns.
agg_fun = ["mean", "sum", "max", "min", "std", "skew"]

# control parameter: add_driver_behavior
# When True, the driving-behaviour columns are aggregated with `agg_fun` too.
add_driver_behavior = False


# Per-column aggregation rules. Insertion order matters: it fixes the column
# order of the aggregated frame and therefore of the feature list built later.
agg_rules = {"mth_nbr": "first", "target_mro": "max"}
agg_rules.update(
    {
        column: "first"
        for column in (
            "est_hh_incm_prmr_cd",
            "purchaser_age_at_tm_of_purch",
            "input_indiv_gndr_prmr_cd",
            "gmqualty_model",
            "umf_xref_finc_gbl_trim",
            "engn_size",
        )
    }
)
agg_rules.update({column: agg_fun for column in ("tavg", "random_avg_traffic")})

if add_driver_behavior:
    agg_rules.update(
        {
            column: agg_fun
            for column in (
                "hard_braking",
                "hard_acceleration",
                "speeding_sum",
                "day_mileage",
            )
        }
    )
if add_mro_prev:
    agg_rules["mro_prev"] = "max"
if add_purchase_time:
    agg_rules["purchase_time"] = "first"


# Collapse the rows to one row per id / year / week and turn the group keys
# back into ordinary columns.
data = data.groupby(["id", "yr_nbr", "week_nbr"]).agg(agg_rules).reset_index()


def flatten_columns(df: pd.DataFrame):
    """Replace tuple column labels of an aggregated frame with flat strings.

    A ``(name, func)`` label becomes ``name`` when ``func`` is ``"first"`` or
    empty, or when ``func`` is ``"max"`` and ``name`` is ``"target_mro"`` or one
    of the module-level ``mro_prev`` columns; otherwise it becomes
    ``"name_func"``. Non-tuple labels are kept unchanged. The frame is modified
    in place and also returned.
    """
    flat_labels = []
    for label in df.columns:
        if not isinstance(label, tuple):
            flat_labels.append(label)
            continue

        base_name, suffix = label
        suffix = suffix.strip()
        keeps_base_name = suffix in ("first", "") or (
            suffix == "max" and base_name in (["target_mro"] + mro_prev)
        )
        flat_labels.append(base_name if keeps_base_name else f"{base_name}_{suffix}")

    df.columns = flat_labels
    return df


# Flatten the aggregated column labels, replace missing values with 0 and drop
# the calendar columns that are not used as model features.
data = (
    flatten_columns(data)
    .fillna(0)
    .drop(columns=["yr_nbr", "week_nbr", "mth_nbr"])
)

# Notebook display of the aggregated table.
data

---

### Standardization

In [None]:
# Every column except the target, the lag feature, the id and the categorical
# columns is treated as continuous and standardized.
_not_scaled = {"target_mro", "id", *mro_prev, *category_variable}
col_need_std = [
    column for column in data.columns.values.tolist() if column not in _not_scaled
]

col_need_encode = category_variable


# NOTE(review): the scaler and the encoder are fitted on the full table, before
# any train/valid/test split — confirm this is intended.
scaler = StandardScaler()
data[col_need_std] = scaler.fit_transform(data[col_need_std])


encoder = OneHotEncoder(sparse_output=False)
encoded_categorical = encoder.fit_transform(data[col_need_encode])

# Number of distinct categories found for each encoded column.
category_counts = [len(categories) for categories in encoder.categories_]

# One generated name per one-hot output column: "<column>_onehot_<index>".
onehot_feature_names = [
    f"{column}_onehot_{position}"
    for column, count in zip(col_need_encode, category_counts)
    for position in range(count)
]

encoded_df = pd.DataFrame(
    encoded_categorical, index=data.index, columns=onehot_feature_names
)

# Swap the raw categorical columns for their one-hot representation.
data = pd.concat([data, encoded_df], axis=1).drop(columns=col_need_encode)

data

---

In [None]:
# control parameter: sample_frac, test_size, valid_size
sample_frac = 1.0
test_size = 0.1
valid_size = 0.1


# Model inputs: scaled continuous columns, one-hot columns and the optional
# lag feature. The target is the single "target_mro" column.
rnn_features = [*col_need_std, *onehot_feature_names, *mro_prev]
rnn_target = ["target_mro"]
col_rnn_origin = ["id", *rnn_features, *rnn_target]

# Work on a copy limited to the id, feature and target columns, then let the
# project helper assign the train/valid/test groups.
# NOTE(review): presumably the helper adds a group label per row — confirm in utils.
data_rnn_origin = create_train_test_group(
    data[col_rnn_origin].copy(),
    sample_frac=sample_frac,
    test_size=test_size,
    valid_size=valid_size,
    random_state=42,
)

data_rnn_origin

In [None]:
# Show the feature list and the target list, each under its own heading line.
print("The RNN input features are:", rnn_features, sep="\n")
print("The RNN target is:", rnn_target, sep="\n")

In [None]:
# Notebook display: echo the list of RNN input feature names.
rnn_features

---

## Model Training
### Build the data loader

In [None]:
from model import mroRnnDataset
# ---------------------------------------------------------
# Sequence length passed to the dataset class for both splits.
max_seq_length = 8

# Arguments shared by the training and validation datasets; only `group`
# differs between the two.
_dataset_kwargs = {
    "data_rnn_origin": data_rnn_origin,
    "rnn_features": rnn_features,
    "rnn_target": rnn_target,
    "max_seq_length": max_seq_length,
}

train_data_set = mroRnnDataset(group="train", **_dataset_kwargs)
val_data_set = mroRnnDataset(group="valid", **_dataset_kwargs)

In [None]:
# Model input width = number of feature columns; output width = number of targets.
input_feature_size, output_size = len(rnn_features), len(rnn_target)

In [None]:
# alpha is the share of rows whose target is not 1, computed over every row of
# `data_rnn_origin`.
_positive_share = (data_rnn_origin["target_mro"] == 1).mean()
alpha = 1 - _positive_share
print(f"Alpha value for Focal Loss: {alpha}")

gamma = 4
print(f"Gamma value for Focal Loss: {gamma}")

In [None]:
import os
import time

# ---------------------------------------------------------
# log file: stored under an absolute path; its directory is created on demand
result_csv_file = os.path.abspath(
    "./Out/ray_lstm_no_driver_behavior/ray_train_log.csv"
)
os.makedirs(os.path.dirname(result_csv_file), exist_ok=True)

# start from a clean log: if the file exists, delete it
if os.path.exists(result_csv_file):
    os.remove(result_csv_file)
    print(f"Deleted existing CSV file: {result_csv_file}")

# ---------------------------------------------------------
# model output: one timestamped sub-directory per run
output_dir = "./Out/ray_lstm_no_driver_behavior"
os.makedirs(output_dir, exist_ok=True)
current_time = time.strftime("%Y%m%d_%H%M%S")
run_dir = os.path.abspath(os.path.join(output_dir, current_time))
os.makedirs(run_dir, exist_ok=True)

# ---------------------------------------------------------
# early stop control parameter
early_stop_patience = 20
best_val_loss = float("inf")
early_stop_counter = best_epoch = 0

In [None]:
# control parameter: recurrent network configuration (passed to RnnModel)
rnn_type = "LSTM"
rnn_output_size = 128
num_layers = 2
bidirectional = True
use_last_hidden = True
pooling_method = num_heads = None

# control parameter: optimisation and data loading
learning_rate = 5e-4
batch_size = 4096
num_workers = 4  # worker processes of each DataLoader
num_epochs = 1000

# ---------------------------------------------------------
# early stop control parameter (state is reset whenever this cell is run)
early_stop_patience = 20
best_val_loss = float("inf")
early_stop_counter = best_epoch = 0
# ---------------------------------------------------------

In [None]:
from model import FocalLoss
from model import RnnModel
from utils import collate_fn

import torch
from torch import optim
from torch.utils.data import DataLoader


import ray.train.torch

from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
)


def train_func():
    """Run the per-worker training loop launched by Ray Train's ``TorchTrainer``.

    All configuration is read from module-level names: the model
    hyper-parameters, ``train_data_set`` / ``val_data_set``, ``alpha`` /
    ``gamma``, ``run_dir``, ``result_csv_file`` and the early-stopping state
    (``best_val_loss``, ``early_stop_counter``, ``best_epoch``), which is
    updated through ``global``.

    Each epoch trains on the training loader, evaluates on the validation
    loader and reports the metrics with ``ray.train.report``. Rank 0
    additionally appends the metrics to ``result_csv_file`` and, whenever the
    validation loss improves, stores the weights in ``run_dir/model.pt`` and
    attaches ``run_dir`` as the reported checkpoint. Training stops after
    ``early_stop_patience`` epochs without improvement.
    """
    global best_val_loss, early_stop_counter, best_epoch

    context = ray.train.get_context()
    is_rank_zero = context.get_world_rank() == 0
    is_distributed = context.get_world_size() > 1

    # Model, Loss, Optimizer
    model = RnnModel(
        rnn_type=rnn_type,
        input_size=input_feature_size,
        rnn_output_size=rnn_output_size,
        output_size=output_size,
        bidirectional=bidirectional,
        num_layers=num_layers,
        pooling_method=pooling_method,
        num_heads=num_heads,
        # Use the notebook-level control parameter instead of a literal.
        use_last_hidden=use_last_hidden,
    )
    # [1] Prepare model (device placement is done by `prepare_model`).
    model = ray.train.torch.prepare_model(model)
    device = next(model.parameters()).device
    criterion = FocalLoss(alpha=alpha, gamma=gamma)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    # Data
    # NOTE(review): the training loader does not shuffle although `set_epoch`
    # is called below — confirm whether `shuffle=True` was intended.
    train_dataloader = DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=num_workers,
    )
    val_dataloader = DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=num_workers,
    )

    # [2] Prepare dataloader.
    train_loader = ray.train.torch.prepare_data_loader(train_dataloader)
    val_loader = ray.train.torch.prepare_data_loader(val_dataloader)

    # Training
    for epoch in range(num_epochs):
        if is_distributed:
            train_loader.sampler.set_epoch(epoch)

        # ------------------------------------------------
        # Training pass
        model.train()
        train_running_loss = 0.0
        all_train_mro_preds = []
        all_train_mro_targets = []
        all_train_mro_scores = []
        for train_inputs, train_targets, train_lengths in train_loader:
            optimizer.zero_grad()
            # The last time step is removed from the inputs; the label is the
            # target at that last step.
            # NOTE(review): `train_lengths` is passed on unchanged — confirm the
            # model expects lengths that still include the removed step.
            train_inputs = train_inputs[:, :-1, :]
            step_targets = train_targets[:, -1, :]

            model_out = model(train_inputs, train_lengths)
            loss = criterion(model_out, step_targets)
            loss.backward()
            optimizer.step()
            # One loss value per batch is accumulated here.
            train_running_loss += loss.item()

            scores = torch.sigmoid(model_out).detach()
            all_train_mro_preds.extend((scores > 0.5).int().cpu().numpy().flatten())
            all_train_mro_targets.extend(step_targets.cpu().numpy().flatten())
            all_train_mro_scores.extend(scores.cpu().numpy().flatten())

        # `len(loader)` is the number of batches, so dividing the summed batch
        # losses by it yields the mean batch loss. (The previous code divided by
        # `len(loader) / batch_size`, which inflated the value by `batch_size`.)
        train_average_loss = train_running_loss / max(len(train_loader), 1)
        train_f1 = f1_score(all_train_mro_targets, all_train_mro_preds)
        train_accuracy = accuracy_score(all_train_mro_targets, all_train_mro_preds)
        train_recall = recall_score(all_train_mro_targets, all_train_mro_preds)
        train_precision = precision_score(all_train_mro_targets, all_train_mro_preds)
        train_auc = roc_auc_score(all_train_mro_targets, all_train_mro_scores)

        # ------------------------------------------------
        # Validation pass
        model.eval()
        val_running_loss = 0.0
        all_val_mro_preds = []
        all_val_mro_targets = []
        all_val_mro_scores = []

        with torch.no_grad():
            for val_inputs, val_targets, val_lengths in val_loader:
                val_inputs = val_inputs[:, :-1, :]
                step_targets = val_targets[:, -1, :]

                model_out = model(val_inputs, val_lengths)
                loss = criterion(model_out, step_targets)
                val_running_loss += loss.item()

                scores = torch.sigmoid(model_out)
                all_val_mro_preds.extend((scores > 0.5).int().cpu().numpy().flatten())
                all_val_mro_targets.extend(step_targets.cpu().numpy().flatten())
                all_val_mro_scores.extend(scores.cpu().numpy().flatten())

        val_average_loss = val_running_loss / max(len(val_loader), 1)
        val_f1 = f1_score(all_val_mro_targets, all_val_mro_preds)
        val_accuracy = accuracy_score(all_val_mro_targets, all_val_mro_preds)
        val_recall = recall_score(all_val_mro_targets, all_val_mro_preds)
        val_precision = precision_score(all_val_mro_targets, all_val_mro_preds)
        val_auc = roc_auc_score(all_val_mro_targets, all_val_mro_scores)

        # ------------------------------------------------
        # Early-stopping decision.
        # Each worker evaluates its own share of the validation data, so the
        # local losses differ between workers. The decision is based on the
        # mean over all workers so that every worker improves / stops in the
        # same epoch; otherwise the remaining workers would wait forever on a
        # worker that has already left the loop.
        decision_loss = val_average_loss
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            synced = torch.tensor(
                [val_average_loss], dtype=torch.float64, device=device
            )
            torch.distributed.all_reduce(synced)  # default reduction: sum
            decision_loss = synced.item() / torch.distributed.get_world_size()

        improved = decision_loss < best_val_loss
        if improved:
            best_val_loss = decision_loss
            best_epoch = epoch
            early_stop_counter = 0
        else:
            early_stop_counter += 1

        # ------------------------------------------------
        # [3] Report metrics and checkpoint.
        metrics = {
            "epoch": epoch,
            "train_average_loss": train_average_loss,
            "val_average_loss": val_average_loss,
            "train_f1": train_f1,
            "train_accuracy": train_accuracy,
            "train_recall": train_recall,
            "train_precision": train_precision,
            "train_auc": train_auc,
            "val_f1": val_f1,
            "val_accuracy": val_accuracy,
            "val_recall": val_recall,
            "val_precision": val_precision,
            "val_auc": val_auc,
        }

        # Only rank 0 writes the model file, and it does so before reporting,
        # so the reported checkpoint directory always contains the current
        # best weights and workers never race on the same file.
        checkpoint = None
        if is_rank_zero and improved:
            os.makedirs(run_dir, exist_ok=True)
            # NOTE(review): when `prepare_model` wraps the model for
            # data-parallel training the state-dict keys carry the wrapper's
            # prefix — confirm how the file is loaded.
            torch.save(model.state_dict(), os.path.join(run_dir, "model.pt"))
            checkpoint = ray.train.Checkpoint.from_directory(run_dir)

        # Every worker must call `report`; only rank 0 attaches a checkpoint.
        ray.train.report(metrics, checkpoint=checkpoint)

        if is_rank_zero:
            df = pd.DataFrame([metrics])
            # Append to CSV; the header is written only when the file is new.
            write_header = not os.path.exists(result_csv_file)
            df.to_csv(result_csv_file, mode="a", header=write_header, index=False)

            print(metrics)

        if early_stop_counter >= early_stop_patience:
            print(f"Early stopping at epoch {epoch} due to no improvement in val loss.")
            break


# [4] Configure scaling and resource requirements: four GPU training workers.
scaling_config = ray.train.ScalingConfig(use_gpu=True, num_workers=4)

# [5] Launch distributed training job and wait for it to finish.
trainer = ray.train.torch.TorchTrainer(
    train_loop_per_worker=train_func,
    scaling_config=scaling_config,
)
result = trainer.fit()

---

# Result Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Training logs to compare: the benchmark run and our model's run.
benchmark_log_path = "./Out/ray_lstm_no_driver_behavior/ray_train_log.csv"
our_model_log_path = "./Out/ray_lstm/ray_train_log.csv"

benchmark_data, our_model_data = (
    pd.read_csv(log_path) for log_path in (benchmark_log_path, our_model_log_path)
)

# 3 x 2 grid of axes, one panel per metric.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(16, 18))


def plot_metric(
    ax,
    benchmark_data,
    our_model_data,
    metric_name,
    title,
    ylabel,
    model1_label="Benchmark Model",
    model2_label="Our Model",
):
    """Draw the train and validation curves of one metric for two models.

    Both data arguments are indexed with ``"epoch"``, ``"train_<metric_name>"``
    and ``"val_<metric_name>"``. Four lines are drawn on ``ax`` (train curves
    dashed, validation curves with the default line style), then the title,
    axis labels, legend and grid are set.
    """
    train_column = f"train_{metric_name}"
    val_column = f"val_{metric_name}"

    # (log table, legend prefix, train marker, validation marker)
    curve_specs = (
        (benchmark_data, model1_label, "o", "s"),
        (our_model_data, model2_label, "^", "v"),
    )
    for log, legend_prefix, train_marker, val_marker in curve_specs:
        ax.plot(
            log["epoch"],
            log[train_column],
            marker=train_marker,
            label=f"{legend_prefix} Train",
            linestyle="--",
        )
        ax.plot(
            log["epoch"],
            log[val_column],
            marker=val_marker,
            label=f"{legend_prefix} Val",
        )

    ax.set_title(title)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(True)


# One panel per metric, filled row by row:
# loss, F1 | recall, precision | accuracy, AUC.
_panel_specs = [
    ("average_loss", "Loss vs Epoch", "Average Loss"),
    ("f1", "F1 Score vs Epoch", "F1 Score"),
    ("recall", "Recall vs Epoch", "Recall"),
    ("precision", "Precision vs Epoch", "Precision"),
    ("accuracy", "Accuracy vs Epoch", "Accuracy"),
    ("auc", "AUC vs Epoch", "AUC"),
]
for _panel_ax, (_metric, _panel_title, _panel_ylabel) in zip(axes.flat, _panel_specs):
    plot_metric(
        _panel_ax,
        benchmark_data,
        our_model_data,
        _metric,
        _panel_title,
        _panel_ylabel,
    )


plt.tight_layout()
plt.show()

---

# Summary of Data Preprocessing

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder


def preprocess_data(
    file_name: str,
    target_mro: list,
    maintain_repair_mro: str,
    add_mro_prev: bool,
    add_purchase_time: bool,
    add_driver_behavior: bool,
    agg_weeks: int,
    agg_fun: list,
):
    """Load the daily MRO table and turn it into the model-ready feature table.

    Steps: target selection, optional maintenance/repair target, optional
    previous-MRO feature, optional purchase-time feature, aggregation over
    blocks of ``agg_weeks`` weeks, standardization of the continuous columns
    and one-hot encoding of the categorical columns.

    Args:
        file_name: Path of the CSV file; its first column is used as the index.
        target_mro: ``["mro"]`` or a non-empty list of sub-MRO indicator
            columns. Any other value falls back to ``["mro"]``.
        maintain_repair_mro: ``"maintenance"`` sets the target to rows with
            ``mro == 1`` and ``service_days <= 3``; ``"repair"`` to rows with
            ``mro == 1`` and ``service_days > 3``; any other value keeps the
            target chosen by ``target_mro``.
        add_mro_prev: Add the previous row's ``mro`` of the same id as a feature.
        add_purchase_time: Add the ``"<year>_<half>"`` purchase-time category.
        add_driver_behavior: Aggregate the driving-behaviour columns as well.
        agg_weeks: Number of consecutive weeks merged into one row (>= 1).
        agg_fun: Aggregation functions applied to the time-varying columns.

    Returns:
        A dict with ``"data"`` (the processed DataFrame), ``"rnn_features"``
        (feature column names) and ``"rnn_target"`` (``["target_mro"]``).

    Raises:
        ValueError: If ``agg_weeks`` is smaller than 1.
    """
    # A non-positive block size would make the week grouping below meaningless
    # (division by zero or a reversed grouping), so fail early.
    if agg_weeks < 1:
        raise ValueError(f"agg_weeks must be a positive integer, got {agg_weeks!r}")

    data = pd.read_csv(file_name, index_col=0, engine="pyarrow")
    print("Load the dataset", file_name, "successfully.")
    # --------------------------------------------------
    # target mro selection
    mro_detail = [
        "battery_dummy",
        "brake_dummy",
        "tire_dummy",
        "lof_dummy",
        "wiper_dummy",
        "filter_dummy",
        "others",
    ]
    if target_mro == ["mro"]:
        data["target_mro"] = data["mro"]
    elif (
        isinstance(target_mro, list)
        # An empty list would pass `all(...)` and yield an all-NaN target.
        and target_mro
        and all(col in mro_detail for col in target_mro)
    ):
        data["target_mro"] = data[target_mro].max(axis=1)
    else:
        print("Target MRO is defined with error")
        print("Use the mro as default mro")
        target_mro = ["mro"]
        data["target_mro"] = data["mro"]
    print("The MRO choosen is:", target_mro)
    # --------------------------------------------------
    # choose full mro, maintenance or repair
    # The indicators are built from a boolean mask. (`DataFrame.where` keeps or
    # replaces whole rows of the frame and cannot produce a 0/1 column.)
    if maintain_repair_mro == "maintenance":
        data["maintenance"] = (
            (data["mro"] == 1) & (data["service_days"] <= 3)
        ).astype(int)
        data["target_mro"] = data["maintenance"]
        print("Target MRO is maintenance.")
    elif maintain_repair_mro == "repair":
        data["repair"] = ((data["mro"] == 1) & (data["service_days"] > 3)).astype(int)
        data["target_mro"] = data["repair"]
        print("Target MRO is Repair.")
    else:
        print("No need to know the maintenance or repair.")
        print("Use the Target MRO.")
    # --------------------------------------------------
    # add previous mro
    if add_mro_prev:
        data.sort_values(by=["id", "yr_nbr", "week_nbr"], inplace=True)
        data["mro_prev"] = data.groupby("id")["mro"].shift(1)
        mro_prev = ["mro_prev"]
    else:
        mro_prev = []
    print("Add Previous MRO:", add_mro_prev)
    # --------------------------------------------------
    # dealing with purchase time
    if add_purchase_time:
        data["purchase_month"] = data["purchase_mth_nbr"].astype(int)
        # divide into 2 bins: 1-6 is the first half, 7-12 is the second half
        data["purchase_half_year"] = pd.cut(
            data["purchase_month"],
            bins=[0, 6, 12],
            labels=["first_half", "second_half"],
        )

        data["purchase_time"] = (
            data["purchase_yr_nbr"].astype(int).astype(str)
            + "_"
            + data["purchase_half_year"].astype(str)
        )

        purchase_time = ["purchase_time"]
    else:
        purchase_time = []
    print("Add Purchase Time:", add_purchase_time)
    # --------------------------------------------------
    # weekly aggregation
    continuous_variable = [
        "hard_braking",
        "hard_acceleration",
        "speeding_sum",
        "day_mileage",
        "engn_size",
        "est_hh_incm_prmr_cd",
        "purchaser_age_at_tm_of_purch",
        "tavg",
        "random_avg_traffic",
    ]

    category_variable = [
        "gmqualty_model",
        "umf_xref_finc_gbl_trim",
        "input_indiv_gndr_prmr_cd",
    ] + purchase_time

    driver_navigation = [
        "id",
        "yr_nbr",
        "mth_nbr",
        "week_nbr",
    ]

    # `.copy()` gives an independent frame, so adding "group_week" below does
    # not write into a slice of the loaded table.
    data = data[
        driver_navigation
        + continuous_variable
        + category_variable
        + mro_prev
        + ["target_mro"]
    ].copy()

    # Insertion order fixes the column order of the aggregated frame.
    agg_rules = {
        "target_mro": "max",
        "est_hh_incm_prmr_cd": "first",
        "purchaser_age_at_tm_of_purch": "first",
        "input_indiv_gndr_prmr_cd": "first",
        "gmqualty_model": "first",
        "umf_xref_finc_gbl_trim": "first",
        "engn_size": "first",
        "tavg": agg_fun,
        "random_avg_traffic": agg_fun,
    }
    # --------------------------------------------------
    if add_driver_behavior:
        agg_rules["hard_braking"] = agg_fun
        agg_rules["hard_acceleration"] = agg_fun
        agg_rules["speeding_sum"] = agg_fun
        agg_rules["day_mileage"] = agg_fun
    print("Add behavior of Driver:", add_driver_behavior)
    # --------------------------------------------------
    if add_mro_prev:
        agg_rules["mro_prev"] = "max"
    if add_purchase_time:
        agg_rules["purchase_time"] = "first"
    # --------------------------------------------------
    # week aggregate: weeks 1..agg_weeks form block 0, the next agg_weeks
    # weeks block 1, and so on (within each year).
    data["group_week"] = (data["week_nbr"] - 1) // agg_weeks
    print("Aggregate the data into", agg_weeks, "week.")
    data = data.groupby(["id", "yr_nbr", "group_week"]).agg(agg_rules)
    # --------------------------------------------------
    data.reset_index(inplace=True)

    def flatten_columns(df: pd.DataFrame):
        """Flatten ``(column, function)`` labels into plain string labels."""

        def clean_col(col):
            if isinstance(col, tuple):
                col_name, agg_func = col
                agg_func = agg_func.strip()
                # Target and lag feature keep their plain name after "max".
                if col_name in (["target_mro"] + mro_prev) and agg_func == "max":
                    return col_name
                if agg_func in ("first", ""):
                    return col_name
                return f"{col_name}_{agg_func}"
            else:
                return col

        df.columns = [clean_col(col) for col in df.columns]
        return df

    data = flatten_columns(data)
    data.fillna(0, inplace=True)
    data = data.drop(["yr_nbr", "group_week"], axis=1)
    # --------------------------------------------------
    # Standardization
    # NOTE(review): scaler and encoder are fitted on the whole table, before
    # any train/valid/test split — confirm this is intended.
    col_need_std = [
        item
        for item in data.columns.values.tolist()
        if item not in (["target_mro"] + mro_prev + ["id"] + category_variable)
    ]

    col_need_encode = category_variable

    scaler = StandardScaler()
    data[col_need_std] = scaler.fit_transform(data[col_need_std])

    encoder = OneHotEncoder(sparse_output=False)
    encoded_categorical = encoder.fit_transform(data[col_need_encode])

    # One generated name per one-hot output column: "<column>_onehot_<index>".
    onehot_feature_names = [
        f"{col}_onehot_{i}"
        for col, categories in zip(col_need_encode, encoder.categories_)
        for i in range(len(categories))
    ]

    encoded_df = pd.DataFrame(
        encoded_categorical, index=data.index, columns=onehot_feature_names
    )
    data = pd.concat([data, encoded_df], axis=1)
    data = data.drop(columns=col_need_encode)
    print("Finish the process of data standardization")

    rnn_features = col_need_std + onehot_feature_names + mro_prev
    print("The RNN features are:", rnn_features)

    rnn_target = ["target_mro"]
    print("The RNN target is:", rnn_target)

    return {
        "data": data,
        "rnn_features": rnn_features,
        "rnn_target": rnn_target,
    }

In [None]:
# Settings of this preprocessing run: full MRO target, weekly rows, with the
# lag, purchase-time and driving-behaviour features enabled.
_preprocess_settings = {
    "file_name": "./Data/mro_daily_clean.csv",
    "target_mro": ["mro"],
    "maintain_repair_mro": "full",
    "add_mro_prev": True,
    "add_purchase_time": True,
    "add_driver_behavior": True,
    "agg_weeks": 1,
    "agg_fun": ["mean", "sum", "max", "min", "std", "skew"],
}
prep_data = preprocess_data(**_preprocess_settings)

In [None]:
# Notebook display: the processed feature table returned by preprocess_data.
prep_data["data"]

In [None]:
# Notebook display: the list of feature column names returned by preprocess_data.
prep_data["rnn_features"]

* 1-week 3,972,103 rows x 81 cols
* 2-week 2,104,340 rows x 81 cols
* 8-week 664,004 rows x 81 cols
    * 415,819 rows x 66 cols (2018)
    * 248,185 rows x 77 cols (2019)

In [None]:
# Unpack the preprocessing result and keep only the id, feature and target
# columns in an independent copy.
data, rnn_features, rnn_target = (
    prep_data[key] for key in ("data", "rnn_features", "rnn_target")
)
col_rnn_origin = ["id", *rnn_features, *rnn_target]
data_rnn_origin = data[col_rnn_origin].copy()

In [None]:
# Notebook display: the table restricted to id, features and target.
data_rnn_origin