In [1]:
import pandas as pd
import torch
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch import optim
import torch.nn as nn

try:
    import cupy as cp
    import cupy.cuda.runtime as runtime

    # get the num of GPUs
    n_gpus = runtime.getDeviceCount()
    print(f"Detected {n_gpus} GPU(s):")
    cupy_gpu_id = 7
    cp.cuda.Device(cupy_gpu_id).use()
    print(f"Using GPU {cupy_gpu_id}")
    using_cupy = True
except ImportError:
    print("CuPy is not available, falling back to NumPy.")
    using_cupy = False
    cp = np  # rename cupy to np for compatibility

Detected 10 GPU(s):
Using GPU 7


In [2]:
from utils import create_train_test_group
from utils import mroRnnDataset
from utils import collate_fn

In [3]:
# Check if CUDA (NVIDIA GPU) is available
if torch.cuda.is_available():
    # Get the number of available CUDA devices
    num_gpus = torch.cuda.device_count()
    print(f"Number of CUDA devices available: {num_gpus}")

    # Select a specific GPU (e.g., GPU 0)
    device = torch.device("cuda:8")  # Use "cuda:1" for GPU 1, etc.
    print(f"Using device: {torch.cuda.get_device_name(device)}")

else:
    device = "cpu"
    print("CUDA is not available, using CPU")

print(f"Using {device} device")

Number of CUDA devices available: 10
Using device: NVIDIA GeForce RTX 2080 Ti
Using cuda:8 device


In [4]:
sample_frac = 0.01
test_size = 0.1
max_seq_length = 10
batch_size = 2048
num_workers = 16
rnn_output_size = 16
learning_rate = 0.01
num_epochs = 1000


best_val_loss = float("inf")
counter = 0
model_save_folder = "./Out"
model_name = "Apr_28_LSTM_feature_eng"
patience = 10

In [None]:
file_name = "./Data/mro_daily_clean.csv"
data = pd.read_csv(file_name, index_col=0, engine="pyarrow")

Unnamed: 0,yr_nbr,mth_nbr,week_nbr,week_day,hard_braking,mild_hb,hard_braking2,harsh_hb,very_harsh_hb,est_hh_incm_prmr_cd,...,mro,record_days,latitude1,longitude1,purchase_lat1,purchase_lng1,purchase_yr_nbr,purchase_mth_nbr,tavg,random_avg_traffic
,,,,,,,,,,,,,,,,,,,,,
0,2019,3,13,7,0,0,0,0,0,6.0,...,0.0,70,45.0,-93.1,45.1,-93.2,2019.0,3.0,14.346983,12472.338289
1,2019,4,14,4,9,0,0,0,0,6.0,...,0.0,70,44.8,-92.9,45.1,-93.2,2019.0,3.0,13.670879,12410.618966
2,2019,4,14,6,9,1,0,0,0,6.0,...,0.0,70,44.8,-92.9,45.1,-93.2,2019.0,3.0,13.699830,12391.577959
3,2019,4,14,7,20,8,0,0,0,6.0,...,0.0,70,45.0,-93.1,45.1,-93.2,2019.0,3.0,13.704561,12313.165404
4,2019,4,15,4,0,0,0,0,0,6.0,...,0.0,70,45.0,-93.1,45.1,-93.2,2019.0,3.0,13.884265,12342.054130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18018968,2019,11,48,5,2,1,0,0,0,9.0,...,1.0,269,42.4,-82.9,42.4,-82.9,2018.0,6.0,12.700000,18816.070000
18018969,2019,12,49,5,1,0,0,0,0,9.0,...,0.0,269,42.4,-82.9,42.4,-82.9,2018.0,6.0,2.200000,6551.590000
18018970,2019,12,50,7,5,1,0,0,0,9.0,...,0.0,269,42.5,-82.9,42.4,-82.9,2018.0,6.0,-2.400000,15358.730000


In [None]:
column_need_std = [
    "hard_braking",
    "hard_acceleration",
    "speeding_sum",
    "day_mileage",
    "engn_size",
    "est_hh_incm_prmr_cd",
    "purchaser_age_at_tm_of_purch",
    "tavg",
    "random_avg_traffic",
]

column_after_std = [
    "hard_braking_std",
    "hard_acceleration_std",
    "speeding_sum_std",
    "day_mileage_std",
    "engn_size_std",
    "est_hh_incm_prmr_cd_std",
    "purchaser_age_at_tm_of_purch_std",
    "tavg_std",
    "random_avg_traffic_std",
]

column_need_encode = [
    "gmqualty_model",
    "umf_xref_finc_gbl_trim",
    "input_indiv_gndr_prmr_cd",
]

column_after_encode = [
    "gmqualty_model_encode",
    "umf_xref_finc_gbl_trim_encode",
    "input_indiv_gndr_prmr_cd_encode",
]

column_after_encode_std = [
    "gmqualty_model_encode_std",
    "umf_xref_finc_gbl_trim_encode_std",
    "input_indiv_gndr_prmr_cd_encode_std",
]

# standardize data
scaler = StandardScaler()
data[column_after_std] = scaler.fit_transform(data[column_need_std])

# encode data
label_encoders = {}
for i, col in enumerate(column_need_encode):
    le = LabelEncoder()
    data[column_after_encode[i]] = le.fit_transform(data[col])
    label_encoders[col] = le  # Store the encoder for later use if needed

data[column_after_encode_std] = scaler.fit_transform(data[column_after_encode])

Unnamed: 0,yr_nbr,mth_nbr,week_nbr,week_day,hard_braking,mild_hb,hard_braking2,harsh_hb,very_harsh_hb,est_hh_incm_prmr_cd,...,est_hh_incm_prmr_cd_std,purchaser_age_at_tm_of_purch_std,tavg_std,random_avg_traffic_std,gmqualty_model_encode,umf_xref_finc_gbl_trim_encode,input_indiv_gndr_prmr_cd_encode,gmqualty_model_encode_std,umf_xref_finc_gbl_trim_encode_std,input_indiv_gndr_prmr_cd_encode_std
,,,,,,,,,,,,,,,,,,,,,
0,2019,3,13,7,0,0,0,0,0,6.0,...,-0.120471,1.570537,-0.106431,-0.306925,10,2,1,-1.012058,-1.040603,0.552904
1,2019,4,14,4,9,0,0,0,0,6.0,...,-0.120471,1.570537,-0.213184,-0.309941,10,2,1,-1.012058,-1.040603,0.552904
2,2019,4,14,6,9,1,0,0,0,6.0,...,-0.120471,1.570537,-0.208613,-0.310871,10,2,1,-1.012058,-1.040603,0.552904
3,2019,4,14,7,20,8,0,0,0,6.0,...,-0.120471,1.570537,-0.207865,-0.314701,10,2,1,-1.012058,-1.040603,0.552904
4,2019,4,15,4,0,0,0,0,0,6.0,...,-0.120471,1.570537,-0.179491,-0.313290,10,2,1,-1.012058,-1.040603,0.552904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18018968,2019,11,48,5,2,1,0,0,0,9.0,...,1.424761,0.145570,-0.366480,0.002982,13,6,1,-0.353471,0.813758,0.552904
18018969,2019,12,49,5,1,0,0,0,0,9.0,...,1.424761,0.145570,-2.024373,-0.596169,13,6,1,-0.353471,0.813758,0.552904
18018970,2019,12,50,7,5,1,0,0,0,9.0,...,1.424761,0.145570,-2.750688,-0.165918,13,6,1,-0.353471,0.813758,0.552904


In [None]:
# Example usage (assuming 'data', 'column_after_std', and 'column_after_encode_std' are already defined):
col_rnn_origin = ["id"] + column_after_std + column_after_encode_std + ["mro"]

data_rnn_origin = data[
    col_rnn_origin
].copy()  # Create a copy to avoid modifying the original DataFrame

data_rnn_origin = create_train_test_group(
    data_rnn_origin, sample_frac=sample_frac, test_size=test_size
)

Unnamed: 0,id,hard_braking_std,hard_acceleration_std,speeding_sum_std,day_mileage_std,engn_size_std,est_hh_incm_prmr_cd_std,purchaser_age_at_tm_of_purch_std,tavg_std,random_avg_traffic_std,gmqualty_model_encode_std,umf_xref_finc_gbl_trim_encode_std,input_indiv_gndr_prmr_cd_encode_std,mro,group
,,,,,,,,,,,,,,,
14576,wpvCrMKkwrvCucKbwrfCqsKrwr7CmsKWwqjCp8KpwprCmA==,-0.877139,-0.635340,-0.870160,-1.237176,1.040789,-2.695857,-2.093665,0.030617,-0.436927,-2.329233,-0.113422,0.552904,1.0,train
14577,wpvCrMKkwrvCucKbwrfCqsKrwr7CmsKWwqjCp8KpwprCmA==,-0.877139,-0.635340,-0.870160,-1.237899,1.040789,-2.695857,-2.093665,0.032903,-0.437392,-2.329233,-0.113422,0.552904,0.0,train
14578,wpvCrMKkwrvCucKbwrfCqsKrwr7CmsKWwqjCp8KpwprCmA==,1.399528,3.376614,2.247284,2.073757,1.040789,-2.695857,-2.093665,0.039447,-0.445585,-2.329233,-0.113422,0.552904,0.0,train
14579,wpvCrMKkwrvCucKbwrfCqsKrwr7CmsKWwqjCp8KpwprCmA==,1.049271,3.376614,3.380900,1.125505,1.040789,-2.695857,-2.093665,0.975624,-0.855509,-2.329233,-0.113422,0.552904,0.0,train
14580,wpvCrMKkwrvCucKbwrfCqsKrwr7CmsKWwqjCp8KpwprCmA==,1.574656,2.230342,1.680476,1.125866,1.040789,-2.695857,-2.093665,-0.461217,-0.704289,-2.329233,-0.113422,0.552904,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17967249,wrbCt8K1wrvDi8KswrjCtMONwr3CrsKVwqjCrcKlwp3Clg==,-0.877139,1.084069,-0.303352,-0.823400,-1.469546,0.394607,1.909815,0.213776,0.041796,-1.451116,-1.040603,-1.808634,0.0,train
17967250,wrbCt8K1wrvDi8KswrjCtMONwr3CrsKVwqjCrcKlwp3Clg==,-0.877139,-0.635340,-0.303352,-0.826652,-1.469546,0.394607,1.909815,0.216062,0.041331,-1.451116,-1.040603,-1.808634,0.0,train
17967251,wrbCt8K1wrvDi8KswrjCtMONwr3CrsKVwqjCrcKlwp3Clg==,-0.877139,-0.062204,-0.870160,-0.476478,-1.469546,0.394607,1.909815,0.218347,0.040866,-1.451116,-1.040603,-1.808634,0.0,train


In [8]:
rnn_features = [
    "hard_braking_std",
    "hard_acceleration_std",
    "speeding_sum_std",
    "day_mileage_std",
    "engn_size_std",
    "est_hh_incm_prmr_cd_std",
    "purchaser_age_at_tm_of_purch_std",
    "tavg_std",
    "random_avg_traffic_std",
    "gmqualty_model_encode_std",
    "umf_xref_finc_gbl_trim_encode_std",
    "input_indiv_gndr_prmr_cd_encode_std",
]

rnn_target = ["mro"]

In [9]:
train_data_set = mroRnnDataset(
    data_rnn_origin=data_rnn_origin,
    rnn_features=rnn_features,
    rnn_target=rnn_target,
    group="train",
    max_seq_length=max_seq_length,
)

test_data_set = mroRnnDataset(
    data_rnn_origin=data_rnn_origin,
    rnn_features=rnn_features,
    rnn_target=rnn_target,
    group="test",
    max_seq_length=max_seq_length,
)

In [10]:
  # Example batch size

train_dataloader = DataLoader(
    train_data_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers,
)

test_dataloader = DataLoader(
    test_data_set,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=num_workers,
)

In [11]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class RnnModel(nn.Module):

    def __init__(
        self,
        rnn_type: str,
        input_size,
        rnn_output_size,
        output_size,
    ):
        super(RnnModel, self).__init__()
        if rnn_type == "LSTM":
            self.rnn = nn.LSTM(
                input_size=input_size,
                hidden_size=rnn_output_size,
                num_layers=1,
                batch_first=True,
            )
        # self.fc = nn.Sequential(
        #     nn.Linear(rnn_output_size, output_size),
        #     nn.Sigmoid()
        # )
        self.fc = nn.Linear(rnn_output_size, output_size)

    def forward(self, x, length: int):
        # x  (batch_size, seq_len, input_size)
        # use pack_padded_sequence and pad_packed_sequence to deal with different length of x input
        packed_x = pack_padded_sequence(
            x, length, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.rnn(packed_x)
        rnn_out, _ = pad_packed_sequence(packed_out, batch_first=True)
        model_out = self.fc(rnn_out)  # (batch_size, seq_len, output_size)
        return model_out

In [12]:
input_feature_size = len(rnn_features)
output_size = len(rnn_target)

model = RnnModel(
    rnn_type="LSTM",
    input_size=input_feature_size,
    rnn_output_size=rnn_output_size,
    output_size=output_size,
).to(device)

print(model)

RnnModel(
  (rnn): LSTM(12, 16, batch_first=True)
  (fc): Linear(in_features=16, out_features=1, bias=True)
)


In [13]:
pos_weight_value = (data_rnn_origin["mro"] == 0).sum() / (data_rnn_origin["mro"] == 1).sum()
print(f"pos_weight: {pos_weight_value}")

pos_weight: 94.3175355450237


In [14]:
# criterion = nn.BCELoss()
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight_value])).to(device)
# criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
# for input, target, length in train_dataloader:
#     target: torch.Tensor
#     input: torch.Tensor
#     print(input.shape)
#     print(target.shape)
#     print(length)
#     break

In [16]:
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
)

In [17]:
for epoch in range(num_epochs):
    running_loss = 0.0
    print(f"Epoch {epoch}")
    model.train()
    all_train_mro_preds = []
    all_train_mro_targets = []
    for train_inputs, train_targets, train_lengths in train_dataloader:

        optimizer.zero_grad()
        train_inputs = train_inputs.to(device)
        train_targets = train_targets.to(device)
        # train_lengths = train_lengths.to(device)

        model_out = model(train_inputs, train_lengths)

        loss = criterion(model_out[:, -1, :], train_targets[:, -1, :])
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        mro_pred = torch.sigmoid(model_out[:, -1, :])
        mro_preds = (mro_pred > 0.5).int().cpu().numpy().flatten()
        mro_targets = train_targets[:, -1, :].cpu().numpy().flatten()

        all_train_mro_preds.extend(mro_preds)
        all_train_mro_targets.extend(mro_targets)

    average_loss = running_loss / len(train_dataloader)
    print(f"Average training loss: {average_loss}")

    train_f1 = f1_score(all_train_mro_targets, all_train_mro_preds)
    print(f"Training F1 Score: {train_f1}")
    train_accuracy = accuracy_score(all_train_mro_targets, all_train_mro_preds)
    print(f"Training Accuracy: {train_accuracy}")
    train_recall = recall_score(all_train_mro_targets, all_train_mro_preds)
    print(f"Training Recall: {train_recall}")
    train_precision = precision_score(all_train_mro_targets, all_train_mro_preds)
    print(f"Training Precision: {train_precision}")

    # Validation loop
    model.eval()
    val_loss = 0.0
    all_val_mro_preds = []
    all_val_mro_targets = []
    with torch.no_grad():
        for val_inputs, val_targets, test_lengths in test_dataloader:
            val_inputs = val_inputs.to(device)
            val_targets = val_targets.to(device)
            model_out = model(val_inputs, test_lengths)
            loss = criterion(model_out[:, -1, :], val_targets[:, -1, :])

            val_loss += loss.item()
            mro_pred = torch.sigmoid(model_out[:, -1, :])
            mro_preds = (mro_pred > 0.5).int().cpu().numpy().flatten()
            mro_targets = val_targets[:, -1, :].cpu().numpy().flatten()

            all_val_mro_preds.extend(mro_preds)
            all_val_mro_targets.extend(mro_targets)

        average_val_loss = val_loss / len(test_dataloader)
        print(f"Validation Loss: {average_loss}")
        val_f1 = f1_score(all_val_mro_targets, all_val_mro_preds)
        print(f"Validation F1 Score: {val_f1}")
        val_accuracy = accuracy_score(all_val_mro_targets, all_val_mro_preds)
        print(f"Validation Accuracy: {val_accuracy}")
        val_recall = recall_score(all_val_mro_targets, all_val_mro_preds)
        print(f"Validation Recall: {val_recall}")
        val_precision = precision_score(all_val_mro_targets, all_val_mro_preds)
        print(f"Validation Precision: {val_precision}")
    if average_val_loss < best_val_loss:
        best_val_loss = average_val_loss
        counter = 0
        # torchscript_model = torch.jit.script(model)
        # torchscript_model.save(model_save_folder + f"/{model_name}_torchscript.pt")
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping!")
            break

Epoch 0
Average training loss: 1.3476907912595772
Training F1 Score: 0.02917667441036343
Training Accuracy: 0.7000760271264788
Training Recall: 0.43131548311990686
Training Precision: 0.015099030075800799
Validation Loss: 1.3476907912595772
Validation F1 Score: 0.03076134324532171
Validation Accuracy: 0.5442656541915265
Validation Recall: 0.6629834254143646
Validation Precision: 0.015745965096444037
Epoch 1
Average training loss: 1.3152963416076
Training F1 Score: 0.03094824329429543
Training Accuracy: 0.6099686768238908
Training Recall: 0.5960419091967404
Training Precision: 0.015886560032269573
Validation Loss: 1.3152963416076
Validation F1 Score: 0.029935675408213756
Validation Accuracy: 0.5273910685228711
Validation Recall: 0.6685082872928176
Validation Precision: 0.015310641528533469
Epoch 2
Average training loss: 1.307691956007922
Training F1 Score: 0.03141251405020607
Training Accuracy: 0.6069154274245051
Training Recall: 0.610011641443539
Training Precision: 0.01612134077868537