In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from datetime import datetime, timedelta

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv('../weekly_filter_new_3.csv',index_col=0)

### preprocess for LSTM

In [4]:
#transfer yr/mth/week nbr to time
df.loc[(df['mth_nbr'] == 12) & (df['week_nbr'] == 1), 'week_nbr'] = 53

In [5]:
#add NA records for the missing weeks to the original dataset
df = df.sort_values(['id', 'yr_nbr', 'mth_nbr', 'week_nbr'])
grouped = df.groupby('id')
filled_dfs = []
for group_name, group_data in grouped:
    min_yr_nbr = group_data['yr_nbr'].min()
    max_yr_nbr = group_data['yr_nbr'].max()
    for yr_nbr in range(min_yr_nbr, max_yr_nbr + 1):
        min_week_nbr = group_data[group_data['yr_nbr'] == yr_nbr]['week_nbr'].min()
        max_week_nbr = group_data[group_data['yr_nbr'] == yr_nbr]['week_nbr'].max()
        expected_weeks = set(range(min_week_nbr, max_week_nbr + 1))
        actual_weeks = set(group_data[group_data['yr_nbr'] == yr_nbr]['week_nbr'])
        missing_weeks = expected_weeks - actual_weeks
        if missing_weeks:
            missing_data = [{'id': group_name, 'yr_nbr': yr_nbr, 'week_nbr': week_nbr} for week_nbr in missing_weeks]
            filled_dfs.append(pd.DataFrame(missing_data))
if filled_dfs:
    filled_df = pd.concat(filled_dfs)
    df = pd.concat([df, filled_df], ignore_index=True)

In [7]:
#get date
def get_date_from_year_week(year, week):
    first_day_of_year = datetime(year, 1, 1)
    days_to_add = timedelta(days=(week - 1) * 7)
    target_date = first_day_of_year + days_to_add
    return target_date
df['time'] = df.apply(lambda row: get_date_from_year_week(row['yr_nbr'], row['week_nbr']), axis=1)
df = df.drop(['yr_nbr','week_nbr','mth_nbr'], axis=1)

In [8]:
#fill NA records
fill_cols_zero = ['mro_new', 'hard_braking', 'hard_acceleration', 'speeding_sum', 'day_mileage']
df[fill_cols_zero] = df[fill_cols_zero].fillna(0)
fill_cols_ffill = ['est_hh_incm_prmr_cd', 'purchaser_age_at_tm_of_purch',
                   'input_indiv_gndr_prmr_cd', 'gmqualty_model', 'umf_xref_finc_gbl_trim',
                   'engn_size', 'purchase_time', 'tavg', 'random_avg_traffic']
df[fill_cols_ffill] = df[fill_cols_ffill].fillna(method='ffill')
df = df.set_index('time')

  df[fill_cols_ffill] = df[fill_cols_ffill].fillna(method='ffill')


In [None]:
### Encode factor variables
label_encoder = LabelEncoder()
result_df = df[['id', 'mro_new', 'est_hh_incm_prmr_cd', 'purchaser_age_at_tm_of_purch', 'engn_size', 'tavg', 'random_avg_traffic','hard_braking', 'hard_acceleration', 'speeding_sum', 'day_mileage']]
result_df['input_indiv_gndr_prmr_cd'] = label_encoder.fit_transform(df['input_indiv_gndr_prmr_cd'])
result_df['gmqualty_model'] = label_encoder.fit_transform(df['gmqualty_model'])
result_df['umf_xref_finc_gbl_trim'] = label_encoder.fit_transform(df['umf_xref_finc_gbl_trim'])
result_df['purchase_time'] = label_encoder.fit_transform(df['purchase_time'])

In [None]:
result_df[f'mro_lag'] = result_df.groupby('id')['mro_new'].transform(lambda x: x.shift(1))

In [19]:
df = result_df.dropna()

In [25]:
#driving behavior + mro_indicator
feature_cols = ['est_hh_incm_prmr_cd', 'input_indiv_gndr_prmr_cd',
       'gmqualty_model', 'umf_xref_finc_gbl_trim', 'purchase_time', 'mro_lag','purchaser_age_at_tm_of_purch', 'engn_size', 'tavg',
       'random_avg_traffic', 'hard_braking', 'hard_acceleration', 'speeding_sum', 'day_mileage']
df = df[['id','mro_new'] + feature_cols]
target_col = 'mro_new'

In [None]:
#no driving behavior + mro_indicator
feature_cols = ['est_hh_incm_prmr_cd', 'input_indiv_gndr_prmr_cd',
       'gmqualty_model', 'umf_xref_finc_gbl_trim', 'purchase_time',
       'mro_lag', 'purchaser_age_at_tm_of_purch', 'engn_size', 'tavg',
       'random_avg_traffic'
]
df = df[['id','mro_new'] + feature_cols]
target_col = 'mro_new'

In [None]:
#no driving behavior + no mro_indicator
feature_cols = ['est_hh_incm_prmr_cd', 'input_indiv_gndr_prmr_cd',
       'gmqualty_model', 'umf_xref_finc_gbl_trim', 'purchase_time',
       'purchaser_age_at_tm_of_purch', 'engn_size', 'tavg',
       'random_avg_traffic'
]
df = df[['id','mro_new'] + feature_cols]
target_col = 'mro_new'

In [26]:
data = []
for id, group in df.groupby('id'):
    group_features = group[feature_cols].values
    group_target = group[target_col].values
    data.append((group_features, group_target))
print(f"Total sequences: {len(data)}")
print(f"First sequence features shape: {data[0][0].shape}")
print(f"First sequence target shape: {data[0][1].shape}")

Total sequences: 58839
First sequence features shape: (101, 14)
First sequence target shape: (101,)


In [None]:
features = [torch.tensor(d[0], dtype=torch.float32) for d in data]
targets = [torch.tensor(d[1], dtype=torch.float32) for d in data]

features_padded = pad_sequence(features, batch_first=True)
targets_padded = pad_sequence(targets, batch_first=True)

print("Features shape:", features_padded.shape)
print("Targets shape:", targets_padded.shape)

Features shape: torch.Size([58839, 103, 14])
Targets shape: torch.Size([58839, 103])


In [None]:
def create_windows(features, targets, time_window, prediction_length):
    X = []
    y = []
    
    for i in range(len(features)):
        feature_len = features[i].shape[0]
        
        # Check if feature length is sufficient
        if feature_len < time_window + prediction_length:
            raise ValueError("Feature length is less than the sum of time_window and prediction_length.")
        
        for j in range(feature_len - time_window - prediction_length + 1):
            X.append(torch.tensor(features[i][j:j + time_window], dtype=torch.float32))
            y.append(torch.tensor(targets[i][j + time_window:j + time_window + prediction_length], dtype=torch.float32))   
    return torch.stack(X), torch.stack(y)

time_window = 8
prediction_length = 1
X, y = create_windows(features_padded, targets_padded, time_window, prediction_length)

In [39]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [None]:
input_size = 14
hidden_size = 128
num_layers = 4
output_size = 1
learning_rate = 0.001

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

positive_weight = (train_y == 0).sum().float() / (train_y == 1).sum().float()
criterion = nn.BCEWithLogitsLoss(pos_weight=positive_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:
#Calculate Permutation Feature Importance. Shuffle a feature column -> calculate test metrics
'''
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
test_features_np = test_X.numpy() 
column_index = 11
column_to_shuffle = test_features_np[:, :, column_index]
shuffled_indices = np.random.permutation(len(column_to_shuffle))
shuffled_column = column_to_shuffle[shuffled_indices]
test_features_np[:, :,  column_index] = shuffled_column
test_features_tensor = torch.tensor(test_features_np, dtype=torch.float32)
test_targets_tensor = test_y
test_dataset_new = TensorDataset(test_features_tensor, test_targets_tensor)
test_loader_new = DataLoader(test_dataset_new, batch_size=64, shuffle=False)
'''

In [42]:
model.eval()
all_targets = []
all_predictions = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs).squeeze()
        all_targets.extend(targets.cpu().numpy().flatten().tolist())
        all_predictions.extend(torch.sigmoid(outputs).cpu().numpy().flatten().tolist())

all_targets = np.array(all_targets)

all_predictions = np.array(all_predictions)

In [None]:
threshold = 0.685
all_predictions_1 = (all_predictions> threshold).astype(float)

# 计算 precision, recall, f1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(all_targets, all_predictions_1, average='binary')

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1_score}')