In [1]:
import pandas as pd
import numpy as np
import h5py

In [None]:
#Loading data
with h5py.File('../../data/3d_array/train_data_3d_h5.h5', 'r') as f:
    train_X = f['train_data_3d'][:]
with h5py.File('../../data/3d_array/val_data_3d_h5.h5', 'r') as f:
    val_X = f['val_data_3d'][:]
# with h5py.File('../../data/3d_array/test_data_3d_h5.h5', 'r') as f:
#     test_X = f['test_data_3d'][:]

train_y = pd.read_parquet('../../data/3d_array/train_targets.parquet')
val_y = pd.read_parquet('../../data/3d_array/val_targets.parquet')

In [7]:
train_y['end_of_month'].value_counts()

end_of_month
2017-03-31    270223
2017-04-30    270223
2017-05-31    270223
2017-06-30    270223
2017-07-31    270223
2017-08-31    270223
2017-09-30    270223
2017-10-31    270223
2017-11-30    270223
2017-12-31    270223
2018-01-31    270223
2018-02-28    270223
Name: count, dtype: int64

In [8]:
train_y = train_y[train_y['end_of_month'].isin(['2018-02-28'])]
val_y = val_y[val_y['end_of_month'].isin(['2018-02-28'])]

  train_y = train_y[train_y['end_of_month'].isin(['2018-02-28'])]
  val_y = val_y[val_y['end_of_month'].isin(['2018-02-28'])]


In [25]:
train_y.sort_values(by=['customer_ID'])

Unnamed: 0,customer_ID,end_of_month,target
11,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2018-02-28,0
23,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2018-02-28,0
35,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,2018-02-28,0
47,000084e5023181993c2e1b665ac88dbb1ce9ef621ec537...,2018-02-28,0
59,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2018-02-28,0
...,...,...,...
3242627,ffff39cc22a375d07369980d02d617883dd28ad81a6aa3...,2018-02-28,0
3242639,ffff41c8a52833b56430603969b9ca48d208e7c192c6a4...,2018-02-28,0
3242651,ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fd...,2018-02-28,0
3242663,ffff9984b999fccb2b6127635ed0736dda94e544e67e02...,2018-02-28,0


In [9]:
train_X.shape, train_y.shape

((270223, 12, 107), (270223, 3))

In [10]:
val_X.shape, val_y.shape

((115811, 12, 107), (115811, 3))

In [None]:
import torch

import torch.nn as nn

class SmallRNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, fc_size, output_size=1):
        super(SmallRNNModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, fc_size)
        self.output = nn.Linear(fc_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # LSTM layer
        lstm_out, _ = self.lstm(x)
        # Take the output of the last time step
        lstm_last_out = lstm_out[:, -1, :]
        # Fully connected layer
        fc_out = self.fc(lstm_last_out)
        # Final output layer
        output = self.output(fc_out)
        # Apply sigmoid for binary classification
        return self.sigmoid(output)

# Example usage
input_size = train_X.shape[2]  # Number of features
hidden_size = 64  # Hidden state size for LSTM
fc_size = 32  # Size of the fully connected layer

model = SmallRNNModel(input_size=input_size, hidden_size=hidden_size, fc_size=fc_size)

SmallRNNModel(
  (lstm): LSTM(107, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [22]:
batch_size = 10000
from torchinfo import summary
summary(model, input_size=(batch_size, train_X.shape[1], train_X.shape[2]), device='cpu',
        col_names=["input_size", "kernel_size","output_size", "num_params"])

Layer (type:depth-idx)                   Input Shape               Kernel Shape              Output Shape              Param #
SmallRNNModel                            [10000, 12, 107]          --                        [10000, 1]                --
├─LSTM: 1-1                              [10000, 12, 107]          --                        [10000, 12, 64]           44,288
├─Linear: 1-2                            [10000, 64]               --                        [10000, 32]               2,080
├─Linear: 1-3                            [10000, 32]               --                        [10000, 1]                33
├─Sigmoid: 1-4                           [10000, 1]                --                        [10000, 1]                --
Total params: 46,401
Trainable params: 46,401
Non-trainable params: 0
Total mult-adds (G): 5.34
Input size (MB): 51.36
Forward/backward pass size (MB): 64.08
Params size (MB): 0.19
Estimated Total Size (MB): 115.63

In [23]:
from torch.utils.data import Dataset, DataLoader
class TimeSeriesDataset(Dataset):
    def __init__(self, data, targets):
        """
        Args:
            data: numpy array of shape (num_ids, time_steps, features)
            targets: numpy array of shape (num_ids,)
        """
        self.data = torch.FloatTensor(data)
        self.targets = torch.FloatTensor(targets).unsqueeze(1)  # Add dimension for output
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [26]:
train_dataset = TimeSeriesDataset(train_X, train_y['target'].values)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [27]:
val_dataset = TimeSeriesDataset(val_X, val_y['target'].values)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [34]:
train_dataset.__getitem__(0)[0].shape, train_dataset.__getitem__(0)[1]

(torch.Size([12, 107]), tensor([0.]))

In [33]:
val_dataset.__getitem__(0)[0].shape, val_dataset.__getitem__(0)[1]

(torch.Size([12, 107]), tensor([0.]))

In [35]:
from sklearn.metrics import roc_auc_score
import time
import copy

import torch.optim as optim

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training parameters
num_epochs = 20
patience = 3  # Number of epochs to wait for improvement before early stopping

# Initialize variables for early stopping
best_val_loss = float('inf')
best_val_auc = 0.0
best_model_wts = copy.deepcopy(model.parameters)
no_improve_epochs = 0

# For tracking metrics
train_losses = []
val_losses = []
val_aucs = []

# Move model to device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Training on {device}")
start_time = time.time()

# Training loop
for epoch in range(num_epochs):
    # Training phase
    model.train()
    running_loss = 0.0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    epoch_train_loss = running_loss / len(train_dataset)
    train_losses.append(epoch_train_loss)
    
    # Validation phase
    model.eval()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            running_loss += loss.item() * inputs.size(0)
            
            # Collect predictions and labels for AUC calculation
            all_preds.extend(outputs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    epoch_val_loss = running_loss / len(val_dataset)
    val_losses.append(epoch_val_loss)
    
    all_preds = [p[0] for p in all_preds]  # Flatten predictions
    all_labels = [l[0] for l in all_labels]  # Flatten labels
    epoch_val_auc = roc_auc_score(all_labels, all_preds)
    val_aucs.append(epoch_val_auc)
    
    # Print epoch statistics
    print(f"Epoch {epoch+1}/{num_epochs} - "
          f"Train Loss: {epoch_train_loss:.4f}, "
          f"Val Loss: {epoch_val_loss:.4f}, "
          f"Val AUC: {epoch_val_auc:.4f}")
    
    # Check if this is the best model
    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        best_val_auc = epoch_val_auc
        best_model_wts = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
    
    # Early stopping
    if no_improve_epochs >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs")
        break

# Training complete
time_elapsed = time.time() - start_time
print(f"Training completed in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s")
print(f"Best val loss: {best_val_loss:.4f}, Best val AUC: {best_val_auc:.4f}")

# Load best model weights
model.load_state_dict(best_model_wts)

Training on cuda:0
Epoch 1/20 - Train Loss: 0.5447, Val Loss: 0.3329, Val AUC: 0.9118
Epoch 2/20 - Train Loss: 0.3006, Val Loss: 0.2746, Val AUC: 0.9343
Epoch 3/20 - Train Loss: 0.2601, Val Loss: 0.2518, Val AUC: 0.9436
Epoch 4/20 - Train Loss: 0.2458, Val Loss: 0.2443, Val AUC: 0.9470
Epoch 5/20 - Train Loss: 0.2412, Val Loss: 0.2410, Val AUC: 0.9485
Epoch 6/20 - Train Loss: 0.2379, Val Loss: 0.2388, Val AUC: 0.9495
Epoch 7/20 - Train Loss: 0.2355, Val Loss: 0.2365, Val AUC: 0.9503
Epoch 8/20 - Train Loss: 0.2337, Val Loss: 0.2354, Val AUC: 0.9509
Epoch 9/20 - Train Loss: 0.2325, Val Loss: 0.2365, Val AUC: 0.9513
Epoch 10/20 - Train Loss: 0.2321, Val Loss: 0.2341, Val AUC: 0.9514
Epoch 11/20 - Train Loss: 0.2306, Val Loss: 0.2334, Val AUC: 0.9518
Epoch 12/20 - Train Loss: 0.2302, Val Loss: 0.2360, Val AUC: 0.9520
Epoch 13/20 - Train Loss: 0.2300, Val Loss: 0.2330, Val AUC: 0.9521
Epoch 14/20 - Train Loss: 0.2288, Val Loss: 0.2327, Val AUC: 0.9522
Epoch 15/20 - Train Loss: 0.2286, Val 

<All keys matched successfully>

In [None]:
import os

# Save the model weights

# Create directory if it doesn't exist
save_dir = '../../models/deep_learning'
os.makedirs(save_dir, exist_ok=True)

# Save model state dictionary
model_path = os.path.join(save_dir, 'experiment_1.pth')
torch.save(model.state_dict(), model_path)

# Save additional information for later reference
checkpoint_path = os.path.join(save_dir, 'experiment_1.pth')
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'val_loss': best_val_loss,
    'val_auc': best_val_auc,
    'input_size': input_size,
    'hidden_size': hidden_size,
    'fc_size': fc_size,
    'train_losses': train_losses,
    'val_losses': val_losses,
    'val_aucs': val_aucs
}
torch.save(checkpoint, checkpoint_path)

print(f"Model saved to {model_path}")
print(f"Checkpoint saved to {checkpoint_path}")

Model saved to ../../models/deep_learning\experiment_1.pth
Checkpoint saved to ../../models/deep_learning\experiment_1.pth
