In [1]:
import pandas as pd
import numpy as np
import h5py
import torch
import torch.nn as nn
import random
import sys
import os

#trying to ensure reproducibility
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [2]:
# Setting path to load util functions
from pathlib import Path
parent_dir = Path.cwd().parents[1]
sys.path.append(os.path.abspath(parent_dir))

In [3]:
experiment_num = 25

In [4]:
#Loading data
with h5py.File('../../data/3d_array/mod_train_data_3d_h5.h5', 'r') as f:
    train_X = f['train_data_3d'][:]
with h5py.File('../../data/3d_array/mod_val_data_3d_h5.h5', 'r') as f:
    val_X = f['val_data_3d'][:]
with h5py.File('../../data/3d_array/mod_test_data_3d_h5.h5', 'r') as f:
    test_X = f['test_data_3d'][:]

train_y = pd.read_parquet('../../data/3d_array/train_targets.parquet')
val_y = pd.read_parquet('../../data/3d_array/val_targets.parquet')
test_y = pd.read_parquet('../../data/3d_array/test_targets.parquet')

In [5]:
train_X = np.nan_to_num(train_X, nan=0.0)
val_X = np.nan_to_num(val_X, nan=0.0)
test_X = np.nan_to_num(test_X, nan=0.0)

In [6]:
train_y['end_of_month'].value_counts()

end_of_month
2018-03-31    289115
Name: count, dtype: int64

In [7]:
train_y = train_y[train_y['end_of_month'].isin(['2018-03-31'])]
val_y = val_y[val_y['end_of_month'].isin(['2018-03-31'])]
test_y = test_y[test_y['end_of_month'].isin(['2018-03-31'])]

  train_y = train_y[train_y['end_of_month'].isin(['2018-03-31'])]
  val_y = val_y[val_y['end_of_month'].isin(['2018-03-31'])]
  test_y = test_y[test_y['end_of_month'].isin(['2018-03-31'])]


In [8]:
train_y.sort_values(by=['customer_ID'])

Unnamed: 0,customer_ID,end_of_month,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2018-03-31,0
1,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2018-03-31,0
2,000084e5023181993c2e1b665ac88dbb1ce9ef621ec537...,2018-03-31,0
3,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2018-03-31,0
4,0000f99513770170a1aba690daeeb8a96da4a39f11fc27...,2018-03-31,1
...,...,...,...
289110,fffe3ec7cdbc1caac845c884b389ed347bfc1da9d09731...,2018-03-31,1
289111,fffef3305f19a11fb6c15f4ebe9be1bd664540e57c0a6a...,2018-03-31,0
289112,ffff39cc22a375d07369980d02d617883dd28ad81a6aa3...,2018-03-31,0
289113,ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fd...,2018-03-31,0


In [9]:
train_X.shape, train_y.shape

((289115, 13, 86), (289115, 3))

In [10]:
val_X.shape, val_y.shape

((32124, 13, 86), (32124, 3))

In [11]:
class ParallelConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_sizes=[3, 5, 7], dropout_rate=0.2):
        """
        Parallel Convolutional Block that processes input through multiple convolutional paths
        with different kernel sizes and concatenates the results.
        
        Args:
            in_channels: Number of input channels
            out_channels: Number of output channels
            kernel_sizes: List of kernel sizes for parallel convolutions
            dropout_rate: Dropout probability
        """
        super(ParallelConvBlock, self).__init__()
        
        self.n_paths = len(kernel_sizes)
        # Calculate channels per path
        path_channels = out_channels // self.n_paths
        
        # Create parallel convolutional paths
        self.paths = nn.ModuleList()
        for k_size in kernel_sizes:
            padding = k_size // 2  # Same padding to maintain sequence length
            path = nn.Sequential(
                nn.Conv1d(in_channels, path_channels, kernel_size=k_size, padding=padding),
                nn.BatchNorm1d(path_channels),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            )
            self.paths.append(path)
            
        # Projection layer to ensure output has exactly out_channels
        self.projection = nn.Sequential(
            nn.Conv1d(path_channels * self.n_paths, out_channels, kernel_size=1),
            nn.BatchNorm1d(out_channels),
            nn.ReLU()
        )
    
    def forward(self, x):
        # Process input through parallel paths
        outputs = [path(x) for path in self.paths]
        
        # Concatenate along channel dimension
        # print(f"Outputs shape before concatenation: {[out.shape for out in outputs]}")
        x = torch.cat(outputs, dim=1)
        
        # Apply projection to get final output
        x = self.projection(x)
        
        return x

In [12]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, pool=True, dropout=0.3):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding)
        self.bn = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2) if pool else None
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        if self.pool:
            x = self.pool(x)
        x = self.dropout(x)
        return x

In [13]:
class ConvLSTMModel(nn.Module):
    def __init__(self, input_size, lstm_hidden_size=64, num_lstm_layers=1, output_size=1):
        """
        Args:
            input_size (int): Number of features in the input sequence (feature dimension).
            lstm_hidden_size (int): Hidden size for the LSTM layer.
            num_lstm_layers (int): Number of layers for the LSTM.
            output_size (int): Size of the final output (e.g., 1 for binary classification).
        """
        super(ConvLSTMModel, self).__init__()

        self.input_size = input_size
        self.lstm_hidden_size = lstm_hidden_size

        # --- CNN Path ---
        # First convolutional block (Parallel)
        # Input channels = input_size (features)
        cnn_out_channels_1 = 100
        self.parallel_conv = ParallelConvBlock(input_size, cnn_out_channels_1, kernel_sizes=[3, 5, 7, 9], dropout_rate=0.2)

        # Second convolutional block
        cnn_out_channels_2 = 64
        self.block2 = ConvBlock(cnn_out_channels_1, cnn_out_channels_2, dropout=0.2)

        # Third convolutional block
        self.cnn_final_channels = 32
        self.block3 = ConvBlock(cnn_out_channels_2, self.cnn_final_channels, pool=False, dropout=0.2) # No pooling in the last block

        # Global pooling for CNN path
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)

        # --- LSTM Path ---
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=lstm_hidden_size,
                            num_layers=num_lstm_layers,
                            batch_first=True, # Crucial: input shape (batch, seq_len, features)
                            bidirectional=False) # Set to True if needed, adjust feature concatenation below

        # --- Combined Path ---
        # Calculate the combined feature size after CNN pooling and LSTM
        combined_features = self.cnn_final_channels + lstm_hidden_size # Add *2 if bidirectional LSTM

        # Fully connected layers
        self.fc1 = nn.Linear(combined_features, 16) # Input size adjusted
        self.relu_fc1 = nn.ReLU()
        self.dropout_fc1 = nn.Dropout(0.3) # Added dropout for FC layer
        self.fc2 = nn.Linear(16, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Input shape: batch_size x time_steps x features

        # --- CNN Path ---
        # For Conv1D: convert to batch_size x features(channels) x time_steps
        x_cnn = x.permute(0, 2, 1)
        # Pass through convolutional blocks
        cnn_out = self.parallel_conv(x_cnn)
        cnn_out = self.block2(cnn_out)
        cnn_out = self.block3(cnn_out)
        # Global pooling to get fixed-size representation for CNN path
        cnn_pooled = self.global_avg_pool(cnn_out)
        cnn_features = cnn_pooled.view(cnn_pooled.size(0), -1) # Flatten: batch_size x cnn_final_channels

        # --- LSTM Path ---
        # Input shape expected by LSTM (batch_first=True): batch_size x time_steps x features
        # No permutation needed for LSTM path if input is already in this format
        lstm_out, (h_n, c_n) = self.lstm(x)
        # We typically use the last hidden state
        # h_n shape: (num_layers * num_directions, batch_size, lstm_hidden_size)
        # Get the hidden state of the last layer
        lstm_features = h_n[-1] # Shape: batch_size x lstm_hidden_size (if not bidirectional)
        # If bidirectional: h_n shape is (num_layers*2, batch, hidden_size)
        # You might want to concatenate the last forward and backward hidden states:
        # lstm_features = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1) # If bidirectional

        # --- Concatenate Features ---
        combined_features = torch.cat((cnn_features, lstm_features), dim=1)

        # --- Fully Connected Layers ---
        x = self.fc1(combined_features)
        x = self.relu_fc1(x)
        x = self.dropout_fc1(x)
        x = self.fc2(x)

        # Output probability
        return self.sigmoid(x)

In [14]:
# Initialize the ConvModel
input_size = train_X.shape[2]  # Number of features
output_size = 1  # Binary classification
lstm_hidden_size = 128
num_lstm_layers = 1

# Create model instance
model = ConvLSTMModel(input_size=input_size, output_size=output_size, lstm_hidden_size=lstm_hidden_size, num_lstm_layers=num_lstm_layers)
print(f"Model initialized with input_size={input_size}, output_size={output_size}")

Model initialized with input_size=86, output_size=1


In [15]:
# Define the model path
model_path = f'../../models/deep_learning/experiment_{experiment_num}.pth'

# Load the model parameters
try:
    # Load the saved dictionary
    checkpoint = torch.load(model_path)
    
    # Extract model parameters from the 'model_state_dict' key
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Model parameters loaded successfully from {model_path}")
except FileNotFoundError:
    print(f"Model file not found at {model_path}")
    print("Please specify the correct path to the model parameters")
except KeyError:
    print(f"'model_state_dict' key not found in the checkpoint file")
    print("The file may have been saved with a different structure")
except Exception as e:
    print(f"Error loading model parameters: {str(e)}")

Model parameters loaded successfully from ../../models/deep_learning/experiment_25.pth


  checkpoint = torch.load(model_path)


In [16]:
batch_size = 2048
from torchinfo import summary
summary(model, input_size=(batch_size, train_X.shape[1], train_X.shape[2]), device='cpu',
        col_names=["input_size", "kernel_size","output_size", "num_params"], depth = 1)

Layer (type:depth-idx)                   Input Shape               Kernel Shape              Output Shape              Param #
ConvLSTMModel                            [2048, 13, 86]            --                        [2048, 1]                 --
├─ParallelConvBlock: 1-1                 [2048, 86, 13]            --                        [2048, 100, 13]           62,200
├─ConvBlock: 1-2                         [2048, 100, 13]           --                        [2048, 64, 6]             19,392
├─ConvBlock: 1-3                         [2048, 64, 6]             --                        [2048, 32, 6]             6,240
├─AdaptiveAvgPool1d: 1-4                 [2048, 32, 6]             --                        [2048, 32, 1]             --
├─LSTM: 1-5                              [2048, 13, 86]            --                        [2048, 13, 128]           110,592
├─Linear: 1-6                            [2048, 160]               --                        [2048, 16]                2,576


In [17]:
from torch.utils.data import Dataset, DataLoader
class TimeSeriesDataset(Dataset):
    def __init__(self, data, targets):
        """
        Args:
            data: numpy array of shape (num_ids, time_steps, features)
            targets: numpy array of shape (num_ids,)
        """
        self.data = torch.FloatTensor(data)
        self.targets = torch.FloatTensor(targets).unsqueeze(1)  # Add dimension for output
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [18]:
train_dataset = TimeSeriesDataset(train_X, train_y['target'].values)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

In [19]:
val_dataset = TimeSeriesDataset(val_X, val_y['target'].values)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [20]:
test_dataset = TimeSeriesDataset(test_X, test_y['target'].values)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [21]:
train_dataset.__getitem__(0)[0].shape, train_dataset.__getitem__(0)[1]

(torch.Size([13, 86]), tensor([0.]))

In [22]:
val_dataset.__getitem__(0)[0].shape, val_dataset.__getitem__(0)[1]

(torch.Size([13, 86]), tensor([1.]))

In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [24]:
# from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix

# # Set model to evaluation mode
# model.eval()

# # Check if CUDA is available and move model to the appropriate device
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# model = model.to(device)

# # Lists to store predictions and true values
# all_preds = []
# all_labels = []

# # Perform inference without gradient calculation
# with torch.no_grad():
#     for inputs, labels in test_loader:
#         # Move inputs and labels to the appropriate device
#         inputs, labels = inputs.to(device), labels.to(device)
        
#         # Forward pass
#         outputs = model(inputs)
        
#         # Store predictions and labels
#         all_preds.append(outputs.cpu().numpy())
#         all_labels.append(labels.cpu().numpy())

# # Concatenate all batches
# all_preds = np.concatenate(all_preds)
# all_labels = np.concatenate(all_labels)

# # Convert predictions to binary (0 or 1) using threshold of 0.5
# pred_classes = (all_preds > 0.5).astype(int)
# true_classes = all_labels.astype(int)

# # Generate classification report

# # Print classification report
# print("Classification Report:")
# print(classification_report(true_classes, pred_classes, digits = 4))

# # Calculate and print accuracy
# accuracy = accuracy_score(true_classes, pred_classes)
# print(f"Accuracy: {accuracy:.4f}")

# # Calculate and print ROC-AUC score
# auc = roc_auc_score(true_classes, all_preds)
# print(f"ROC-AUC Score: {auc:.4f}")

# # Print confusion matrix
# print("\nConfusion Matrix:")
# print(confusion_matrix(true_classes, pred_classes))

In [25]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
from utils.threshold import find_threshold_binary_search

def evaluate_model(model, data_loader, device):
    """
    Evaluate the model on the given data loader and print evaluation metrics.

    Args:
        model: Trained PyTorch model to evaluate.
        data_loader: DataLoader containing the dataset to evaluate on.
        device: Device to run the evaluation on ('cuda' or 'cpu').

    Returns:
        dict: A dictionary containing evaluation metrics (classification report, accuracy, ROC-AUC score, confusion matrix).
    """
    # Set model to evaluation mode
    model.eval()

    # Move model to the appropriate device
    model = model.to(device)

    # Lists to store predictions and true values
    all_preds = []
    all_labels = []

    # Perform inference without gradient calculation
    with torch.no_grad():
        for inputs, labels in data_loader:
            # Move inputs and labels to the appropriate device
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # Store predictions and labels
            all_preds.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    # Concatenate all batches
    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    # Convert predictions to binary (0 or 1) using threshold of 0.5
    threshold, _ = find_threshold_binary_search(all_labels, all_preds, target_recall=0.98, target_class=0)
    pred_classes = (all_preds > threshold).astype(int)
    true_classes = all_labels.astype(int)

    # Generate classification report
    classification_rep = classification_report(true_classes, pred_classes, digits=4)

    # Calculate accuracy
    accuracy = accuracy_score(true_classes, pred_classes)

    # Calculate ROC-AUC score
    auc = roc_auc_score(true_classes, all_preds)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(true_classes, pred_classes)

    # Print evaluation metrics
    print("Classification Report:")
    print(classification_rep)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC Score: {auc:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Return metrics as a dictionary
    return {
        "classification_report": classification_rep,
        "accuracy": accuracy,
        "roc_auc_score": auc,
        "confusion_matrix": conf_matrix
    }

# Example usage:
metrics = evaluate_model(model, test_loader, device)

Target Recall: >= 0.9800 for Class 0
Threshold found by Binary Search: 0.7502944
Achieved Recall at Threshold: 0.9800
Classification Report:
              precision    recall  f1-score   support

           0     0.8597    0.9800    0.9159    102026
           1     0.9046    0.5424    0.6782     35648

    accuracy                         0.8667    137674
   macro avg     0.8822    0.7612    0.7971    137674
weighted avg     0.8713    0.8667    0.8544    137674

Accuracy: 0.8667
ROC-AUC Score: 0.9530

Confusion Matrix:
[[99986  2040]
 [16312 19336]]


In [26]:
val_metrics = evaluate_model(model, val_loader, device)

Target Recall: >= 0.9800 for Class 0
Threshold found by Binary Search: 0.7528384
Achieved Recall at Threshold: 0.9800
Classification Report:
              precision    recall  f1-score   support

           0     0.8614    0.9800    0.9169     23806
           1     0.9056    0.5488    0.6834      8318

    accuracy                         0.8684     32124
   macro avg     0.8835    0.7644    0.8002     32124
weighted avg     0.8729    0.8684    0.8564     32124

Accuracy: 0.8684
ROC-AUC Score: 0.9545

Confusion Matrix:
[[23330   476]
 [ 3753  4565]]


In [27]:
train_metrics = evaluate_model(model, train_loader, device)

Target Recall: >= 0.9800 for Class 0
Threshold found by Binary Search: 0.7270852
Achieved Recall at Threshold: 0.9800
Classification Report:
              precision    recall  f1-score   support

           0     0.8705    0.9800    0.9220    214253
           1     0.9106    0.5827    0.7106     74862

    accuracy                         0.8771    289115
   macro avg     0.8905    0.7813    0.8163    289115
weighted avg     0.8809    0.8771    0.8673    289115

Accuracy: 0.8771
ROC-AUC Score: 0.9588

Confusion Matrix:
[[209968   4285]
 [ 31240  43622]]


In [28]:
# from utils.eval_helpers import evaluate_model_for_recall
# evaluate_model_for_recall(target_class=0, desired_recall=0.98, y_true=true_classes, y_pred_proba=all_preds)

In [29]:
for name, module in model.named_modules():
    # 'name' is the hierarchical string name
    # 'module' is the actual layer/module object
    print(f"Name: '{name}' \t Module Type: {module.__class__.__name__}")

Name: '' 	 Module Type: ConvLSTMModel
Name: 'parallel_conv' 	 Module Type: ParallelConvBlock
Name: 'parallel_conv.paths' 	 Module Type: ModuleList
Name: 'parallel_conv.paths.0' 	 Module Type: Sequential
Name: 'parallel_conv.paths.0.0' 	 Module Type: Conv1d
Name: 'parallel_conv.paths.0.1' 	 Module Type: BatchNorm1d
Name: 'parallel_conv.paths.0.2' 	 Module Type: ReLU
Name: 'parallel_conv.paths.0.3' 	 Module Type: Dropout
Name: 'parallel_conv.paths.1' 	 Module Type: Sequential
Name: 'parallel_conv.paths.1.0' 	 Module Type: Conv1d
Name: 'parallel_conv.paths.1.1' 	 Module Type: BatchNorm1d
Name: 'parallel_conv.paths.1.2' 	 Module Type: ReLU
Name: 'parallel_conv.paths.1.3' 	 Module Type: Dropout
Name: 'parallel_conv.paths.2' 	 Module Type: Sequential
Name: 'parallel_conv.paths.2.0' 	 Module Type: Conv1d
Name: 'parallel_conv.paths.2.1' 	 Module Type: BatchNorm1d
Name: 'parallel_conv.paths.2.2' 	 Module Type: ReLU
Name: 'parallel_conv.paths.2.3' 	 Module Type: Dropout
Name: 'parallel_conv.path

In [40]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from torch.utils.data import DataLoader # For type hinting

def extract_features_with_hook_no_labels(model: nn.Module, target_layer_name: str, data_loader: DataLoader, device: torch.device) -> pd.DataFrame:
    """
    Runs inference and extracts features from a specific layer using hooks.
    Assumes no labels are needed or available in the output.
    """
    model.eval()
    captured_features = []

    def hook_fn(module, input, output):
        captured_features.append(output.detach().cpu().numpy())

    hook_handle = None
    target_layer = None
    try:
        # Use model.get_submodule(target_name) for robustness if needed
        # For simplicity, assuming direct attribute access works:
        module_dict = dict(model.named_modules())
        if target_layer_name in module_dict:
             target_layer = module_dict[target_layer_name]
        else:
             raise KeyError(f"Layer '{target_layer_name}' not found.")

        hook_handle = target_layer.register_forward_hook(hook_fn)
    except KeyError as e:
        print(f"Error: {e}")
        print("Available named modules:")
        for name, _ in model.named_modules():
             print(f"- {name}")
        return pd.DataFrame()
    except Exception as e:
        print(f"An error occurred during hook registration: {e}")
        if hook_handle: # Clean up if hook was partially registered
             hook_handle.remove()
        return pd.DataFrame()

    print(f"Extracting features from '{target_layer_name}' using device: {device}")
    with torch.no_grad():
        pbar = tqdm(data_loader, desc="Extracting Features")
        for batch in pbar:
            # Handle variable dataloader output (with or without labels)
            if isinstance(batch, (list, tuple)):
                inputs = batch[0].to(device) # Assume inputs are the first element
            else:
                inputs = batch.to(device) # Assume loader yields only inputs

            # Run forward pass to trigger the hook
            _ = model(inputs)

    if hook_handle:
        hook_handle.remove()

    if not captured_features:
        print("Warning: No features were captured.")
        return pd.DataFrame()

    all_features_np = np.concatenate(captured_features, axis=0)

    num_features = all_features_np.shape[1]
    feature_columns = [f'DL_{i}' for i in range(num_features)]
    features_df = pd.DataFrame(all_features_np, columns=feature_columns)

    print(f"Feature extraction complete. DataFrame shape: {features_df.shape}")
    return features_df

In [41]:
target_layer_name = 'dropout_fc1' # Make sure this matches!

test_extracted_df = extract_features_with_hook_no_labels(
    model,
    target_layer_name,
    test_loader,
    device
)

Extracting features from 'dropout_fc1' using device: cuda:0


Extracting Features:   0%|          | 0/68 [00:00<?, ?it/s]

Feature extraction complete. DataFrame shape: (137674, 16)


In [42]:
test_extracted_df.head()

Unnamed: 0,DL_0,DL_1,DL_2,DL_3,DL_4,DL_5,DL_6,DL_7,DL_8,DL_9,DL_10,DL_11,DL_12,DL_13,DL_14,DL_15
0,6.035169,5.707216,0.0,5.63657,0.0,6.809452,5.764231,6.173485,0.0,0.0,0.0,0.0,6.153308,4.215621,6.063298,0.0
1,4.716325,4.411739,0.0,4.441925,0.0,5.319164,4.50921,4.71425,0.0,0.0,0.0,0.0,4.894351,3.420918,4.600852,0.0
2,3.970655,3.554694,0.0,3.736287,0.0,4.512755,3.706461,3.885978,0.0,0.0,0.0,0.0,4.245215,2.843341,3.793341,0.0
3,5.287063,4.879447,0.0,5.274143,0.0,6.075306,5.021643,5.275514,0.0,0.0,0.0,0.0,5.387375,3.88043,5.249128,0.0
4,0.0,0.0,3.338976,0.0,1.966514,0.0,0.0,0.0,2.390673,1.96816,2.084374,1.777916,0.0,0.0,0.0,1.780722


In [43]:
val_extracted_df = extract_features_with_hook_no_labels(
    model,
    target_layer_name,
    val_loader,
    device
)
val_extracted_df.head()

Extracting features from 'dropout_fc1' using device: cuda:0


Extracting Features:   0%|          | 0/16 [00:00<?, ?it/s]

Feature extraction complete. DataFrame shape: (32124, 16)


Unnamed: 0,DL_0,DL_1,DL_2,DL_3,DL_4,DL_5,DL_6,DL_7,DL_8,DL_9,DL_10,DL_11,DL_12,DL_13,DL_14,DL_15
0,0.0,0.0,1.432674,0.0,0.804038,0.0,0.0,0.0,1.110735,0.708074,0.931448,0.595479,0.0,0.0,0.0,0.783689
1,5.209517,4.857351,0.0,5.162995,0.0,6.157306,5.006511,5.400367,0.0,0.0,0.0,0.0,5.437877,3.900988,5.449194,0.0
2,4.426666,4.13431,0.0,3.974846,0.0,5.069541,4.251059,4.530725,0.0,0.0,0.0,0.0,4.512188,3.168711,4.593558,0.0
3,4.357077,3.836432,0.0,4.303031,0.0,4.963942,3.954748,4.305258,0.0,0.0,0.0,0.0,4.441238,3.11723,4.063981,0.0
4,2.237704,1.519138,0.0,2.060107,0.0,2.624698,2.026786,2.129326,0.0,0.0,0.0,0.0,2.5466,1.50363,2.158355,0.0


In [44]:
train_extracted_df = extract_features_with_hook_no_labels(
    model,
    target_layer_name,
    train_loader,
    device
)
train_extracted_df.head()

Extracting features from 'dropout_fc1' using device: cuda:0


Extracting Features:   0%|          | 0/142 [00:00<?, ?it/s]

Feature extraction complete. DataFrame shape: (289115, 16)


Unnamed: 0,DL_0,DL_1,DL_2,DL_3,DL_4,DL_5,DL_6,DL_7,DL_8,DL_9,DL_10,DL_11,DL_12,DL_13,DL_14,DL_15
0,6.894572,6.48405,0.0,6.806992,0.0,7.981942,6.737336,7.320887,0.0,0.0,0.0,0.0,6.951766,4.875103,7.235418,0.0
1,6.679119,6.465893,0.0,6.333076,0.0,7.62556,6.486637,6.799274,0.0,0.0,0.0,0.0,7.007101,4.792603,6.760164,0.0
2,3.606601,3.107703,0.0,3.207478,0.0,4.238704,3.384953,3.755443,0.0,0.0,0.0,0.0,3.777967,2.448155,3.791546,0.0
3,1.138242,1.050633,0.0,1.54171,0.0,1.647928,1.205866,1.268733,0.0,0.0,0.0,0.0,1.24736,0.901859,1.33176,0.0
4,0.0,0.0,1.944834,0.0,1.209252,0.0,0.0,0.0,1.523999,1.312626,1.391679,1.093533,0.0,0.0,0.0,1.151772


In [46]:
test_X[:20, :, :].shape

(20, 13, 86)

In [47]:
#Inference example: randomly sample 50 datapoints from test_X
sample_indices = np.random.choice(test_X.shape[0], 50, replace=False)
sampled_data = test_X[sample_indices, :, :]
sampled_data.shape

(50, 13, 86)

In [48]:
sample_dataset = TimeSeriesDataset(sampled_data, np.zeros((50, 1)))
sample_loader = DataLoader(sample_dataset, batch_size=batch_size, shuffle=False)
sample_loader.dataset.__getitem__(0)[0].shape, sample_loader.dataset.__getitem__(0)[1]

(torch.Size([13, 86]), tensor([[0.]]))

In [49]:
sample_extracted_df = extract_features_with_hook_no_labels(
    model,
    target_layer_name,
    sample_loader,
    device
)
sample_extracted_df.head()

Extracting features from 'dropout_fc1' using device: cuda:0


Extracting Features:   0%|          | 0/1 [00:00<?, ?it/s]

Feature extraction complete. DataFrame shape: (50, 16)


Unnamed: 0,DL_0,DL_1,DL_2,DL_3,DL_4,DL_5,DL_6,DL_7,DL_8,DL_9,DL_10,DL_11,DL_12,DL_13,DL_14,DL_15
0,6.62698,6.059169,0.0,6.516596,0.0,7.49419,6.364658,6.831047,0.0,0.0,0.0,0.0,6.539848,4.856055,6.567194,0.0
1,2.374995,1.885042,0.0,2.324354,0.0,2.690286,2.123864,2.248994,0.0,0.0,0.0,0.0,2.462363,1.614155,2.299339,0.0
2,3.91573,2.613201,0.0,3.844631,0.0,4.583123,3.657589,3.899049,0.0,0.0,0.0,0.0,4.058995,2.977775,3.588254,0.0
3,5.387104,5.036843,0.0,4.802422,0.0,6.086957,5.229789,5.43071,0.0,0.0,0.0,0.0,5.625329,3.801256,5.339056,0.0
4,2.952773,2.355582,0.0,2.586623,0.0,3.081563,2.457197,2.7895,0.0,0.0,0.0,0.0,2.800545,1.957896,2.434073,0.0


In [51]:
train_extracted_df.to_parquet(f'../../data/dl_features/exp_{experiment_num}_train_extracted.parquet', index=False)
val_extracted_df.to_parquet(f'../../data/dl_features/exp_{experiment_num}_val_extracted.parquet', index=False)
test_extracted_df.to_parquet(f'../../data/dl_features/exp_{experiment_num}_test_extracted.parquet', index=False)