In [1]:
import pandas as pd
import numpy as np
import h5py

In [2]:
with h5py.File('../../../data/3d_array/test_data_3d_h5.h5', 'r') as f:
    test_X = f['test_data_3d'][:]
test_y = pd.read_parquet('../../../data/3d_array/test_targets.parquet')

In [3]:
test_y['end_of_month'].value_counts()

end_of_month
2017-04-30    391349
2017-05-31    391349
2017-06-30    391349
2017-07-31    391349
2017-08-31    391349
2017-09-30    391349
2017-10-31    391349
2017-11-30    391349
2017-12-31    391349
2018-01-31    391349
2018-02-28    391349
2018-03-31    391349
Name: count, dtype: int64

In [4]:
test_y = test_y[test_y['end_of_month'].isin(['2018-03-31'])]


  test_y = test_y[test_y['end_of_month'].isin(['2018-03-31'])]


In [5]:
test_y

Unnamed: 0,customer_ID,end_of_month,target
11,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2018-03-31,0
23,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2018-03-31,0
35,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2018-03-31,0
47,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,2018-03-31,0
59,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,2018-03-31,0
...,...,...,...
4696139,ffff41c8a52833b56430603969b9ca48d208e7c192c6a4...,2018-03-31,0
4696151,ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fd...,2018-03-31,0
4696163,ffff9984b999fccb2b6127635ed0736dda94e544e67e02...,2018-03-31,0
4696175,ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf38814...,2018-03-31,1


In [8]:
import torch

import torch.nn as nn

class SmallRNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, fc_size, output_size=1):
        super(SmallRNNModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, fc_size)
        self.output = nn.Linear(fc_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # LSTM layer
        lstm_out, _ = self.lstm(x)
        # Take the output of the last time step
        lstm_last_out = lstm_out[:, -1, :]
        # Fully connected layer
        fc_out = self.fc(lstm_last_out)
        # Final output layer
        output = self.output(fc_out)
        # Apply sigmoid for binary classification
        return self.sigmoid(output)

# Example usage
input_size = test_X.shape[2]  # Number of features
hidden_size = 64  # Hidden state size for LSTM
fc_size = 32  # Size of the fully connected layer

model = SmallRNNModel(input_size=input_size, hidden_size=hidden_size, fc_size=fc_size)

In [9]:
# Define the model path
model_path = '../../../models/deep_learning/experiment_1.pth'

# Load the model parameters
try:
    # Load the saved dictionary
    checkpoint = torch.load(model_path)
    
    # Extract model parameters from the 'model_state_dict' key
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Model parameters loaded successfully from {model_path}")
except FileNotFoundError:
    print(f"Model file not found at {model_path}")
    print("Please specify the correct path to the model parameters")
except KeyError:
    print(f"'model_state_dict' key not found in the checkpoint file")
    print("The file may have been saved with a different structure")
except Exception as e:
    print(f"Error loading model parameters: {str(e)}")

Model parameters loaded successfully from ../../../models/deep_learning/experiment_1.pth


  checkpoint = torch.load(model_path)


In [10]:
from torch.utils.data import Dataset, DataLoader
class TimeSeriesDataset(Dataset):
    def __init__(self, data, targets):
        """
        Args:
            data: numpy array of shape (num_ids, time_steps, features)
            targets: numpy array of shape (num_ids,)
        """
        self.data = torch.FloatTensor(data)
        self.targets = torch.FloatTensor(targets).unsqueeze(1)  # Add dimension for output
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [None]:
batch_size = 10000
test_dataset = TimeSeriesDataset(test_X, test_y['target'].values)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix

# Set model to evaluation mode
model.eval()

# Check if CUDA is available and move model to the appropriate device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Lists to store predictions and true values
all_preds = []
all_labels = []

# Perform inference without gradient calculation
with torch.no_grad():
    for inputs, labels in test_loader:
        # Move inputs and labels to the appropriate device
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(inputs)
        
        # Store predictions and labels
        all_preds.append(outputs.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Concatenate all batches
all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)

# Convert predictions to binary (0 or 1) using threshold of 0.5
pred_classes = (all_preds > 0.5).astype(int)
true_classes = all_labels.astype(int)

# Generate classification report

# Print classification report
print("Classification Report:")
print(classification_report(true_classes, pred_classes, digits = 4))

# Calculate and print accuracy
accuracy = accuracy_score(true_classes, pred_classes)
print(f"Accuracy: {accuracy:.4f}")

# Calculate and print ROC-AUC score
auc = roc_auc_score(true_classes, all_preds)
print(f"ROC-AUC Score: {auc:.4f}")

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(true_classes, pred_classes))

Classification Report:
              precision    recall  f1-score   support

           0     0.9404    0.9349    0.9376    299141
           1     0.7927    0.8077    0.8001     92208

    accuracy                         0.9049    391349
   macro avg     0.8665    0.8713    0.8689    391349
weighted avg     0.9056    0.9049    0.9052    391349

Accuracy: 0.9049
ROC-AUC Score: 0.9595

Confusion Matrix:
[[279659  19482]
 [ 17729  74479]]
