In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# --- 1. Define the Model ---
# A logistic regression model is just a single linear layer.
class LogisticRegression(nn.Module):
    def __init__(self, num_features):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(num_features, 1)

    def forward(self, x):
        return self.linear(x)

In [3]:
"""Main function to run the PyTorch classification task."""
print("Starting PyTorch classification task...")

# --- 2. Load and Preprocess Data ---
try:
    df = pd.read_csv("./creditcard.csv/creditcard.csv")
except FileNotFoundError:
    print("Error: creditcard.csv not found. Please ensure it's in the correct directory.")

# Features are all columns except 'Time' and 'Class'
X = df.drop(columns=['Time', 'Class']).values
y = df['Class'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features for better performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# Create DataLoader for batching
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=1024, shuffle=True)

# --- 3. Set up for Training ---
num_features = X.shape[1]
model = LogisticRegression(num_features)

# --- Handling Class Imbalance ---
# Calculate weight for the positive class (fraud) to pass to the loss function.
# This makes the model penalize errors on the minority class much more heavily.
neg_count = len(df[df['Class'] == 0])
pos_count = len(df[df['Class'] == 1])
pos_weight = torch.tensor([neg_count / pos_count], dtype=torch.float32)

# BCEWithLogitsLoss is numerically stable and combines a Sigmoid layer and BCELoss.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- 4. Training Loop ---
print("Training the model...")
num_epochs = 10
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(features)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# --- 5. Evaluation ---
print("\nEvaluating the model...")
model.eval()  # Set the model to evaluation mode
with torch.no_grad(): # No need to calculate gradients during evaluation
    # Get model outputs (logits)
    test_outputs = model(X_test_tensor)
    
    # Apply sigmoid to get probabilities
    test_probs = torch.sigmoid(test_outputs)
    
    # Apply a 0.5 threshold to get binary predictions
    predicted_labels = (test_probs > 0.5).float()

# Calculate metrics
auc = roc_auc_score(y_test_tensor, test_probs)
accuracy = accuracy_score(y_test_tensor, predicted_labels)
precision = precision_score(y_test_tensor, predicted_labels)
recall = recall_score(y_test_tensor, predicted_labels)
f1 = f1_score(y_test_tensor, predicted_labels)

print("--- PyTorch Classification Metrics ---")
print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("------------------------------------")

Starting PyTorch classification task...
Training the model...
Epoch [1/10], Loss: 0.6711
Epoch [2/10], Loss: 0.5593
Epoch [3/10], Loss: 0.5626
Epoch [4/10], Loss: 0.4339
Epoch [5/10], Loss: 0.3856
Epoch [6/10], Loss: 0.3579
Epoch [7/10], Loss: 0.3019
Epoch [8/10], Loss: 0.2884
Epoch [9/10], Loss: 0.2790
Epoch [10/10], Loss: 1.2308

Evaluating the model...
--- PyTorch Classification Metrics ---
AUC: 0.9765
Accuracy: 0.9730
Precision: 0.0544
Recall: 0.8980
F1 Score: 0.1026
------------------------------------


In [4]:
# Predict on a few samples
sample_indices = [0, 1, 2, 3, 4]
sample_features = X_test_tensor[sample_indices]
with torch.no_grad():
    sample_outputs = model(sample_features)
    sample_probs = torch.sigmoid(sample_outputs)
    sample_predictions = (sample_probs > 0.5).float()
print("Sample Predictions (first 5 test samples):", sample_predictions.squeeze().tolist())

Sample Predictions (first 5 test samples): [0.0, 0.0, 0.0, 0.0, 1.0]
