In [1]:
import pandas as pd
import numpy as np

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

torch.manual_seed(42)


<torch._C.Generator at 0x1818ded70>

In [3]:
# CSVDataset Class with On-the-Fly Normalization
from collections import Counter
class CSVDataset(Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
        self.labels = self.data.iloc[:, 0].values  # First column as labels
        self.features = self.data.iloc[:, 1:].values  # Remaining columns as features
        
        # Handle NaN values
        imputer = SimpleImputer(strategy="mean")
        self.features = imputer.fit_transform(self.features)
        
        # Normalize features
        self.scaler = StandardScaler()
        self.features = self.scaler.fit_transform(self.features)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        features = self.features[idx]
        label = self.labels[idx]
        return torch.tensor(features, dtype=torch.float32), torch.tensor(label, dtype=torch.long)
    
# Load data
csv_file = "data/merged_data/filtered_merged.csv"  
dataset = CSVDataset(csv_file)

# Split dataset
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Prepare train and test features/labels for SMOTE
X_train = np.array([train_dataset[i][0].numpy() for i in range(len(train_dataset))])
y_train = np.array([train_dataset[i][1].item() for i in range(len(train_dataset))])

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check label distribution after SMOTE
print("Label distribution after SMOTE:")
print(Counter(y_train_resampled))

# Create PyTorch datasets
train_dataset = [(torch.tensor(X_train_resampled[i], dtype=torch.float32),
                  torch.tensor(y_train_resampled[i], dtype=torch.long)) 
                 for i in range(len(y_train_resampled))]
test_dataset = [(torch.tensor(test_dataset[i][0].numpy(), dtype=torch.float32),
                 test_dataset[i][1]) 
                for i in range(len(test_dataset))]

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Label distribution after SMOTE:
Counter({0: 1232, 1: 1232})


In [4]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, n_inputs, n_outputs):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)

    def forward(self, x):
        return torch.sigmoid(self.linear(x))  # Apply sigmoid here

In [5]:
# Instantiate the model
n_inputs = X_train.shape[1]   # Number of features (excluding the label)
# features, label = dataset[0]

# # Determine the number of input features
# n_inputs = features.shape[0]
n_outputs = 1  # Binary classification
log_regr = LogisticRegression(n_inputs, n_outputs)


In [6]:
# Define optimizer and loss function
optimizer = torch.optim.SGD(log_regr.parameters(), lr=0.0003)
criterion = torch.nn.BCELoss()  # Binary Cross-Entropy Loss

# Training and Evaluation Loop
epochs = 100
Loss = []
acc = []

for epoch in range(epochs):
    # Training Loop
    log_regr.train()
    for features, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = log_regr(features).squeeze()  # Squeeze to match shape for BCELoss
        labels = labels.float()  # Convert labels to float for BCELoss
        # print(outputs)
        # Debugging shapes
        # print(f"Outputs shape: {outputs.shape}, Labels shape: {labels.shape}")
        labels = labels.float() 
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Evaluation Loop
    log_regr.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in test_dataloader:
            outputs = log_regr(features).squeeze()
            predicted = (outputs > 0.5).float()  # Apply threshold for binary classification
            correct += (predicted == labels.float()).sum().item()
            total += labels.size(0)

    accuracy = 100 * correct / total
    Loss.append(loss.item())
    acc.append(accuracy)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%')

# Save final metrics
print("Training Complete!")

Epoch 1/100, Loss: 0.6927, Accuracy: 56.39%
Epoch 2/100, Loss: 0.7527, Accuracy: 56.89%
Epoch 3/100, Loss: 0.6637, Accuracy: 58.90%
Epoch 4/100, Loss: 0.7201, Accuracy: 60.40%
Epoch 5/100, Loss: 0.7396, Accuracy: 60.90%
Epoch 6/100, Loss: 0.6597, Accuracy: 62.91%
Epoch 7/100, Loss: 0.6381, Accuracy: 64.16%
Epoch 8/100, Loss: 0.7030, Accuracy: 64.91%
Epoch 9/100, Loss: 0.6705, Accuracy: 66.17%
Epoch 10/100, Loss: 0.6300, Accuracy: 67.17%
Epoch 11/100, Loss: 0.6615, Accuracy: 68.42%
Epoch 12/100, Loss: 0.6113, Accuracy: 69.42%
Epoch 13/100, Loss: 0.6363, Accuracy: 70.18%
Epoch 14/100, Loss: 0.6639, Accuracy: 70.18%
Epoch 15/100, Loss: 0.6784, Accuracy: 70.43%
Epoch 16/100, Loss: 0.5959, Accuracy: 70.93%
Epoch 17/100, Loss: 0.7016, Accuracy: 71.68%
Epoch 18/100, Loss: 0.6163, Accuracy: 71.93%
Epoch 19/100, Loss: 0.5885, Accuracy: 72.18%
Epoch 20/100, Loss: 0.5671, Accuracy: 72.43%
Epoch 21/100, Loss: 0.6068, Accuracy: 72.43%
Epoch 22/100, Loss: 0.6065, Accuracy: 72.43%
Epoch 23/100, Loss:

In [7]:
from sklearn.metrics import classification_report, confusion_matrix

# log_regr.load_state_dict(torch.load('logistic_regression_model.pth'))
log_regr.eval()

# Evaluate the model on the evaluation dataset
log_regr.eval()
all_labels = []
all_predictions = []

with torch.no_grad():
    for features, labels in test_dataloader:
        outputs = log_regr(features)
        _, predicted = torch.max(outputs.data, 1)
        all_labels.extend(labels.numpy())
        all_predictions.extend(predicted.numpy())

# Compute metrics
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_predictions))

print("\nClassification Report:")
print(classification_report(all_labels, all_predictions, target_names=["Class 0", "Class 1"]))

Confusion Matrix:
[[304   0]
 [ 95   0]]

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.76      1.00      0.86       304
     Class 1       0.00      0.00      0.00        95

    accuracy                           0.76       399
   macro avg       0.38      0.50      0.43       399
weighted avg       0.58      0.76      0.66       399



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
