In [3]:
import os
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [4]:
class WaterDataset(Dataset):
    '''
    Map-style dataset backed by a CSV file.

    The whole file is read once and kept in memory as a NumPy array. Every
    sample is a row: all columns but the last are the features, the last
    column is the label. Subclassing torch.utils.data.Dataset lets a
    DataLoader batch and shuffle the rows.
    '''

    def __init__(self, csv_path):
        '''
        csv_path: location of the CSV file, passed straight to pandas.read_csv.
        '''
        super().__init__()
        self.data = pd.read_csv(csv_path).to_numpy()

    def __len__(self):
        '''Number of rows (samples) in the loaded table.'''
        n_rows, _ = self.data.shape
        return n_rows

    def __getitem__(self, idx):
        '''
        idx: index of the sample to fetch.
        Returns a (features, label) pair taken from row ``idx``: every column
        except the last, followed by the last column on its own.
        '''
        sample_features = self.data[idx, :-1]
        sample_label = self.data[idx, -1]
        return sample_features, sample_label


'''
4. When Should You Use super().__init__()?
Always in a child class that inherits from a parent class.
When using PyTorch’s Dataset, nn.Module, or other framework classes.
When extending built-in Python classes like list, dict, etc.
'''

'\n4. When Should You Use super().__init__()?\nAlways in a child class that inherits from a parent class.\nWhen using PyTorch’s Dataset, nn.Module, or other framework classes.\nWhen extending built-in Python classes like list, dict, etc.\n'

In [5]:
# Directory that holds water_train.csv and water_test.csv. It defaults to the
# original local folder; set the WATER_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "WATER_DATA_DIR",
    "/Users/subhashmedipalli/Downloads/Github Projects/Deeplearning_Project_DataSets/water_potability",
))

dataset_train = WaterDataset(DATA_DIR / "water_train.csv")
dataset_test = WaterDataset(DATA_DIR / "water_test.csv")

In [6]:
# Create DataLoader objects.
# The training loader drops an incomplete final batch: with batch_size=2 an
# odd-sized training set would otherwise end on a single-sample batch, which
# BatchNorm1d cannot process in training mode. The test loader keeps every
# sample so the reported accuracy covers the full test set.
water_dataset_train = DataLoader(dataset_train, batch_size=2, shuffle=True, drop_last=True)
water_dataset_test = DataLoader(dataset_test, batch_size=2, shuffle=False)

In [7]:
# Pull one mini-batch from the training loader to inspect what it yields.
first_batch = next(iter(water_dataset_train))
features, labels = first_batch
print(f"Features: {features},\nLabels: {labels}")

Features: tensor([[0.5091, 0.5675, 0.2420, 0.5805, 0.6137, 0.5307, 0.5328, 0.2094, 0.6109],
        [0.5762, 0.3639, 0.1811, 0.5706, 0.8379, 0.5178, 0.5568, 0.5320, 0.5309]],
       dtype=torch.float64),
Labels: tensor([0., 0.], dtype=torch.float64)


In [8]:
# Sanity check: walk the training loader and show only its first batch.
for batch in water_dataset_train:
    features, labels = batch
    for title, tensor in (("Features:", features), ("Labels:", labels)):
        print(title, tensor)
    break  # one batch is enough for verification

Features: tensor([[0.3531, 0.4133, 0.6013, 0.4559, 0.7035, 0.5886, 0.6009, 0.3463, 0.6718],
        [0.2731, 0.5847, 0.7096, 0.5195, 0.5136, 0.3578, 0.3623, 0.6791, 0.5866]],
       dtype=torch.float64)
Labels: tensor([0., 0.], dtype=torch.float64)


In [9]:
# Defining the model
class Net(nn.Module):
    """Fully connected binary classifier: 9 inputs -> 16 -> 8 -> 1 output.

    Both hidden layers are followed by batch normalisation and an ELU
    activation; the single output unit is passed through a sigmoid, so
    ``forward`` returns values in (0, 1).
    """

    def __init__(self):
        super().__init__()

        # Layers, declared in the order they are applied in forward()
        self.fc1 = nn.Linear(9, 16)
        self.bn1 = nn.BatchNorm1d(16)
        self.fc2 = nn.Linear(16, 8)
        self.bn2 = nn.BatchNorm1d(8)
        self.fc3 = nn.Linear(8, 1)

        # Kaiming-uniform weight init: library default gain for the hidden
        # layers, sigmoid gain for the output layer.
        for layer, activation in (
            (self.fc1, 'leaky_relu'),
            (self.fc2, 'leaky_relu'),
            (self.fc3, 'sigmoid'),
        ):
            init.kaiming_uniform_(layer.weight, nonlinearity=activation)

    def forward(self, x):
        """Map a batch of 9-feature rows to one sigmoid output per row."""
        hidden = F.elu(self.bn1(self.fc1(x)))
        hidden = F.elu(self.bn2(self.fc2(hidden)))
        return torch.sigmoid(self.fc3(hidden))


In [10]:
# Seed PyTorch's global RNG so that weight initialisation, and the shuffling
# done by the training loader from this point on, are repeatable across runs.
torch.manual_seed(42)

# Instantiate the model
net = Net()

# Define loss function and optimizer.
# BCELoss expects probabilities, which matches the sigmoid applied in Net.forward.
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [11]:
# Train for 1000 passes over the training loader.
# Put the model in training mode explicitly: the evaluation cell calls
# net.eval(), so re-running this cell afterwards would otherwise train with
# BatchNorm frozen on its running statistics.
net.train()
for epoch in range(1000):
    for features, labels in water_dataset_train:
        optimizer.zero_grad()

        # The loader yields float64 tensors; cast to float32 to match the model weights
        features = features.float()
        labels = labels.float()

        output = net(features)
        # Reshape the labels to a column so they line up with the model's one-unit output
        loss = criterion(output, labels.view(-1, 1))

        loss.backward()
        optimizer.step()

print('Training Completed!')


Training Completed!


In [13]:
import torch
from torchmetrics import Accuracy

# Initialize accuracy metric (re-created on every run so counts never carry over)
acc = Accuracy(task='binary')

net.eval()  # BatchNorm uses its running statistics instead of per-batch statistics
with torch.no_grad():
    for features, labels in water_dataset_test:
        features = features.float()
        output = net(features)

        # Net.forward already applies a sigmoid, so `output` holds probabilities
        # (not logits); threshold them at 0.5 to get hard 0/1 predictions.
        pred = (output >= 0.5).float()
        # The metric documents integer class labels as its target, shaped like the predictions
        target = labels.view(-1, 1).long()
        acc.update(pred, target)  # Accumulate accuracy over batches

# Compute final accuracy
accuracy = acc.compute()
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.5905


# Not bad for a small dataset of about 1,500 samples: roughly 59% accuracy (0.5905) on the test set.