In [194]:
from sklearn.datasets import make_classification
import torch

In [195]:
# Step - 1 : Create a Synthatic classification dataset using sklearn
X, y = make_classification(
    n_samples = 10,                     # No. of Samples
    n_redundant = 0,                    # No. Redundant Features
    n_classes = 2,                      # no. of Classes
    n_features = 2,                     # No. of Features
    n_informative = 2,                  # No. of Informatives
    random_state = 42
)

In [196]:
X

array([[ 1.06833894, -0.97007347],
       [-1.14021544, -0.83879234],
       [-2.8953973 ,  1.97686236],
       [-0.72063436, -0.96059253],
       [-1.96287438, -0.99225135],
       [-0.9382051 , -0.54304815],
       [ 1.72725924, -1.18582677],
       [ 1.77736657,  1.51157598],
       [ 1.89969252,  0.83444483],
       [-0.58723065, -1.97171753]])

In [197]:
y

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [198]:
# Convert Data to Pytorch Tensors
X = torch.tensor(X, dtype = torch.float32)
y = torch.tensor(y, dtype = torch.float32)

In [199]:
from torch.utils.data import Dataset, DataLoader

In [200]:
class CustomDataset(Dataset):

  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):

    return self.features.shape[0]

  def __getitem__(self, index):

    return self.features[index], self.labels[index]

In [201]:
dataset = CustomDataset(X, y)

In [202]:
len(dataset)

10

In [203]:
dataloader = DataLoader(dataset, batch_size = 2, shuffle = True)

In [204]:
for batch_feature, batch_label in dataloader:

  print(batch_feature)
  print(batch_label)
  print('-'*50)

tensor([[ 1.7273, -1.1858],
        [-0.5872, -1.9717]])
tensor([1., 0.])
--------------------------------------------------
tensor([[ 1.0683, -0.9701],
        [ 1.8997,  0.8344]])
tensor([1., 1.])
--------------------------------------------------
tensor([[-1.1402, -0.8388],
        [-0.9382, -0.5430]])
tensor([0., 1.])
--------------------------------------------------
tensor([[-2.8954,  1.9769],
        [ 1.7774,  1.5116]])
tensor([0., 1.])
--------------------------------------------------
tensor([[-0.7206, -0.9606],
        [-1.9629, -0.9923]])
tensor([0., 0.])
--------------------------------------------------


> In Pytorch, the sampler in DataLoader class determines the stratergy for selecting samples from dataset during data loading. It controls how indices of dataset are drawn for each batch.

## ****Types of Sample****

Pytorch provides predefined samplers, and we can create custom ones  
1. **`SequentialSampler`**:
      - Samples elements sequentially, in the order they appear in dataset.
      - Default when `shuffle = False`.  

2. **`RandomSampler`**:
      - Samples elements randomly without replacements.
      - Default when `shuffle = True`.  

> The collate_fn in PyTorch's DataLoader function that specifies how to combine list of samples from a dataset into a single batch. By default, the DataLoader uses a simple batc collation mechanism, but collate_fn allows us to customize how data should be processed and batched.  

> The DataLoader class in PyTorch comes with several parameters that allow you to customize
how data is loaded, batched, and preprocessed. Some of the most commonly used and
important parameters include:  

1. **`dataset (mandatory)`**:
    - The Dataset from which the DataLoader will pull data. Must be a subclass of torch.utils.data.Dataset that implements __getitem__ and __len__.  
2. **`batch_size`** :
    - How many samples per batch to load.
    - Default is 1.
    - Larger batch sizes can speed up training on GPUs but require more memory.  
3. **`shuffle`** :
    - If True, the DataLoader will shuffle the dataset indices each epoch.
    - Helpful to avoid the model becoming too dependent on the order of samples.  
4. **`num_workers`** :
    - The number of worker processes used to load data in parallel.
    - Setting num_workers > 0 can speed up data loading by leveraging multiple CPU cores, especially if I/O or preprocessing is a bottleneck.  
5. **`pin_memory`** :
    - If True, the DataLoader will copy tensors into pinned (page-locked) memory before returning them.
    - This can improve GPU transfer speed and thus overall training throughput, particularly on CUDA systems.  
6. **`drop_last`** :
    - If True, the DataLoader will drop the last incomplete batch if the total number of samples is not divisible by the batch size.
    - Useful when exact batch sizes are required (for example, in some batch normalization scenarios).  
7. **`collate_fn`** :
    - A callable that processes a list of samples into a batch (the default simply stacks tensors).
    - Custom collate_fn can handle variable-length sequences, perform custom batching logic, or handle complex data structures.  
8. **`sampler`** :
    - sampler defines the strategy for drawing samples (e.g., for handling imbalanced classes, or custom sampling strategies).
    - batch_sampler works at the batch level, controlling how batches are formed.
    - Typically, you don’t need to specify these if you are using batch_size and shuffle. However, they provide lower-level control if you have advanced requirements.

In [205]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [206]:
data = pd.read_csv(r'https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


In [207]:
data.drop(columns = ['id', 'Unnamed: 32'], inplace = True)
display(data.head())
display(data.shape)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


(569, 31)

In [208]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:], data.iloc[:, 0], random_state = 42, test_size = 0.2)

In [209]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [210]:
encode = LabelEncoder()
y_train = encode.fit_transform(y_train)
y_test = encode.transform(y_test)

In [211]:
X_train_tensor = torch.from_numpy(X_train.astype(np.float32))
X_test_tensor = torch.from_numpy(X_test.astype(np.float32))
y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))

In [212]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):

  def __init__(self, features, labels):

    self.features = features
    self.labels = labels

  def __len__(self):

    return len(self.features)

  def __getitem__(self, idx):

    return self.features[idx], self.labels[idx]

In [213]:
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)

In [214]:
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = 32, shuffle = True)

In [215]:
import torch.nn as nn

class MySimpleNN(nn.Module):
  def __init__(self, num_features):

    super().__init__()
    self.linear = nn.Linear(num_features, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, features):
    out = self.linear(features)
    out = self.sigmoid(out)

    return out

In [216]:
learning_rate = 0.1
epochs = 25

In [217]:
# Create Model
model = MySimpleNN(X_train_tensor.shape[1])

# Define Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

# Define Loss Fuction
loss_function = nn.BCELoss()

In [218]:
# Define Loop
for epoch in range(epochs):

  for batch_feature, batch_label in train_loader:

    # Forward Pass
    y_pred = model(batch_feature)

    # Loss Calculate
    loss = loss_function(y_pred, batch_label.view(-1, 1))

    # Clear Gradients
    optimizer.zero_grad()

    # backward pass
    loss.backward()

    # parameter update
    optimizer.step()
  print(f'Epoch : {epoch + 1},  Loss : {loss.item()}')

Epoch : 1,  Loss : 0.193091481924057
Epoch : 2,  Loss : 0.25403282046318054
Epoch : 3,  Loss : 0.08762530237436295
Epoch : 4,  Loss : 0.09782593697309494
Epoch : 5,  Loss : 0.0681254118680954
Epoch : 6,  Loss : 0.21036474406719208
Epoch : 7,  Loss : 0.08807078748941422
Epoch : 8,  Loss : 0.14243485033512115
Epoch : 9,  Loss : 0.01402981672435999
Epoch : 10,  Loss : 0.02190672978758812
Epoch : 11,  Loss : 0.012735855765640736
Epoch : 12,  Loss : 0.08239572495222092
Epoch : 13,  Loss : 0.006948184221982956
Epoch : 14,  Loss : 0.21116141974925995
Epoch : 15,  Loss : 0.31103354692459106
Epoch : 16,  Loss : 0.23311340808868408
Epoch : 17,  Loss : 0.014159909449517727
Epoch : 18,  Loss : 0.19544388353824615
Epoch : 19,  Loss : 0.016660528257489204
Epoch : 20,  Loss : 0.1076335459947586
Epoch : 21,  Loss : 0.16946525871753693
Epoch : 22,  Loss : 0.08586449176073074
Epoch : 23,  Loss : 0.015884462743997574
Epoch : 24,  Loss : 0.029276715591549873
Epoch : 25,  Loss : 0.05734032019972801


In [219]:
 # Model evaluation using test_loader
 model.eval()                             # Set The Model to Evaluation Mode
 accuracy_list = []

 with torch.no_grad():
  for batch_feature, batch_label in test_loader:
    # Forward Pass
    y_pred = model(batch_feature)
    y_pred = (y_pred > 0.8).float()       # Convert Probabilities to Binary Prediction

    # Calculate Accuracy Fro Current Batch
    batch_accuracy = (y_pred.view(-1) == batch_label).float().mean().item()
    accuracy_list.append(batch_accuracy)

# Calculate Overall Accuracy
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f'Accuracy : {overall_accuracy: .4f}')

Accuracy :  0.9627
