In [1]:
import csv
import json
from typing import List, Optional, Tuple

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

The following two cells are a suggestion on accessing the files. You are free to ignore these!

In [None]:
# Load the JSON file. JSON text is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform's locale default.
with open("data.json", "r", encoding="utf-8") as f:
    data_1 = json.load(f)
# Preview only the first few records rather than echoing the whole file
# (the dataset class in this notebook iterates this object as a list of records).
data_1[:3]

In [None]:
# Preview the first few rows of the CSV; DictReader yields one dict per row,
# keyed by the header line.
# newline="" is what the csv module documents for file objects handed to its
# readers; utf-8 matches the default pandas.read_csv uses on the same file.
with open("data.csv", "r", newline="", encoding="utf-8") as file:
    for row_number, row in enumerate(csv.DictReader(file)):
        if row_number >= 5:
            # A handful of rows is enough to see the schema without flooding the output.
            break
        print(row)

In [5]:
import json
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple
from sklearn.model_selection import train_test_split

class TokenDataset(Dataset):
    """
    Dataset of fixed-length token sequences paired with integer labels.

    Samples come either from a JSON file (a list of objects with "tokens" and
    "label" keys) or from an already-preprocessed list of (tokens, label) pairs.
    """

    def __init__(self, data_path: Optional[str] = None, data: Optional[List[Tuple[np.ndarray, int]]] = None, max_length: int = 50):
        """
        Custom dataset class for loading and processing JSON data or using preprocessed data.

        Parameters:
        - data_path: path to the JSON file (only used if `data` is not provided)
        - data: list of preprocessed (padded_tokens, label) tuples (optional);
          takes precedence over `data_path` and is stored by reference, without
          re-padding
        - max_length: max length for padding sequences (only used if loading from JSON)

        Raises:
        - ValueError: if neither `data` nor `data_path` is given
        """
        if data is not None:
            # Use preprocessed data directly
            self.data = data
            return

        if data_path is None:
            # Fail with a clear message instead of the TypeError open(None) would raise.
            raise ValueError("Either `data_path` or `data` must be provided.")

        # Load and preprocess data from JSON (JSON text is UTF-8 by specification).
        with open(data_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)
        self.data = [
            (self._pad_or_truncate(item["tokens"], max_length), item["label"])
            for item in raw_data
        ]

    @staticmethod
    def _pad_or_truncate(tokens, max_length: int) -> np.ndarray:
        """Return `tokens` as an int64 array of exactly `max_length` entries, zero-padded on the right."""
        tokens = np.asarray(tokens)
        # int64 is requested explicitly: plain `int` maps to a platform-dependent width.
        padded_tokens = np.zeros(max_length, dtype=np.int64)
        length = min(len(tokens), max_length)
        padded_tokens[:length] = tokens[:length]
        return padded_tokens

    @staticmethod
    def _as_long_tensor(value) -> torch.Tensor:
        """Return an independent int64 tensor copy of `value` (tensor, array or scalar)."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning;
            # detach().clone() is the recommended way to copy an existing tensor.
            return value.detach().clone().to(dtype=torch.long)
        return torch.tensor(value, dtype=torch.long)

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Get a single sample from the dataset.

        Parameters:
        - idx: index of the sample to retrieve

        Returns:
        - Tuple of (padded tokens, label) as torch.long Tensors
        """
        tokens, label = self.data[idx]
        return self._as_long_tensor(tokens), self._as_long_tensor(label)

class TokenDataLoader(DataLoader):
    """DataLoader with this notebook's default batch size and shuffling for TokenDataset."""

    def __init__(self, dataset: TokenDataset, batch_size: int = 32, shuffle: bool = True, **kwargs):
        """
        Custom data loader for batching data from TokenDataset, subclassing PyTorch's DataLoader.

        Parameters:
        - dataset: TokenDataset instance
        - batch_size: number of samples per batch
        - shuffle: whether to shuffle data each epoch
        - **kwargs: any further keyword arguments accepted by
          torch.utils.data.DataLoader (e.g. num_workers, drop_last, generator),
          forwarded unchanged
        """
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle, **kwargs)

In [7]:
# Load the dataset from JSON
data_path = "data.json"
dataset = TokenDataset(data_path=data_path, max_length=50)

# Split sample indices into training and testing sets (fixed seed => repeatable partition)
train_indices, test_indices = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)

# Collect the preprocessed (padded_tokens, label) pairs for each split.
# They are read from `dataset.data` rather than `dataset[i]`: indexing the
# dataset returns torch tensors, whereas the `data` argument of TokenDataset
# is documented as a list of (np.ndarray, int) tuples. Re-wrapping tensors
# makes `__getitem__` call torch.tensor() on a tensor, which emits a
# copy-construct UserWarning.
train_data = [dataset.data[i] for i in train_indices]
test_data = [dataset.data[i] for i in test_indices]

# Create TokenDataset instances for training and testing data
train_dataset = TokenDataset(data=train_data)
test_dataset = TokenDataset(data=test_data)

# The two splits must partition the full dataset exactly.
assert len(train_dataset) + len(test_dataset) == len(dataset)

# Create data loaders for training and testing
train_loader = TokenDataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = TokenDataLoader(test_dataset, batch_size=32, shuffle=False)

In [8]:
# Peek at one batch from the training loader (prints nothing if the loader yields no batches)
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    batch_inputs, batch_labels = first_batch
    print("Training Batch Inputs:", batch_inputs)
    print("Training Batch Labels:", batch_labels)

Test correctness here (do not change the cell below)

In [9]:
X, y = next(iter(train_loader))
print(X)
print(y)

In [10]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

class CSVDataset(Dataset):
    """
    Dataset built from a CSV file whose "prognosis" column is the target.

    Every other column is cast to float32 and used as a feature; the target is
    one-hot encoded with the categories found in the whole file, so datasets
    built from different `indices` of the same file share one label layout.
    """

    def __init__(self, csv_path: str, indices: Optional[list] = None):
        """
        Custom dataset class for loading and processing CSV data.

        Parameters:
        - csv_path: path to the CSV file
        - indices: subset of row positions to use (for train/test split);
          None keeps every row

        Attributes:
        - data: the full DataFrame as read from disk (not restricted by `indices`)
        - features / labels / encoded_labels: per-sample arrays, all restricted
          to `indices` when given
        - encoder: the fitted OneHotEncoder
        """
        # Load the CSV data
        self.data = pd.read_csv(csv_path)

        # Extract features (symptoms) and labels (prognosis)
        # NOTE(review): every non-target column becomes a feature, including any
        # identifier/index column the file may carry - confirm this is intended.
        self.features = self.data.drop(columns=["prognosis"]).values.astype(np.float32)
        self.labels = self.data[["prognosis"]].values

        # One-hot encode the prognosis labels. The encoder is fitted on the full
        # column before subsetting so the category order does not depend on the
        # split. float32 output keeps the labels in the same dtype as the
        # features (the encoder's default is float64).
        self.encoder = OneHotEncoder(sparse_output=False, dtype=np.float32)
        self.encoded_labels = self.encoder.fit_transform(self.labels)

        # Apply indices if provided (for train/test splits); keep all per-sample
        # arrays aligned, including the raw labels.
        if indices is not None:
            self.features = self.features[indices]
            self.labels = self.labels[indices]
            self.encoded_labels = self.encoded_labels[indices]

    def __len__(self):
        """Number of samples (rows) in this dataset."""
        return len(self.features)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Get a single sample from the dataset.

        Parameters:
        - idx: index of the sample to retrieve

        Returns:
        - Tuple of (features, encoded label) as float32 torch Tensors
        """
        # torch.tensor copies, so the returned tensors do not alias the stored arrays.
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(self.encoded_labels[idx], dtype=torch.float32)
        return features, label


class CSVDataLoader(DataLoader):
    """DataLoader with this notebook's default batch size and shuffling for CSVDataset."""

    def __init__(self, dataset: CSVDataset, batch_size: int = 32, shuffle: bool = True, **kwargs):
        """
        Custom data loader for batching data from CSVDataset, subclassing PyTorch's DataLoader.

        Parameters:
        - dataset: CSVDataset instance
        - batch_size: number of samples per batch
        - shuffle: whether to shuffle data each epoch
        - **kwargs: any further keyword arguments accepted by
          torch.utils.data.DataLoader (e.g. num_workers, drop_last, generator),
          forwarded unchanged
        """
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle, **kwargs)

In [11]:

# Path to the CSV file
csv_path = "data.csv"

# Build the complete dataset first; its length drives the index split below
full_dataset = CSVDataset(csv_path=csv_path)

# 80/20 split over row positions, seeded so the partition is repeatable
all_indices = list(range(len(full_dataset)))
train_indices, test_indices = train_test_split(all_indices, test_size=0.2, random_state=42)

# One CSVDataset per partition, each restricted to its own rows
train_dataset = CSVDataset(csv_path=csv_path, indices=train_indices)
test_dataset = CSVDataset(csv_path=csv_path, indices=test_indices)

# Shuffle only the training batches; the test order stays fixed
train_loader = CSVDataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = CSVDataLoader(test_dataset, batch_size=32, shuffle=False)

In [12]:
# Peek at one batch from the training loader (prints nothing if the loader yields no batches)
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    batch_inputs, batch_labels = first_batch
    print("Training Batch Inputs:", batch_inputs)
    print("Training Batch Labels:", batch_labels)

Training Batch Inputs: tensor([[270.,   0.,   0.,  ...,   0.,   0.,   0.],
        [ 62.,   1.,   0.,  ...,   0.,   0.,   0.],
        [330.,   1.,   0.,  ...,   0.,   0.,   0.],
        ...,
        [ 48.,   1.,   1.,  ...,   0.,   0.,   0.],
        [288.,   0.,   1.,  ...,   0.,   0.,   0.],
        [447.,   0.,   1.,  ...,   0.,   0.,   0.]])
Training Batch Labels: tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0

Test correctness here (do not change the cell below)

In [13]:
X, y = next(iter(train_loader))
print(X.shape)
print(y.shape)

torch.Size([32, 65])
torch.Size([32, 11])
