## DataLoader

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import sklearn

# Step 1: Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, data, transform=None):
        """
        Args:
            data (np.ndarray): A numpy array of shape (num_samples, input_dim).
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.transform:
            sample = self.transform(sample)
        return torch.tensor(sample, dtype=torch.float32)

# Step 2: Generate Example Data
def generate_dummy_data(num_samples=1000, input_dim=10):
    """Generates dummy data for testing the dataloader."""
    return sklearn.datasets.make_classification(
        n_samples=num_samples, n_features=input_dim, 
        n_informative=input_dim, n_redundant=0, random_state=42)

# Step 3: Transformations (Optional)
def normalize_data(sample):
    """Normalize the sample to have values between 0 and 1."""
    return sample / np.max(sample)





In [31]:
def load_data(data_path):
    import pandas as pd
    # change first column name to 'target'
    df = pd.read_csv(data_path)
    df.rename(columns={df.columns[0]: 'target'}, inplace=True)
    data = df.drop('target', axis=1).values
    target = df['target'].values

        

    return data,target

def save_data(data, data_path):
    import pandas as pd
    df = pd.DataFrame(data)
    df.to_csv(data_path, index=False)
    print(f"Data saved to {data_path}")

def preprocess(input_path, output_path):
    import random
    import pandas as pd
    # Implementation of preprocessing logic
    data,target = load_data(input_path)

    # Preprocess data
    # Get 3 random columns from data and concatenate them to target
    random.seed(0)
    indices = random.sample(range(data.shape[1]), 3)
    x = np.concatenate([data[:, indices], target.reshape(-1, 1)], axis=1)

    # Save preprocessed data
    save_data(x, output_path)

    return x
    
    print(f"Preprocessing {input_path} -> {output_path}")


In [7]:
SAVE_FOLDER = 'data'

/app/notebook


In [None]:
# Step 4: Initialize Dataset and DataLoader
from LoadDataset import LoadDataset
_,_ = LoadDataset.load_iris(save_path=SAVE_FOLDER)
data = preprocess(f'{SAVE_FOLDER}/iris.csv', f'{SAVE_FOLDER}/iris_preprocessed.csv')
dataset = CustomDataset(data, transform=normalize_data)

dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

Download Susy Dataset: 424MiB [02:29, 3.15MiB/s] 

## Encoder

In [33]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dims, context_dim):
        super(Encoder, self).__init__()
        layers = []
        dims = [input_dim] + hidden_dims + [context_dim]
        for i in range(len(dims) - 1):
            layers.append(nn.Linear(dims[i], dims[i+1]))
            if i < len(dims) - 2:  # No activation in the final layer
                layers.append(nn.ReLU())
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        return self.encoder(x)


## Decoder

In [34]:
class Decoder(nn.Module):
    def __init__(self, context_dim, hidden_dims, output_dim):
        super(Decoder, self).__init__()
        layers = []
        dims = [context_dim] + hidden_dims + [output_dim]
        for i in range(len(dims) - 1):
            layers.append(nn.Linear(dims[i], dims[i+1]))
            if i < len(dims) - 2:  # No activation in the final layer
                layers.append(nn.ReLU())
        self.decoder = nn.Sequential(*layers)

    def forward(self, x):
        return self.decoder(x)


## Train

In [35]:
# Verify if has cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [40]:
encoder = Encoder(input_dim=4, hidden_dims=[64, 32], context_dim=2)
decoder = Decoder(context_dim=2, hidden_dims=[32, 64], output_dim=4)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)

# Example Training Loop
for epoch in range(100):
    for batch in dataloader:
        x = batch  # Assuming x is your input data
        context = encoder(x)
        output = decoder(context)
        loss = criterion(output, x)  # Reconstruction loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 0.32914501428604126
Epoch 2, Loss: 0.2802676260471344
Epoch 3, Loss: 0.2335808426141739
Epoch 4, Loss: 0.18161512911319733
Epoch 5, Loss: 0.13997283577919006
Epoch 6, Loss: 0.10327020287513733
Epoch 7, Loss: 0.06271777302026749
Epoch 8, Loss: 0.031497787684202194
Epoch 9, Loss: 0.014579257927834988
Epoch 10, Loss: 0.0156423207372427
Epoch 11, Loss: 0.012110820040106773
Epoch 12, Loss: 0.014665823429822922
Epoch 13, Loss: 0.01096057053655386
Epoch 14, Loss: 0.009064801968634129
Epoch 15, Loss: 0.008202067576348782
Epoch 16, Loss: 0.008607006631791592
Epoch 17, Loss: 0.007492826785892248
Epoch 18, Loss: 0.010673868469893932
Epoch 19, Loss: 0.00989864394068718
Epoch 20, Loss: 0.010322686284780502
Epoch 21, Loss: 0.011166345328092575
Epoch 22, Loss: 0.01033459510654211
Epoch 23, Loss: 0.009935668669641018
Epoch 24, Loss: 0.010846325196325779
Epoch 25, Loss: 0.009007207117974758
Epoch 26, Loss: 0.008254838176071644
Epoch 27, Loss: 0.008963996544480324
Epoch 28, Loss: 0.010301

# Visualize Context Vector

In [41]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def extract_context_vectors(encoder, dataloader):
    """Extracts context vectors from the encoder for the entire dataset."""
    context_vectors = []
    encoder.eval()
    with torch.no_grad():
        for batch in dataloader:
            context = encoder(batch)
            context_vectors.append(context.numpy())
    return np.vstack(context_vectors)


def visualize_context_vectors(context_vectors, method="pca"):
    """Visualizes context vectors using PCA or t-SNE."""
    if method == "pca":
        reducer = PCA(n_components=2)
    elif method == "tsne":
        reducer = TSNE(n_components=2, random_state=42)
    else:
        raise ValueError("Method must be 'pca' or 'tsne'")

    reduced_context = reducer.fit_transform(context_vectors)
    plt.figure(figsize=(8, 6))
    plt.scatter(reduced_context[:, 0], reduced_context[:, 1], alpha=0.7, s=20, cmap='viridis')
    plt.title(f"Context Vectors Visualization ({method.upper()})")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.show()

context_vectors = extract_context_vectors(encoder, dataloader)
visualize_context_vectors(context_vectors, method="pca")

ValueError: n_components=2 must be between 0 and min(n_samples, n_features)=1 with svd_solver='covariance_eigh'

In [42]:
context_vectors.shape

(150, 1)