In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load data from CSV
file_path = '../Datasets/Machine.csv'
data = pd.read_csv(file_path)

# Separate text data and numerical features
text_data = data.iloc[:, 0]
labels = data.iloc[:, 0]  # Assuming labels are in the first column
rest_of_data = data.iloc[:, 1:]  # Assuming numerical features are in the rest of the columns

# Normalize the numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(rest_of_data)

# Convert the numerical features to PyTorch tensor
numerical_tensor = torch.tensor(scaled_features, dtype=torch.float32)

# Define a full autoencoder model to handle both encoding and decoding
class TabularAutoencoder(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(TabularAutoencoder, self).__init__()
        # Encoder part
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, embedding_dim)  # Embedding layer
        )
        # Decoder part to reconstruct the input
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)  # Reconstruct to original input size
        )

    def forward(self, x):
        embedding = self.encoder(x)  # Encode input to embeddings
        reconstruction = self.decoder(embedding)  # Decode to reconstruct input
        return reconstruction, embedding  # Return both the reconstruction and embeddings

# Set input size and embedding size
input_dim = numerical_tensor.shape[1]  # Number of numerical features
embedding_dim = 10  # Desired embedding size
model = TabularAutoencoder(input_dim=input_dim, embedding_dim=embedding_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()  # Reconstruction loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
batch_size = 32
dataset = torch.utils.data.TensorDataset(numerical_tensor, numerical_tensor)  # Autoencoder setup
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, _ in dataloader:
        optimizer.zero_grad()

        # Forward pass
        reconstruction, embeddings = model(inputs)  # Get reconstruction and embeddings
        loss = criterion(reconstruction, inputs)  # Reconstruction loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")

# After training, extract embeddings
with torch.no_grad():
    _, numerical_embeddings = model(numerical_tensor)  # Get only embeddings

# Convert embeddings to DataFrame
numerical_embeddings_df = pd.DataFrame(numerical_embeddings.numpy(), columns=[f'numerical_emb_{i+1}' for i in range(embedding_dim)])

# Combine the embeddings with the original labels
final_combined_data = pd.concat([labels.reset_index(drop=True), numerical_embeddings_df], axis=1)

# Save the combined data with embeddings to a CSV file
final_combined_data.to_csv('deep_learning_tabular_embeddings.csv', index=False)

print("Tabular embeddings saved to 'deep_learning_tabular_embeddings.csv'")

Epoch [10/50], Loss: nan
Epoch [20/50], Loss: nan
Epoch [30/50], Loss: nan
Epoch [40/50], Loss: nan
Epoch [50/50], Loss: nan
Tabular embeddings saved to 'deep_learning_tabular_embeddings.csv'
