In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Read the raw table from disk
file_path = '../Datasets/Machine.csv'
data = pd.read_csv(file_path)

# Column 0 is used twice: as the text that gets embedded and as the label
# column carried through to the final output. All remaining columns are
# handed to the scaler as numerical features.
text_data = data.iloc[:, 0]
labels = data.iloc[:, 0]
rest_of_data = data.iloc[:, 1:]

# TF-IDF encode the text column, keeping at most 1000 vocabulary terms.
# The sparse result is densified because it is turned into a dense tensor below.
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_sparse = vectorizer.fit_transform(text_data)
tfidf_features = tfidf_sparse.toarray()

# Report the (rows, vocabulary size) of the dense TF-IDF matrix
print(f"TF-IDF feature shape: {tfidf_features.shape}")

# Standardize the remaining columns (fit the scaler, then apply it to the same data)
scaler = StandardScaler().fit(rest_of_data)
scaled_features = scaler.transform(rest_of_data)

# Dense float32 tensor of the TF-IDF matrix, used as the network input
tfidf_tensor = torch.tensor(tfidf_features, dtype=torch.float32)

# Feed-forward autoencoder used to compress TF-IDF vectors into embeddings
class FFNTextEmbedder(nn.Module):
    """Fully connected autoencoder: input -> 512 -> 256 -> embedding -> input.

    The first three linear layers form the encoder (ReLU after the first two,
    no activation on the embedding layer); the fourth linear layer maps the
    embedding straight back to the input dimensionality.

    Args:
        input_dim: Size of each input vector (and of the reconstruction).
        embedding_dim: Size of the embedding produced by the encoder.
    """

    def __init__(self, input_dim, embedding_dim):
        super().__init__()
        # Encoder layers
        self.fc1 = nn.Linear(in_features=input_dim, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=256)
        self.fc3 = nn.Linear(in_features=256, out_features=embedding_dim)
        # Decoder layer: a single linear map from embedding back to input size
        self.fc4 = nn.Linear(in_features=embedding_dim, out_features=input_dim)

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Returns:
            A tuple ``(embeddings, reconstructed)`` where ``embeddings`` is the
            output of ``fc3`` and ``reconstructed`` is ``fc4`` applied to it.
        """
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        code = self.fc3(hidden)
        return code, self.fc4(code)

# Initialize the Feed-Forward Network
input_dim = tfidf_features.shape[1]  # Number of TF-IDF features (16 in this case)
embedding_dim = 10  # Target embedding size
model = FFNTextEmbedder(input_dim=input_dim, embedding_dim=embedding_dim)

# Define Loss Function and Optimizer
criterion = nn.MSELoss()  # Reconstruction loss, averaged over the elements of a batch
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the FNN
num_epochs = 50
batch_size = 256
# Autoencoder setup: the TF-IDF matrix is both the input and the target
dataset = torch.utils.data.TensorDataset(tfidf_tensor, tfidf_tensor)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training Loop
model.train()  # Make the training mode explicit
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, _ in dataloader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass; only the reconstruction is needed for the loss
        _, outputs = model(inputs)
        loss = criterion(outputs, inputs)  # Reconstruction loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so that a short final batch
        # does not count as much as a full one in the epoch average.
        running_loss += loss.item() * inputs.size(0)

    if (epoch + 1) % 10 == 0:
        epoch_loss = running_loss / len(dataset)  # Per-sample mean over the epoch
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss}")

# After training, extract the learned embeddings for every row
model.eval()  # Switch to inference mode before the final forward pass
with torch.no_grad():
    embeddings, _ = model(tfidf_tensor)
    embeddings = embeddings.numpy()

# Wrap the embeddings and the scaled numerical features as DataFrames
embeddings_df = pd.DataFrame(embeddings, columns=[f'emb_{i+1}' for i in range(embedding_dim)])
rest_of_data_df = pd.DataFrame(scaled_features, columns=rest_of_data.columns)

# Combine labels, embeddings, and numerical features column-wise.
# The label index is reset so all three pieces align on a 0..n-1 index.
final_data = pd.concat([labels.reset_index(drop=True), embeddings_df, rest_of_data_df], axis=1)
print(final_data.head(5))

# Save the combined data with embeddings to a CSV file
final_data.to_csv('fnn_text_embeddings_deep_learning.csv', index=False)

print("Data with FNN-based embeddings saved to 'fnn_text_embeddings_deep_learning.csv'")

TF-IDF feature shape: (76, 16)
Epoch [10/50], Loss: 0.05892486497759819
Epoch [20/50], Loss: 0.04219359531998634
Epoch [30/50], Loss: 0.03244723007082939
Epoch [40/50], Loss: 0.02644597925245762
Epoch [50/50], Loss: 0.022476201876997948
  machine     emb_1     emb_2   emb_3     emb_4     emb_5     emb_6     emb_7  \
0  boraxo -0.676626 -0.176914  0.5144  0.244138  1.338624 -1.346281  0.342902   
1  boraxo -0.676626 -0.176914  0.5144  0.244138  1.338624 -1.346281  0.342902   
2  boraxo -0.676626 -0.176914  0.5144  0.244138  1.338624 -1.346281  0.342902   
3  boraxo -0.676626 -0.176914  0.5144  0.244138  1.338624 -1.346281  0.342902   
4  boraxo -0.676626 -0.176914  0.5144  0.244138  1.338624 -1.346281  0.342902   

      emb_8     emb_9  ...  Apps_MASS3DPA  Apps_NODAL_ACCUMULATION_3D  \
0 -0.652312  0.354068  ...      -0.553700                   -0.508990   
1 -0.652312  0.354068  ...      -0.487246                    0.036973   
2 -0.652312  0.354068  ...      -0.470421                