In [10]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Read the Quartz-Corona test DFFT table from disk.
file_path = '../Datasets/ApplicationData/TestDFFT/TestDfft(Quartz-Corona).csv'
data = pd.read_csv(file_path)

# Columns 0 and 1 are handled as text; every column from index 2 onward is
# handed to the scaler below as a numerical feature.
text_data_1, text_data_2 = data.iloc[:, 0], data.iloc[:, 1]
rest_of_data = data.iloc[:, 2:]

# Each text column gets its own TF-IDF vocabulary, capped at 1000 terms, and is
# densified right away because the tensors built below need dense arrays.
vectorizer1 = TfidfVectorizer(max_features=1000)
vectorizer2 = TfidfVectorizer(max_features=1000)
tfidf_features_1 = vectorizer1.fit_transform(text_data_1).toarray()
tfidf_features_2 = vectorizer2.fit_transform(text_data_2).toarray()

# Standardize the remaining columns with a scaler fitted on this same data.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(rest_of_data)

# float32 copies of the TF-IDF matrices for use as PyTorch model inputs.
tfidf_tensor_1 = torch.tensor(tfidf_features_1, dtype=torch.float32)
tfidf_tensor_2 = torch.tensor(tfidf_features_2, dtype=torch.float32)

# Report (rows, features) for each text column as a quick sanity check.
print(f"TF-IDF features 1 shape: {tfidf_tensor_1.shape}")
print(f"TF-IDF features 2 shape: {tfidf_tensor_2.shape}")

# Define a Feed-Forward Neural Network for Text Embedding
class DNNTextEmbedder(nn.Module):
    """Fully connected autoencoder that compresses a feature vector.

    The encoder maps ``input_dim -> 512 -> 256 -> embedding_dim`` and the
    decoder mirrors it back to ``input_dim``. ReLU follows every layer except
    the final output layer.

    Args:
        input_dim: Width of each input (and reconstructed) vector.
        embedding_dim: Width of the bottleneck embedding.
    """

    def __init__(self, input_dim, embedding_dim):
        super().__init__()
        # Encoder half; fc3 produces the embedding.
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, embedding_dim)
        # Decoder half; fc6 restores the original input width.
        self.fc4 = nn.Linear(embedding_dim, 256)
        self.fc5 = nn.Linear(256, 512)
        self.fc6 = nn.Linear(512, input_dim)

    def forward(self, x):
        """Return ``(reconstructed, embeddings)`` for the batch ``x``.

        ``embeddings`` is the ReLU-activated bottleneck output (so it is
        non-negative); ``reconstructed`` is the linear output of the last
        decoder layer and has the same width as ``x``.
        """
        hidden = torch.relu(self.fc2(torch.relu(self.fc1(x))))
        embeddings = torch.relu(self.fc3(hidden))

        decoded = embeddings
        for layer in (self.fc4, self.fc5):
            decoded = torch.relu(layer(decoded))

        return self.fc6(decoded), embeddings

# Build one autoencoder per text column. Each model's input width is that
# column's TF-IDF feature count; the bottleneck width is the embedding size.

# First text column -> 10-dimensional embedding.
embedding_dim_1 = 10
input_dim_1 = tfidf_features_1.shape[-1]
print(f'Input dimension here {input_dim_1}')
model_1 = DNNTextEmbedder(input_dim_1, embedding_dim_1)

# Second text column -> 5-dimensional embedding.
embedding_dim_2 = 5
input_dim_2 = tfidf_features_2.shape[-1]
print(f'input dimension here {input_dim_2}')
model_2 = DNNTextEmbedder(input_dim_2, embedding_dim_2)

# Training function
def train_model(model, data_tensor, num_epochs=50, batch_size=256):
    """Train ``model`` as an autoencoder that reconstructs ``data_tensor``.

    Uses mean-squared reconstruction error with Adam (learning rate 0.001) and
    reshuffles the samples every epoch. Progress is printed every 10th epoch.

    Args:
        model: Module whose forward pass returns a
            ``(reconstructed, embeddings)`` pair; only the reconstruction
            contributes to the loss. It is left in training mode on return.
        data_tensor: Tensor whose first dimension indexes samples. It is used
            as both the input and the reconstruction target.
        num_epochs: Number of passes over the data.
        batch_size: Maximum number of samples per optimisation step.

    Returns:
        A list with one float per epoch: the mean of the per-batch losses.

    Raises:
        ValueError: If ``data_tensor`` holds no samples (there would be no
            batches to average over).
    """
    if data_tensor.shape[0] == 0:
        raise ValueError("data_tensor must contain at least one sample")

    criterion = nn.MSELoss()  # Reconstruction loss
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # The target is the input itself, so a single-tensor dataset is enough.
    dataset = torch.utils.data.TensorDataset(data_tensor)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Make sure train-only layers (dropout, batch norm, ...) behave correctly
    # even if the caller previously switched the model to eval mode.
    model.train()

    epoch_losses = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        for (inputs,) in dataloader:
            optimizer.zero_grad()
            reconstructed, _ = model(inputs)  # Embeddings are not needed here
            loss = criterion(reconstructed, inputs)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        epoch_loss = running_loss / len(dataloader)
        epoch_losses.append(epoch_loss)
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss}")

    return epoch_losses

# Train one autoencoder per text column on that column's TF-IDF features.
print("Training model for text column 1...")
train_model(model_1, tfidf_tensor_1)

print("Training model for text column 2...")
train_model(model_2, tfidf_tensor_2)

# Extract embeddings after training. Switch to eval mode and disable gradient
# tracking: this is pure inference, and the results are converted to NumPy.
model_1.eval()
model_2.eval()
with torch.no_grad():
    _, embeddings_1 = model_1(tfidf_tensor_1)  # Only the embeddings are kept
    _, embeddings_2 = model_2(tfidf_tensor_2)

# Convert the tensors to NumPy arrays explicitly: older pandas versions build an
# object-dtype frame of 0-d tensors when handed a torch.Tensor directly.
embeddings_df_1 = pd.DataFrame(
    embeddings_1.cpu().numpy(),
    columns=[f'emb1_{i+1}' for i in range(embedding_dim_1)],
)
embeddings_df_2 = pd.DataFrame(
    embeddings_2.cpu().numpy(),
    columns=[f'emb2_{i+1}' for i in range(embedding_dim_2)],
)
rest_of_data_df = pd.DataFrame(scaled_features, columns=rest_of_data.columns)

# Combine labels, embeddings, and scaled numerical features.
# NOTE(review): this previously used data.iloc[:, 2:], which re-added the raw
# numerical columns next to their scaled copies (duplicate column names) and
# dropped the two leading label/text columns. The first two columns are now
# kept instead -- confirm downstream consumers expect this layout.
final_data = pd.concat(
    [data.iloc[:, :2], embeddings_df_1, embeddings_df_2, rest_of_data_df],
    axis=1,
)

# Save the combined data with embeddings to a CSV file
output_path = 'DNNEmbeddings/TestDFFT/TestDfft(Quartz-Corona)_embeddings.csv'
final_data.to_csv(output_path, index=False)

print(f"Data with DNN-based embeddings saved to '{output_path}'")

TF-IDF features 1 shape: torch.Size([84, 1])
TF-IDF features 2 shape: torch.Size([84, 1])
Input dimension here 1
input dimension here 1
Training model for text column 1...
Epoch [10/50], Loss: 0.02161221019923687
Epoch [20/50], Loss: 0.01768951490521431
Epoch [30/50], Loss: 0.008260988630354404
Epoch [40/50], Loss: 0.0020060506649315357
Epoch [50/50], Loss: 0.00032211022335104644
Training model for text column 2...
Epoch [10/50], Loss: 0.11320708692073822
Epoch [20/50], Loss: 0.011499886400997639
Epoch [30/50], Loss: 0.002009041840210557
Epoch [40/50], Loss: 0.003954019863158464
Epoch [50/50], Loss: 0.0020194475073367357
Data with DNN-based embeddings saved to 'DNNEmbeddings/TestDFFT/TestDfft(Quartz-Corona)_embeddings.csv'
