In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Load data from CSV
file_path = '../Datasets/ApplicationData/TestDFFT/TestDfft(Quartz-Corona).csv'
data = pd.read_csv(file_path)

# Separate text data into two columns
text_data_1 = data.iloc[:, 0]
text_data_2 = data.iloc[:, 1]
rest_of_data = data.iloc[:, 2:]  # The rest of the numerical features

# Vectorize text data using TF-IDF
vectorizer1 = TfidfVectorizer(max_features=1000)  # Limit TF-IDF features to 1000
tfidf_features_1 = vectorizer1.fit_transform(text_data_1).toarray()

vectorizer2 = TfidfVectorizer(max_features=1000)  # Limit TF-IDF features to 1000
tfidf_features_2 = vectorizer2.fit_transform(text_data_2).toarray()

# Normalize the rest of the numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(rest_of_data)

# Convert the TF-IDF features to PyTorch tensors
tfidf_tensor_1 = torch.tensor(tfidf_features_1, dtype=torch.float32)
tfidf_tensor_2 = torch.tensor(tfidf_features_2, dtype=torch.float32)

In [2]:
class CNNTextEmbedder(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(CNNTextEmbedder, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=128, kernel_size=3, stride=1, padding=1)
        
        # Calculate the size after conv1
        conv_output_size = (input_dim + 2 * 1 - 3) // 1 + 1  # After convolution
        
        # Use a smaller linear layer if needed or avoid pooling
        self.fc = nn.Linear(128 * conv_output_size, embedding_dim)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add a channel dimension (batch_size, 1, input_dim)
        x = torch.relu(self.conv1(x))
        x = x.view(x.size(0), -1)  # Flatten the tensor
        embedding = self.fc(x)
        return embedding

# Initialize the CNN Models
input_dim_1 = tfidf_features_1.shape[1]  # Number of TF-IDF features for the first text column
embedding_dim_1 = 10  # Target embedding size for the first text column
model_1 = CNNTextEmbedder(input_dim=input_dim_1, embedding_dim=embedding_dim_1)

input_dim_2 = tfidf_features_2.shape[1]  # Number of TF-IDF features for the second text column
embedding_dim_2 = 5  # Target embedding size for the second text column
model_2 = CNNTextEmbedder(input_dim=input_dim_2, embedding_dim=embedding_dim_2)

In [3]:
# Training function
def train_model(model, data_tensor, num_epochs=50, batch_size=256):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    dataset = torch.utils.data.TensorDataset(data_tensor, data_tensor)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, _) in enumerate(dataloader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, outputs)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader)}")

# Train the models
print("Training model for text column 1...")
train_model(model_1, tfidf_tensor_1)

print("Training model for text column 2...")
train_model(model_2, tfidf_tensor_2)

Training model for text column 1...
Epoch [10/50], Loss: 0.0
Epoch [20/50], Loss: 0.0
Epoch [30/50], Loss: 0.0
Epoch [40/50], Loss: 0.0
Epoch [50/50], Loss: 0.0
Training model for text column 2...
Epoch [10/50], Loss: 0.0
Epoch [20/50], Loss: 0.0
Epoch [30/50], Loss: 0.0
Epoch [40/50], Loss: 0.0
Epoch [50/50], Loss: 0.0


In [4]:
def calculate_conv_output_size(input_dim, kernel_size, padding, stride):
    return (input_dim + 2 * padding - kernel_size) // stride + 1

def calculate_pooled_output_size(conv_output_size, pool_size):
    return (conv_output_size - pool_size) // pool_size + 1

input_dim = tfidf_features_1.shape[1]
conv_output_size = calculate_conv_output_size(input_dim, kernel_size=3, padding=1, stride=1)
pool_output_size = calculate_pooled_output_size(conv_output_size, pool_size=2)

print(f"Convolution output size: {conv_output_size}")
print(f"Pooling output size: {pool_output_size}")

Convolution output size: 1
Pooling output size: 0


In [5]:
with torch.no_grad():
    embeddings_1 = model_1(tfidf_tensor_1).numpy()
    embeddings_2 = model_2(tfidf_tensor_2).numpy()

# Combine the embeddings with the rest of the numerical features
embeddings_df_1 = pd.DataFrame(embeddings_1, columns=[f'emb1_{i+1}' for i in range(embedding_dim_1)])
embeddings_df_2 = pd.DataFrame(embeddings_2, columns=[f'emb2_{i+1}' for i in range(embedding_dim_2)])
rest_of_data_df = pd.DataFrame(scaled_features, columns=rest_of_data.columns)
final_data = pd.concat([data.iloc[:, 2:], embeddings_df_1, embeddings_df_2, rest_of_data_df], axis=1)

# Save the combined data with embeddings to a CSV file
final_data.to_csv('CNNEmbeddings/TestDFFT/TestDfft(Quartz-Corona)_embeddings.csv', index=False)

print("Data with CNN-based embeddings saved to 'cnn_text_embeddings.csv'")

Data with CNN-based embeddings saved to 'cnn_text_embeddings.csv'
